Actual source code: mpiaijcusparse.cu

petsc-3.4.2 2013-07-02
#include "petscconf.h"
PETSC_CUDA_EXTERN_C_BEGIN
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
PETSC_CUDA_EXTERN_C_END
#include "mpicusparsematimpl.h"

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
  if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
  if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
  if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);

  ierr = PetscLayoutSetBlockSize(B->rmap,1);CHKERRQ(ierr);
  ierr = PetscLayoutSetBlockSize(B->cmap,1);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices: the diagonal block b->A and the off-diagonal block b->B. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent(B,b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent(B,b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
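
/* Usage sketch (hypothetical helper, not part of the PETSc sources; sizes and
   nonzero counts are illustrative, and a CUSPARSE-enabled build is assumed):
   callers normally reach the routine above through the generic
   MatMPIAIJSetPreallocation() interface after setting the type to
   MATMPIAIJCUSPARSE. */
static PetscErrorCode ExamplePreallocateMPIAIJCUSPARSE(MPI_Comm comm,Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,PETSC_DECIDE,PETSC_DECIDE,100,100);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  /* at most 5 nonzeros per row in the diagonal block, 2 in the off-diagonal block */
  ierr = MatMPIAIJSetPreallocation(*A,5,NULL,2,NULL);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}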

PetscErrorCode MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr);
    ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr);
    ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
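
/* Usage sketch (hypothetical helper, illustrative only): MatGetVecs() above
   yields VECCUSP vectors laid out to match the matrix, which is the easiest
   way to obtain conforming GPU vectors for MatMult(). */
static PetscErrorCode ExampleGPUMatVec(Mat mat)
{
  Vec            x,y;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetVecs(mat,&x,&y);CHKERRQ(ierr); /* x matches the column map, y the row map */
  ierr = VecSet(x,1.0);CHKERRQ(ierr);
  ierr = MatMult(mat,x,y);CHKERRQ(ierr);
  ierr = VecDestroy(&x);CHKERRQ(ierr);
  ierr = VecDestroy(&y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}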


PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence differs from the CPU version.
     In particular, the diagonal-block multiplication kernel is
     launched in one stream. Then, in a separate stream, the
     DeviceToHost transfers (with MPI messaging in between) and the
     HostToDevice transfers are launched. Once the data-transfer
     stream is synchronized, to ensure the messaging is complete,
     the MatMultAdd kernel is launched in the original (MatMult)
     stream to protect against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence differs from the CPU version.
     In particular, the diagonal-block multiplication kernel is
     launched in one stream. Then, in a separate stream, the
     DeviceToHost transfers (with MPI messaging in between) and the
     HostToDevice transfers are launched. Once the data-transfer
     stream is synchronized, to ensure the messaging is complete,
     the MatMultTransposeAdd kernel is launched in the original
     (MatMultTranspose) stream to protect against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtransposeadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
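
/* Usage sketch (hypothetical helper; the ELL and HYB formats require the
   txpetscgpu package): the public MatCUSPARSESetFormat() dispatches to the
   routine above for MPIAIJCUSPARSE matrices, so the diagonal and off-diagonal
   blocks may use different GPU storage formats. */
static PetscErrorCode ExampleChooseGPUFormats(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,MAT_CUSPARSE_ELL);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,MAT_CUSPARSE_HYB);CHKERRQ(ierr);
  /* or set both blocks at once */
  ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_CSR);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}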

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
  ierr = PetscOptionsHead("MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  try {
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  a->spptr = 0; /* null the stored pointer, not just the local copy */

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;

  A->ops->getvecs        = MatGetVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed
   down to NVIDIA GPUs and use the CUSPARSE library for calculations. For good
   matrix assembly performance the user should preallocate the matrix storage by
   setting the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By
   setting these parameters accurately, performance during matrix assembly can
   be increased by more than a factor of 50.
   This type is only available when using the 'txpetscgpu' package. Use
   --download-txpetscgpu when configuring PETSc to build/install it and to use
   the different CUSPARSE base matrix types.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the
           diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the
           off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation()]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either the scalar counts (d_nz, o_nz)
   or the arrays (d_nnz, o_nnz), not both.  Set the counts to PETSC_DEFAULT and
   the arrays to NULL for PETSc to control dynamic memory allocation.  For large
   problems you MUST preallocate memory or you will get TERRIBLE performance;
   see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
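
/* Usage sketch (hypothetical helper; the sizes and nonzero estimates are
   illustrative): creating a distributed GPU matrix directly with the
   convenience routine above. */
static PetscErrorCode ExampleCreateAIJCUSPARSE(MPI_Comm comm,Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* 1000x1000 global, ~5 nonzeros/row in the diagonal block, ~2 off-diagonal */
  ierr = MatCreateAIJCUSPARSE(comm,PETSC_DECIDE,PETSC_DECIDE,1000,1000,5,NULL,2,NULL,A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}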

/*M
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in CSR format.
   All matrix calculations are performed using the NVIDIA CUSPARSE library. Use of the
   CUSPARSE library REQUIRES the 'txpetscgpu' package. ELL and HYB formats are also available
   in the txpetscgpu package. Use --download-txpetscgpu to build/install PETSc with support for
   these additional GPU storage formats with CUSPARSE matrix types.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) and hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) and hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) and hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
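
/* Usage sketch (hypothetical helper): the options database keys listed above
   take effect during MatSetFromOptions(), e.g. when the application is run with
       -mat_type mpiaijcusparse -mat_cusparse_storage_format hyb
   Calling both preallocation routines, as the manual page recommends, keeps the
   code correct for any communicator size. */
static PetscErrorCode ExampleSetTypeFromOptions(MPI_Comm comm,Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,PETSC_DECIDE,PETSC_DECIDE,100,100);CHKERRQ(ierr);
  ierr = MatSetFromOptions(*A);CHKERRQ(ierr); /* reads -mat_type and the cusparse format options */
  ierr = MatMPIAIJSetPreallocation(*A,5,NULL,2,NULL);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(*A,5,NULL);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}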