Actual source code: mpimatmatmult.c
petsc-3.4.2 2013-07-02
2: /*
3: Defines matrix-matrix product routines for pairs of MPIAIJ matrices
4: C = A * B
5: */
6: #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
7: #include <../src/mat/utils/freespace.h>
8: #include <../src/mat/impls/aij/mpi/mpiaij.h>
9: #include <petscbt.h>
10: #include <../src/mat/impls/dense/mpi/mpidense.h>
11: #include <petsc-private/vecimpl.h>
15: PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill, Mat *C)
16: {
20: if (scall == MAT_INITIAL_MATRIX) {
21: PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
22: MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);
23: PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
24: }
25: PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
26: (*(*C)->ops->matmultnumeric)(A,B,*C);
27: PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
28: return(0);
29: }
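/*
   A minimal standalone sketch of how the symbolic/numeric split above is driven
   through the public MatMatMult() interface.  The diagonal test matrices and the
   PETSC_DEFAULT fill value are illustrative assumptions, not part of this file.
*/
#include <petscmat.h>

int main(int argc,char **argv)
{
  Mat            A,B,C;
  PetscInt       i,n = 8,rstart,rend;
  PetscErrorCode ierr;

  ierr = PetscInitialize(&argc,&argv,NULL,NULL);CHKERRQ(ierr);

  /* build a simple distributed diagonal MATMPIAIJ matrix A, and B as a copy of it */
  ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
  ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,n,n);CHKERRQ(ierr);
  ierr = MatSetType(A,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(A,1,NULL,0,NULL);CHKERRQ(ierr);
  ierr = MatGetOwnershipRange(A,&rstart,&rend);CHKERRQ(ierr);
  for (i=rstart; i<rend; i++) {ierr = MatSetValue(A,i,i,2.0,INSERT_VALUES);CHKERRQ(ierr);}
  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);

  /* MAT_INITIAL_MATRIX runs the symbolic phase and creates C */
  ierr = MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);
  /* after changing numerical values of A or B, the symbolic data kept in C is reused */
  ierr = MatMatMult(A,B,MAT_REUSE_MATRIX,PETSC_DEFAULT,&C);CHKERRQ(ierr);

  ierr = MatDestroy(&C);CHKERRQ(ierr);
  ierr = MatDestroy(&B);CHKERRQ(ierr);
  ierr = MatDestroy(&A);CHKERRQ(ierr);
  ierr = PetscFinalize();
  return 0;
}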
33: PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
34: {
36: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
37: Mat_PtAPMPI *ptap = a->ptap;
40: PetscFree2(ptap->startsj_s,ptap->startsj_r);
41: PetscFree(ptap->bufa);
42: MatDestroy(&ptap->P_loc);
43: MatDestroy(&ptap->P_oth);
44: MatDestroy(&ptap->Pt);
45: PetscFree(ptap->api);
46: PetscFree(ptap->apj);
47: PetscFree(ptap->apa);
48: ptap->destroy(A);
49: PetscFree(ptap);
50: return(0);
51: }
55: PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M)
56: {
58: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
59: Mat_PtAPMPI *ptap = a->ptap;
62: (*ptap->duplicate)(A,op,M);
64: (*M)->ops->destroy = ptap->destroy; /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */
65: (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */
66: return(0);
67: }
71: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
72: {
74: Mat_MPIAIJ *a =(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
75: Mat_SeqAIJ *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
76: Mat_SeqAIJ *cd =(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
77: PetscInt *adi=ad->i,*adj,*aoi=ao->i,*aoj;
78: PetscScalar *ada,*aoa,*cda=cd->a,*coa=co->a;
79: Mat_SeqAIJ *p_loc,*p_oth;
80: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
81: PetscScalar *pa_loc,*pa_oth,*pa,*apa,valtmp,*ca;
82: PetscInt cm =C->rmap->n,anz,pnz;
83: Mat_PtAPMPI *ptap=c->ptap;
84: PetscInt *api,*apj,*apJ,i,j,k,row;
85: PetscInt cstart=C->cmap->rstart;
86: PetscInt cdnz,conz,k0,k1;
89: /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
90: /*-----------------------------------------------------*/
91: /* update numerical values of P_oth and P_loc */
92: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
93: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
95: /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
96: /*----------------------------------------------------------*/
97: /* get data from symbolic products */
98: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
99: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
100: pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
101: pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
103: /* get apa for storing dense row A[i,:]*P */
104: apa = ptap->apa;
106: api = ptap->api;
107: apj = ptap->apj;
108: for (i=0; i<cm; i++) {
109: /* diagonal portion of A */
110: anz = adi[i+1] - adi[i];
111: adj = ad->j + adi[i];
112: ada = ad->a + adi[i];
113: for (j=0; j<anz; j++) {
114: row = adj[j];
115: pnz = pi_loc[row+1] - pi_loc[row];
116: pj = pj_loc + pi_loc[row];
117: pa = pa_loc + pi_loc[row];
119: /* perform dense axpy */
120: valtmp = ada[j];
121: for (k=0; k<pnz; k++) {
122: apa[pj[k]] += valtmp*pa[k];
123: }
124: PetscLogFlops(2.0*pnz);
125: }
127: /* off-diagonal portion of A */
128: anz = aoi[i+1] - aoi[i];
129: aoj = ao->j + aoi[i];
130: aoa = ao->a + aoi[i];
131: for (j=0; j<anz; j++) {
132: row = aoj[j];
133: pnz = pi_oth[row+1] - pi_oth[row];
134: pj = pj_oth + pi_oth[row];
135: pa = pa_oth + pi_oth[row];
137: /* perform dense axpy */
138: valtmp = aoa[j];
139: for (k=0; k<pnz; k++) {
140: apa[pj[k]] += valtmp*pa[k];
141: }
142: PetscLogFlops(2.0*pnz);
143: }
145: /* set values in C */
146: apJ = apj + api[i];
147: cdnz = cd->i[i+1] - cd->i[i];
148: conz = co->i[i+1] - co->i[i];
150: /* 1st off-diagonal part of C */
151: ca = coa + co->i[i];
152: k = 0;
153: for (k0=0; k0<conz; k0++) {
154: if (apJ[k] >= cstart) break;
155: ca[k0] = apa[apJ[k]];
156: apa[apJ[k]] = 0.0;
157: k++;
158: }
160: /* diagonal part of C */
161: ca = cda + cd->i[i];
162: for (k1=0; k1<cdnz; k1++) {
163: ca[k1] = apa[apJ[k]];
164: apa[apJ[k]] = 0.0;
165: k++;
166: }
168: /* 2nd off-diagonal part of C */
169: ca = coa + co->i[i];
170: for (; k0<conz; k0++) {
171: ca[k0] = apa[apJ[k]];
172: apa[apJ[k]] = 0.0;
173: k++;
174: }
175: }
176: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
177: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
178: return(0);
179: }
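/*
   A standalone sketch of the "dense axpy" row kernel used in the numeric phase
   above: one sparse row of A times CSR P is accumulated into the dense work
   array apa, then gathered through the precomputed column pattern of that row
   of C.  The tiny 3x3 data below are made up purely for illustration.
*/
#include <stdio.h>

int main(void)
{
  /* CSR of P (3x3): row 0 = {0:1.0, 2:2.0}, row 1 = {1:3.0}, row 2 = {2:4.0} */
  int    pi[] = {0,2,3,4};
  int    pj[] = {0,2,1,2};
  double pa[] = {1.0,2.0,3.0,4.0};

  /* one sparse row of A with entries in columns 0 and 2 */
  int    anz   = 2;
  int    adj[] = {0,2};
  double ada[] = {0.5,1.5};

  double apa[3]    = {0.0,0.0,0.0};  /* dense accumulator, length = global #cols of P */
  int    apj_row[] = {0,2};          /* symbolic column pattern of this row of C */
  int    apnz = 2,j,k;

  for (j=0; j<anz; j++) {            /* dense axpy: apa += ada[j] * P[adj[j],:] */
    int row = adj[j];
    for (k=pi[row]; k<pi[row+1]; k++) apa[pj[k]] += ada[j]*pa[k];
  }
  for (k=0; k<apnz; k++) {           /* gather the row of C, then reset the accumulator */
    printf("C(row,%d) = %g\n",apj_row[k],apa[apj_row[k]]);
    apa[apj_row[k]] = 0.0;
  }
  return 0;
}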
183: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
184: {
185: PetscErrorCode ierr;
186: MPI_Comm comm;
187: Mat Cmpi;
188: Mat_PtAPMPI *ptap;
189: PetscFreeSpaceList free_space=NULL,current_space=NULL;
190: Mat_MPIAIJ *a =(Mat_MPIAIJ*)A->data,*c;
191: Mat_SeqAIJ *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
192: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
193: PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
194: PetscInt *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
195: PetscInt am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
196: PetscBT lnkbt;
197: PetscScalar *apa;
198: PetscReal afill;
199: PetscBool scalable=PETSC_TRUE;
200: PetscInt nlnk_max,armax,prmax;
203: PetscObjectGetComm((PetscObject)A,&comm);
204: if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) {
205: SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D, %D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
206: }
208: PetscObjectOptionsBegin((PetscObject)A);
209: PetscOptionsBool("-matmatmult_scalable","Use a scalable but slower C=A*B","",scalable,&scalable,NULL);
210: if (scalable) {
211: MatMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(A,P,fill,C);
212: return(0);
213: }
214: PetscOptionsEnd();
216: /* create struct Mat_PtAPMPI and attach it to C later */
217: PetscNew(Mat_PtAPMPI,&ptap);
219: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
220: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
222: /* get P_loc by taking all local rows of P */
223: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
225: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
226: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
227: pi_loc = p_loc->i; pj_loc = p_loc->j;
228: pi_oth = p_oth->i; pj_oth = p_oth->j;
230: /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
231: /*-------------------------------------------------------------------*/
232: PetscMalloc((am+2)*sizeof(PetscInt),&api);
233: ptap->api = api;
234: api[0] = 0;
236: /* create and initialize a linked list */
237: armax = ad->rmax+ao->rmax;
238: prmax = PetscMax(p_loc->rmax,p_oth->rmax);
239: nlnk_max = armax*prmax;
240: if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
241: PetscLLCondensedCreate(nlnk_max,pN,&lnk,&lnkbt);
243: /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
244: PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);
246: current_space = free_space;
248: MatPreallocateInitialize(comm,am,pn,dnz,onz);
249: for (i=0; i<am; i++) {
250: /* diagonal portion of A */
251: nzi = adi[i+1] - adi[i];
252: for (j=0; j<nzi; j++) {
253: row = *adj++;
254: pnz = pi_loc[row+1] - pi_loc[row];
255: Jptr = pj_loc + pi_loc[row];
256: /* add non-zero cols of P into the sorted linked list lnk */
257: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
258: }
259: /* off-diagonal portion of A */
260: nzi = aoi[i+1] - aoi[i];
261: for (j=0; j<nzi; j++) {
262: row = *aoj++;
263: pnz = pi_oth[row+1] - pi_oth[row];
264: Jptr = pj_oth + pi_oth[row];
265: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
266: }
268: apnz = lnk[0];
269: api[i+1] = api[i] + apnz;
271: /* if free space is not available, double the total space in the list */
272: if (current_space->local_remaining<apnz) {
273: PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);
274: nspacedouble++;
275: }
277: /* Copy data into free space, then initialize lnk */
278: PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);
279: MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);
281: current_space->array += apnz;
282: current_space->local_used += apnz;
283: current_space->local_remaining -= apnz;
284: }
286: /* Allocate space for apj, initialize apj, and */
287: /* destroy list of free space and other temporary array(s) */
288: PetscMalloc((api[am]+1)*sizeof(PetscInt),&ptap->apj);
289: apj = ptap->apj;
290: PetscFreeSpaceContiguous(&free_space,ptap->apj);
291: PetscLLDestroy(lnk,lnkbt);
293: /* malloc apa to store dense row A[i,:]*P */
294: PetscMalloc(pN*sizeof(PetscScalar),&apa);
295: PetscMemzero(apa,pN*sizeof(PetscScalar));
297: ptap->apa = apa;
299: /* create and assemble symbolic parallel matrix Cmpi */
300: /*----------------------------------------------------*/
301: MatCreate(comm,&Cmpi);
302: MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);
303: MatSetBlockSizes(Cmpi,A->rmap->bs,P->cmap->bs);
305: MatSetType(Cmpi,MATMPIAIJ);
306: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
307: MatPreallocateFinalize(dnz,onz);
308: for (i=0; i<am; i++) {
309: row = i + rstart;
310: apnz = api[i+1] - api[i];
311: MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);
312: apj += apnz;
313: }
314: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
315: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
317: ptap->destroy = Cmpi->ops->destroy;
318: ptap->duplicate = Cmpi->ops->duplicate;
319: Cmpi->ops->destroy = MatDestroy_MPIAIJ_MatMatMult;
320: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;
322: /* attach the supporting struct to Cmpi for reuse */
323: c = (Mat_MPIAIJ*)Cmpi->data;
324: c->ptap = ptap;
326: *C = Cmpi;
328: /* set MatInfo */
329: afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
330: if (afill < 1.0) afill = 1.0;
331: Cmpi->info.mallocs = nspacedouble;
332: Cmpi->info.fill_ratio_given = fill;
333: Cmpi->info.fill_ratio_needed = afill;
335: #if defined(PETSC_USE_INFO)
336: if (api[am]) {
337: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);
338: PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);
339: } else {
340: PetscInfo(Cmpi,"Empty matrix product\n");
341: }
342: #endif
343: return(0);
344: }
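/*
   A sketch of toggling the scalable A*B path at run time.  As written above,
   MatMatMultSymbolic_MPIAIJ_MPIAIJ initializes scalable to PETSC_TRUE, so the
   scalable variant is taken unless -matmatmult_scalable is set to false.  The
   helper name below is hypothetical and it assumes A carries no options prefix;
   only the option string comes from this file.
*/
#include <petscmat.h>

PetscErrorCode ProductWithScalablePath(Mat A,Mat B,PetscBool scalable,Mat *C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* same effect as passing -matmatmult_scalable <bool> on the command line */
  ierr = PetscOptionsSetValue("-matmatmult_scalable",scalable ? "true" : "false");CHKERRQ(ierr);
  ierr = MatMatMult(A,B,MAT_INITIAL_MATRIX,PETSC_DEFAULT,C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}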
348: PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
349: {
353: if (scall == MAT_INITIAL_MATRIX) {
354: PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);
355: MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);
356: PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);
357: }
358: PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);
359: MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);
360: PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);
361: return(0);
362: }
364: typedef struct {
365: Mat workB;
366: PetscScalar *rvalues,*svalues;
367: MPI_Request *rwaits,*swaits;
368: } MPIAIJ_MPIDense;
372: PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
373: {
374: MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx;
375: PetscErrorCode ierr;
378: MatDestroy(&contents->workB);
379: PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);
380: PetscFree(contents);
381: return(0);
382: }
386: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
387: {
388: PetscErrorCode ierr;
389: Mat_MPIAIJ *aij = (Mat_MPIAIJ*) A->data;
390: PetscInt nz = aij->B->cmap->n;
391: PetscContainer container;
392: MPIAIJ_MPIDense *contents;
393: VecScatter ctx = aij->Mvctx;
394: VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
395: VecScatter_MPI_General *to = (VecScatter_MPI_General*) ctx->todata;
396: PetscInt m = A->rmap->n,n=B->cmap->n;
399: MatCreate(PetscObjectComm((PetscObject)B),C);
400: MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);
401: MatSetBlockSizes(*C,A->rmap->bs,B->cmap->bs);
402: MatSetType(*C,MATMPIDENSE);
403: MatMPIDenseSetPreallocation(*C,NULL);
404: MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);
405: MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);
407: (*C)->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense;
409: PetscNew(MPIAIJ_MPIDense,&contents);
410: /* Create work matrix used to store off processor rows of B needed for local product */
411: MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);
412: /* Create work arrays needed */
413: PetscMalloc4(B->cmap->N*from->starts[from->n],PetscScalar,&contents->rvalues,
414: B->cmap->N*to->starts[to->n],PetscScalar,&contents->svalues,
415: from->n,MPI_Request,&contents->rwaits,
416: to->n,MPI_Request,&contents->swaits);
418: PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);
419: PetscContainerSetPointer(container,contents);
420: PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);
421: PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);
422: PetscContainerDestroy(&container);
423: return(0);
424: }
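/*
   A sketch of the attach/query pattern used for "workB" above: a private work
   struct is stashed on the product matrix with a PetscContainer so the numeric
   phase can retrieve it by name.  The payload struct MyWork and the key "mykey"
   are hypothetical; the container calls mirror the ones in this file.
*/
#include <petscmat.h>

typedef struct { PetscInt cached_n; } MyWork;

static PetscErrorCode MyWorkDestroy(void *ctx)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscFree(ctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode AttachAndQuery(Mat C)
{
  PetscErrorCode ierr;
  MyWork         *work;
  PetscContainer container;

  PetscFunctionBegin;
  ierr = PetscNew(MyWork,&work);CHKERRQ(ierr);
  work->cached_n = 42;
  ierr = PetscContainerCreate(PetscObjectComm((PetscObject)C),&container);CHKERRQ(ierr);
  ierr = PetscContainerSetPointer(container,work);CHKERRQ(ierr);
  ierr = PetscContainerSetUserDestroy(container,MyWorkDestroy);CHKERRQ(ierr);
  ierr = PetscObjectCompose((PetscObject)C,"mykey",(PetscObject)container);CHKERRQ(ierr);
  ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);  /* C now holds the only reference */

  /* later (e.g. in the numeric phase) the payload is recovered by name */
  ierr = PetscObjectQuery((PetscObject)C,"mykey",(PetscObject*)&container);CHKERRQ(ierr);
  if (!container) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Container does not exist");
  ierr = PetscContainerGetPointer(container,(void**)&work);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}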
428: /*
429: Performs an efficient scatter on the rows of B needed by this process; this is
430: a modification of the VecScatterBegin_() routines.
431: */
432: PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB)
433: {
434: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
435: PetscErrorCode ierr;
436: PetscScalar *b,*w,*svalues,*rvalues;
437: VecScatter ctx = aij->Mvctx;
438: VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
439: VecScatter_MPI_General *to = (VecScatter_MPI_General*) ctx->todata;
440: PetscInt i,j,k;
441: PetscInt *sindices,*sstarts,*rindices,*rstarts;
442: PetscMPIInt *sprocs,*rprocs,nrecvs;
443: MPI_Request *swaits,*rwaits;
444: MPI_Comm comm;
445: PetscMPIInt tag = ((PetscObject)ctx)->tag,ncols = B->cmap->N, nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n;
446: MPI_Status status;
447: MPIAIJ_MPIDense *contents;
448: PetscContainer container;
449: Mat workB;
452: PetscObjectGetComm((PetscObject)A,&comm);
453: PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);
454: if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
455: PetscContainerGetPointer(container,(void**)&contents);
457: workB = *outworkB = contents->workB;
458: if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of rows of workB %D not equal to columns of aij->B %D",workB->rmap->n,nrows);
459: sindices = to->indices;
460: sstarts = to->starts;
461: sprocs = to->procs;
462: swaits = contents->swaits;
463: svalues = contents->svalues;
465: rindices = from->indices;
466: rstarts = from->starts;
467: rprocs = from->procs;
468: rwaits = contents->rwaits;
469: rvalues = contents->rvalues;
471: MatDenseGetArray(B,&b);
472: MatDenseGetArray(workB,&w);
474: for (i=0; i<from->n; i++) {
475: MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);
476: }
478: for (i=0; i<to->n; i++) {
479: /* pack a message at a time */
480: for (j=0; j<sstarts[i+1]-sstarts[i]; j++) {
481: for (k=0; k<ncols; k++) {
482: svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k];
483: }
484: }
485: MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);
486: }
488: nrecvs = from->n;
489: while (nrecvs) {
490: MPI_Waitany(from->n,rwaits,&imdex,&status);
491: nrecvs--;
492: /* unpack a message at a time */
493: for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++) {
494: for (k=0; k<ncols; k++) {
495: w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k];
496: }
497: }
498: }
499: if (to->n) {MPI_Waitall(to->n,swaits,to->sstatus);}
501: MatDenseRestoreArray(B,&b);
502: MatDenseRestoreArray(workB,&w);
503: MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);
504: MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);
505: return(0);
506: }
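/*
   A standalone sketch of the packing rule used in the send loop above: MPIDense
   stores its local block column-major, so entry (i,j) of an m-by-n block lives
   at b[i + m*j], and packing row r for a message means striding through b by m.
   The sizes and values are made up.
*/
#include <stdio.h>

int main(void)
{
  enum {m = 4, n = 3};            /* local rows and global columns of the dense block */
  double b[m*n],packed[n];
  int    i,j,r = 2;               /* row to pack */

  for (j=0; j<n; j++) for (i=0; i<m; i++) b[i + m*j] = 10*i + j;  /* tag entries with (i,j) */

  for (j=0; j<n; j++) packed[j] = b[r + m*j];   /* same stride as the svalues pack loop */

  for (j=0; j<n; j++) printf("B(%d,%d) = %g\n",r,j,packed[j]);
  return 0;
}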
507: extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);
511: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
512: {
514: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
515: Mat_MPIDense *bdense = (Mat_MPIDense*)B->data;
516: Mat_MPIDense *cdense = (Mat_MPIDense*)C->data;
517: Mat workB;
520: /* diagonal block of A times all local rows of B */
521: MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);
523: /* get off processor parts of B needed to complete the product */
524: MatMPIDenseScatter(A,B,C,&workB);
526: /* off-diagonal block of A times nonlocal rows of B */
527: MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);
528: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
529: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
530: return(0);
531: }
535: PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable(Mat A,Mat P,Mat C)
536: {
538: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
539: Mat_SeqAIJ *ad = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
540: Mat_SeqAIJ *cd = (Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
541: PetscInt *adi = ad->i,*adj,*aoi=ao->i,*aoj;
542: PetscScalar *ada,*aoa,*cda=cd->a,*coa=co->a;
543: Mat_SeqAIJ *p_loc,*p_oth;
544: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
545: PetscScalar *pa_loc,*pa_oth,*pa,valtmp,*ca;
546: PetscInt cm = C->rmap->n,anz,pnz;
547: Mat_PtAPMPI *ptap = c->ptap;
548: PetscScalar *apa_sparse = ptap->apa;
549: PetscInt *api,*apj,*apJ,i,j,k,row;
550: PetscInt cstart = C->cmap->rstart;
551: PetscInt cdnz,conz,k0,k1,nextp;
554: /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
555: /*-----------------------------------------------------*/
556: /* update numerical values of P_oth and P_loc */
557: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
558: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
560: /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
561: /*----------------------------------------------------------*/
562: /* get data from symbolic products */
563: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
564: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
565: pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
566: pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
568: api = ptap->api;
569: apj = ptap->apj;
570: for (i=0; i<cm; i++) {
571: apJ = apj + api[i];
573: /* diagonal portion of A */
574: anz = adi[i+1] - adi[i];
575: adj = ad->j + adi[i];
576: ada = ad->a + adi[i];
577: for (j=0; j<anz; j++) {
578: row = adj[j];
579: pnz = pi_loc[row+1] - pi_loc[row];
580: pj = pj_loc + pi_loc[row];
581: pa = pa_loc + pi_loc[row];
582: /* perform sparse axpy */
583: valtmp = ada[j];
584: nextp = 0;
585: for (k=0; nextp<pnz; k++) {
586: if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
587: apa_sparse[k] += valtmp*pa[nextp++];
588: }
589: }
590: PetscLogFlops(2.0*pnz);
591: }
593: /* off-diagonal portion of A */
594: anz = aoi[i+1] - aoi[i];
595: aoj = ao->j + aoi[i];
596: aoa = ao->a + aoi[i];
597: for (j=0; j<anz; j++) {
598: row = aoj[j];
599: pnz = pi_oth[row+1] - pi_oth[row];
600: pj = pj_oth + pi_oth[row];
601: pa = pa_oth + pi_oth[row];
602: /* perform sparse axpy */
603: valtmp = aoa[j];
604: nextp = 0;
605: for (k=0; nextp<pnz; k++) {
606: if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
607: apa_sparse[k] += valtmp*pa[nextp++];
608: }
609: }
610: PetscLogFlops(2.0*pnz);
611: }
613: /* set values in C */
614: cdnz = cd->i[i+1] - cd->i[i];
615: conz = co->i[i+1] - co->i[i];
617: /* 1st off-diagonal part of C */
618: ca = coa + co->i[i];
619: k = 0;
620: for (k0=0; k0<conz; k0++) {
621: if (apJ[k] >= cstart) break;
622: ca[k0] = apa_sparse[k];
623: apa_sparse[k] = 0.0;
624: k++;
625: }
627: /* diagonal part of C */
628: ca = cda + cd->i[i];
629: for (k1=0; k1<cdnz; k1++) {
630: ca[k1] = apa_sparse[k];
631: apa_sparse[k] = 0.0;
632: k++;
633: }
635: /* 2nd off-diagonal part of C */
636: ca = coa + co->i[i];
637: for (; k0<conz; k0++) {
638: ca[k0] = apa_sparse[k];
639: apa_sparse[k] = 0.0;
640: k++;
641: }
642: }
643: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
644: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
645: return(0);
646: }
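/*
   A standalone sketch of the "sparse axpy" used above: valtmp*P[row,:] is
   scattered into the compressed row of C by walking two sorted column lists in
   step.  This relies on the columns of P[row,:] being a subset of apJ, which
   the symbolic phase guarantees.  The data below are made up.
*/
#include <stdio.h>

int main(void)
{
  int    apJ[] = {1,3,5,7};                  /* sorted columns of this row of C */
  double apa_sparse[] = {0.0,0.0,0.0,0.0};   /* row values of C stored in that order */
  int    pj[]  = {3,7};                      /* sorted columns of P[row,:] */
  double pa[]  = {2.0,4.0};
  double valtmp = 0.5;                       /* the entry of A multiplying this row of P */
  int    pnz = 2,k,nextp = 0;

  for (k=0; nextp<pnz; k++) {
    if (apJ[k] == pj[nextp]) apa_sparse[k] += valtmp*pa[nextp++];
  }
  for (k=0; k<4; k++) printf("col %d: %g\n",apJ[k],apa_sparse[k]);
  return 0;
}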
648: /* Same as MatMatMultSymbolic_MPIAIJ_MPIAIJ(), except it uses the scalable condensed linked list (and a compressed work array) to avoid the O(N) memory requirement, where N is the global number of columns of B */
651: PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(Mat A,Mat P,PetscReal fill,Mat *C)
652: {
653: PetscErrorCode ierr;
654: MPI_Comm comm;
655: Mat Cmpi;
656: Mat_PtAPMPI *ptap;
657: PetscFreeSpaceList free_space = NULL,current_space=NULL;
658: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data,*c;
659: Mat_SeqAIJ *ad = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
660: PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
661: PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
662: PetscInt i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max=0;
663: PetscInt am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
664: PetscInt nlnk_max,armax,prmax;
665: PetscReal afill;
666: PetscScalar *apa;
669: PetscObjectGetComm((PetscObject)A,&comm);
670: /* create struct Mat_PtAPMPI and attach it to C later */
671: PetscNew(Mat_PtAPMPI,&ptap);
673: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
674: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
676: /* get P_loc by taking all local rows of P */
677: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
679: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
680: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
681: pi_loc = p_loc->i; pj_loc = p_loc->j;
682: pi_oth = p_oth->i; pj_oth = p_oth->j;
684: /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
685: /*-------------------------------------------------------------------*/
686: PetscMalloc((am+2)*sizeof(PetscInt),&api);
687: ptap->api = api;
688: api[0] = 0;
690: /* create and initialize a linked list */
691: armax = ad->rmax+ao->rmax;
692: prmax = PetscMax(p_loc->rmax,p_oth->rmax);
693: nlnk_max = armax*prmax;
694: if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
695: PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);
697: /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
698: PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);
700: current_space = free_space;
702: MatPreallocateInitialize(comm,am,pn,dnz,onz);
703: for (i=0; i<am; i++) {
704: /* diagonal portion of A */
705: nzi = adi[i+1] - adi[i];
706: for (j=0; j<nzi; j++) {
707: row = *adj++;
708: pnz = pi_loc[row+1] - pi_loc[row];
709: Jptr = pj_loc + pi_loc[row];
710: /* add non-zero cols of P into the sorted linked list lnk */
711: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
712: }
713: /* off-diagonal portion of A */
714: nzi = aoi[i+1] - aoi[i];
715: for (j=0; j<nzi; j++) {
716: row = *aoj++;
717: pnz = pi_oth[row+1] - pi_oth[row];
718: Jptr = pj_oth + pi_oth[row];
719: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
720: }
722: apnz = *lnk;
723: api[i+1] = api[i] + apnz;
724: if (apnz > apnz_max) apnz_max = apnz;
726: /* if free space is not available, double the total space in the list */
727: if (current_space->local_remaining<apnz) {
728: PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);
729: nspacedouble++;
730: }
732: /* Copy data into free space, then initialize lnk */
733: PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);
734: MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);
736: current_space->array += apnz;
737: current_space->local_used += apnz;
738: current_space->local_remaining -= apnz;
739: }
741: /* Allocate space for apj, initialize apj, and */
742: /* destroy list of free space and other temporary array(s) */
743: PetscMalloc((api[am]+1)*sizeof(PetscInt),&ptap->apj);
744: apj = ptap->apj;
745: PetscFreeSpaceContiguous(&free_space,ptap->apj);
746: PetscLLCondensedDestroy_Scalable(lnk);
748: /* create and assemble symbolic parallel matrix Cmpi */
749: /*----------------------------------------------------*/
750: MatCreate(comm,&Cmpi);
751: MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);
752: MatSetBlockSizes(Cmpi,A->rmap->bs,P->cmap->bs);
753: MatSetType(Cmpi,MATMPIAIJ);
754: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
755: MatPreallocateFinalize(dnz,onz);
757: /* malloc apa for assembling Cmpi */
758: PetscMalloc(apnz_max*sizeof(PetscScalar),&apa);
759: PetscMemzero(apa,apnz_max*sizeof(PetscScalar));
761: ptap->apa = apa;
762: for (i=0; i<am; i++) {
763: row = i + rstart;
764: apnz = api[i+1] - api[i];
765: MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);
766: apj += apnz;
767: }
768: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
769: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
771: ptap->destroy = Cmpi->ops->destroy;
772: ptap->duplicate = Cmpi->ops->duplicate;
773: Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable;
774: Cmpi->ops->destroy = MatDestroy_MPIAIJ_MatMatMult;
775: Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult;
777: /* attach the supporting struct to Cmpi for reuse */
778: c = (Mat_MPIAIJ*)Cmpi->data;
779: c->ptap = ptap;
781: *C = Cmpi;
783: /* set MatInfo */
784: afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5;
785: if (afill < 1.0) afill = 1.0;
786: Cmpi->info.mallocs = nspacedouble;
787: Cmpi->info.fill_ratio_given = fill;
788: Cmpi->info.fill_ratio_needed = afill;
790: #if defined(PETSC_USE_INFO)
791: if (api[am]) {
792: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);
793: PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);
794: } else {
795: PetscInfo(Cmpi,"Empty matrix product\n");
796: }
797: #endif
798: return(0);
799: }
801: /*-------------------------------------------------------------------------*/
804: PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
805: {
807: PetscBool scalable=PETSC_TRUE,viamatmatmult=PETSC_FALSE;
810: PetscOptionsBool("-mattransposematmult_viamatmatmult","Use R=Pt and C=R*A","",viamatmatmult,&viamatmatmult,NULL);
811: if (viamatmatmult) {
812: Mat Pt;
813: Mat_PtAPMPI *ptap;
814: Mat_MPIAIJ *c;
815: if (scall == MAT_INITIAL_MATRIX) {
816: MatTranspose(P,MAT_INITIAL_MATRIX,&Pt);
817: MatMatMult(Pt,A,MAT_INITIAL_MATRIX,fill,C);
819: c = (Mat_MPIAIJ*)(*C)->data;
820: ptap = c->ptap;
821: ptap->Pt = Pt;
822: } else if (scall == MAT_REUSE_MATRIX) {
823: c = (Mat_MPIAIJ*)(*C)->data;
824: ptap = c->ptap;
825: Pt = ptap->Pt;
826: MatTranspose(P,scall,&Pt);
827: MatMatMult(Pt,A,scall,fill,C);
828: } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not supported");
829: return(0);
830: }
832: if (scall == MAT_INITIAL_MATRIX) {
833: PetscObjectOptionsBegin((PetscObject)A);
834: PetscOptionsBool("-mattransposematmult_scalable","Use a scalable but slower C=Pt*A","",scalable,&scalable,NULL);
835: if (scalable) {
836: MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(P,A,fill,C);
837: } else {
838: MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);
839: }
840: PetscOptionsEnd();
841: }
842: (*(*C)->ops->mattransposemultnumeric)(P,A,*C);
843: return(0);
844: }
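/*
   A sketch of forming C = P^T * A through the public interface that dispatches
   to the routines above.  P and A are assumed to be assembled MATMPIAIJ
   matrices with matching row layouts; the run-time options
   -mattransposematmult_viamatmatmult and -mattransposematmult_scalable read in
   this file select how the symbolic product is built.
*/
#include <petscmat.h>

PetscErrorCode TransposeProduct(Mat P,Mat A,Mat *C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatTransposeMatMult(P,A,MAT_INITIAL_MATRIX,PETSC_DEFAULT,C);CHKERRQ(ierr);
  /* after updating the values of P or A, reuse the symbolic data kept in C */
  ierr = MatTransposeMatMult(P,A,MAT_REUSE_MATRIX,PETSC_DEFAULT,C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}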
848: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
849: {
850: PetscErrorCode ierr;
851: Mat_Merge_SeqsToMPI *merge;
852: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
853: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
854: Mat_PtAPMPI *ptap;
855: PetscInt *adj,*aJ;
856: PetscInt i,j,k,anz,pnz,row,*cj;
857: MatScalar *ada,*aval,*ca,valtmp;
858: PetscInt am =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
859: MPI_Comm comm;
860: PetscMPIInt size,rank,taga,*len_s;
861: PetscInt *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
862: PetscInt **buf_ri,**buf_rj;
863: PetscInt cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
864: MPI_Request *s_waits,*r_waits;
865: MPI_Status *status;
866: MatScalar **abuf_r,*ba_i,*pA,*coa,*ba;
867: PetscInt *ai,*aj,*coi,*coj;
868: PetscInt *poJ,*pdJ;
869: Mat A_loc;
870: Mat_SeqAIJ *a_loc;
873: PetscObjectGetComm((PetscObject)C,&comm);
874: MPI_Comm_size(comm,&size);
875: MPI_Comm_rank(comm,&rank);
877: ptap = c->ptap;
878: merge = ptap->merge;
880: /* 2) compute numeric C_seq = P_loc^T*A_loc - dominating part */
881: /*--------------------------------------------------------------*/
882: /* get data from symbolic products */
883: coi = merge->coi; coj = merge->coj;
884: PetscMalloc((coi[pon]+1)*sizeof(MatScalar),&coa);
885: PetscMemzero(coa,coi[pon]*sizeof(MatScalar));
887: bi = merge->bi; bj = merge->bj;
888: owners = merge->rowmap->range;
889: PetscMalloc((bi[cm]+1)*sizeof(MatScalar),&ba);
890: PetscMemzero(ba,bi[cm]*sizeof(MatScalar));
892: /* get A_loc by taking all local rows of A */
893: A_loc = ptap->A_loc;
894: MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);
895: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
896: ai = a_loc->i;
897: aj = a_loc->j;
899: PetscMalloc((A->cmap->N)*sizeof(PetscScalar),&aval); /* non-scalable!!! */
900: PetscMemzero(aval,A->cmap->N*sizeof(PetscScalar));
902: for (i=0; i<am; i++) {
903: /* 2-a) put A[i,:] to dense array aval */
904: anz = ai[i+1] - ai[i];
905: adj = aj + ai[i];
906: ada = a_loc->a + ai[i];
907: for (j=0; j<anz; j++) {
908: aval[adj[j]] = ada[j];
909: }
911: /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
912: /*--------------------------------------------------------------*/
913: /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
914: pnz = po->i[i+1] - po->i[i];
915: poJ = po->j + po->i[i];
916: pA = po->a + po->i[i];
917: for (j=0; j<pnz; j++) {
918: row = poJ[j];
919: cnz = coi[row+1] - coi[row];
920: cj = coj + coi[row];
921: ca = coa + coi[row];
922: /* perform dense axpy */
923: valtmp = pA[j];
924: for (k=0; k<cnz; k++) {
925: ca[k] += valtmp*aval[cj[k]];
926: }
927: PetscLogFlops(2.0*cnz);
928: }
930: /* put the value into Cd (diagonal part) */
931: pnz = pd->i[i+1] - pd->i[i];
932: pdJ = pd->j + pd->i[i];
933: pA = pd->a + pd->i[i];
934: for (j=0; j<pnz; j++) {
935: row = pdJ[j];
936: cnz = bi[row+1] - bi[row];
937: cj = bj + bi[row];
938: ca = ba + bi[row];
939: /* perform dense axpy */
940: valtmp = pA[j];
941: for (k=0; k<cnz; k++) {
942: ca[k] += valtmp*aval[cj[k]];
943: }
944: PetscLogFlops(2.0*cnz);
945: }
947: /* zero out the dense copy of the current row of A */
948: aJ = aj + ai[i];
949: for (k=0; k<anz; k++) aval[aJ[k]] = 0.0;
950: }
952: /* 3) send and recv matrix values coa */
953: /*------------------------------------*/
954: buf_ri = merge->buf_ri;
955: buf_rj = merge->buf_rj;
956: len_s = merge->len_s;
957: PetscCommGetNewTag(comm,&taga);
958: PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);
960: PetscMalloc2(merge->nsend+1,MPI_Request,&s_waits,size,MPI_Status,&status);
961: for (proc=0,k=0; proc<size; proc++) {
962: if (!len_s[proc]) continue;
963: i = merge->owners_co[proc];
964: MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);
965: k++;
966: }
967: if (merge->nrecv) {MPI_Waitall(merge->nrecv,r_waits,status);}
968: if (merge->nsend) {MPI_Waitall(merge->nsend,s_waits,status);}
970: PetscFree2(s_waits,status);
971: PetscFree(r_waits);
972: PetscFree(coa);
974: /* 4) insert local Cseq and received values into Cmpi */
975: /*----------------------------------------------------*/
976: PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);
977: for (k=0; k<merge->nrecv; k++) {
978: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
979: nrows = *(buf_ri_k[k]);
980: nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th received i-structure */
981: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
982: }
984: for (i=0; i<cm; i++) {
985: row = owners[rank] + i; /* global row index of C_seq */
986: bj_i = bj + bi[i]; /* col indices of the i-th row of C */
987: ba_i = ba + bi[i];
988: bnz = bi[i+1] - bi[i];
989: /* add received vals into ba */
990: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
991: /* i-th row */
992: if (i == *nextrow[k]) {
993: cnz = *(nextci[k]+1) - *nextci[k];
994: cj = buf_rj[k] + *(nextci[k]);
995: ca = abuf_r[k] + *(nextci[k]);
996: nextcj = 0;
997: for (j=0; nextcj<cnz; j++) {
998: if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
999: ba_i[j] += ca[nextcj++];
1000: }
1001: }
1002: nextrow[k]++; nextci[k]++;
1003: PetscLogFlops(2.0*cnz);
1004: }
1005: }
1006: MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);
1007: }
1008: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1009: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1011: PetscFree(ba);
1012: PetscFree(abuf_r[0]);
1013: PetscFree(abuf_r);
1014: PetscFree3(buf_ri_k,nextrow,nextci);
1015: PetscFree(aval);
1016: return(0);
1017: }
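/*
   A standalone sketch of the outer-product update above: for a single row i,
   every nonzero P[i,row] adds P[i,row]*A[i,:] into row `row` of C = P^T*A,
   gathered through the column pattern cj that the symbolic phase produced for
   that row of C.  All data below are made up.
*/
#include <stdio.h>

int main(void)
{
  double aval[4] = {0.0,0.0,0.0,0.0};  /* dense scatter of A[i,:] over 4 global columns */
  int    adj[]   = {1,3};              /* nonzero columns of A[i,:] */
  double ada[]   = {5.0,6.0};
  int    pdJ[]   = {2};                /* nonzero columns of P[i,:] == target rows of C */
  double pA[]    = {0.5};
  int    cj[]    = {1,3};              /* column pattern of row pdJ[0] of C */
  double ca[]    = {0.0,0.0};
  int    j,k;

  for (j=0; j<2; j++) aval[adj[j]] = ada[j];     /* 2-a) densify A[i,:] */

  for (j=0; j<1; j++) {                          /* 2-b) outer-product axpy into row pdJ[j] of C */
    for (k=0; k<2; k++) ca[k] += pA[j]*aval[cj[k]];
  }
  for (k=0; k<2; k++) printf("C(%d,%d) += %g\n",pdJ[0],cj[k],ca[k]);

  for (j=0; j<2; j++) aval[adj[j]] = 0.0;        /* zero the dense row before the next i */
  return 0;
}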
1019: /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
1022: PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
1023: {
1024: PetscErrorCode ierr;
1025: Mat Cmpi,A_loc,POt,PDt;
1026: Mat_PtAPMPI *ptap;
1027: PetscFreeSpaceList free_space=NULL,current_space=NULL;
1028: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c;
1029: PetscInt *pdti,*pdtj,*poti,*potj,*ptJ;
1030: PetscInt nnz;
1031: PetscInt *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1032: PetscInt am=A->rmap->n,pn=P->cmap->n;
1033: PetscBT lnkbt;
1034: MPI_Comm comm;
1035: PetscMPIInt size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1036: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
1037: PetscInt len,proc,*dnz,*onz,*owners;
1038: PetscInt nzi,*bi,*bj;
1039: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1040: MPI_Request *swaits,*rwaits;
1041: MPI_Status *sstatus,rstatus;
1042: Mat_Merge_SeqsToMPI *merge;
1043: PetscInt *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1044: PetscReal afill =1.0,afill_tmp;
1045: PetscInt rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
1046: PetscScalar *vals;
1047: Mat_SeqAIJ *a_loc, *pdt,*pot;
1050: PetscObjectGetComm((PetscObject)A,&comm);
1051: /* check if matrix local sizes are compatible */
1052: if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
1053: SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1054: }
1056: MPI_Comm_size(comm,&size);
1057: MPI_Comm_rank(comm,&rank);
1059: /* create struct Mat_PtAPMPI and attach it to C later */
1060: PetscNew(Mat_PtAPMPI,&ptap);
1062: /* get A_loc by taking all local rows of A */
1063: MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);
1065: ptap->A_loc = A_loc;
1067: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1068: ai = a_loc->i;
1069: aj = a_loc->j;
1071: /* determine symbolic Co=(p->B)^T*A - send to others */
1072: /*----------------------------------------------------*/
1073: MatTransposeSymbolic_SeqAIJ(p->A,&PDt);
1074: pdt = (Mat_SeqAIJ*)PDt->data;
1075: pdti = pdt->i; pdtj = pdt->j;
1077: MatTransposeSymbolic_SeqAIJ(p->B,&POt);
1078: pot = (Mat_SeqAIJ*)POt->data;
1079: poti = pot->i; potj = pot->j;
1081: /* then, compute symbolic Co = (p->B)^T*A */
1082: pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors >= (num of nonzero rows of C_seq) - pn */
1083: PetscMalloc((pon+1)*sizeof(PetscInt),&coi);
1084: coi[0] = 0;
1086: /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1087: nnz = fill*(poti[pon] + ai[am]);
1088: PetscFreeSpaceGet(nnz,&free_space);
1089: current_space = free_space;
1091: /* create and initialize a linked list */
1092: i = PetscMax(pdt->rmax,pot->rmax);
1093: Crmax = i*a_loc->rmax*size;
1094: if (!Crmax || Crmax > aN) Crmax = aN;
1095: PetscLLCondensedCreate(Crmax,aN,&lnk,&lnkbt);
1097: for (i=0; i<pon; i++) {
1098: pnz = poti[i+1] - poti[i];
1099: ptJ = potj + poti[i];
1100: for (j=0; j<pnz; j++) {
1101: row = ptJ[j]; /* row of A_loc == col of Pot */
1102: anz = ai[row+1] - ai[row];
1103: Jptr = aj + ai[row];
1104: /* add non-zero cols of AP into the sorted linked list lnk */
1105: PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);
1106: }
1107: nnz = lnk[0];
1109: /* If free space is not available, double the total space in the list */
1110: if (current_space->local_remaining<nnz) {
1111: PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);
1112: nspacedouble++;
1113: }
1115: /* Copy data into free space, then initialize lnk */
1116: PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);
1118: current_space->array += nnz;
1119: current_space->local_used += nnz;
1120: current_space->local_remaining -= nnz;
1122: coi[i+1] = coi[i] + nnz;
1123: }
1125: PetscMalloc((coi[pon]+1)*sizeof(PetscInt),&coj);
1126: PetscFreeSpaceContiguous(&free_space,coj);
1128: afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1129: if (afill_tmp > afill) afill = afill_tmp;
1131: /* send j-array (coj) of Co to other processors */
1132: /*----------------------------------------------*/
1133: /* determine row ownership */
1134: PetscNew(Mat_Merge_SeqsToMPI,&merge);
1135: PetscLayoutCreate(comm,&merge->rowmap);
1137: merge->rowmap->n = pn;
1138: merge->rowmap->bs = 1;
1140: PetscLayoutSetUp(merge->rowmap);
1141: owners = merge->rowmap->range;
1143: /* determine the number of messages to send, their lengths */
1144: PetscMalloc(size*sizeof(PetscMPIInt),&len_si);
1145: PetscMemzero(len_si,size*sizeof(PetscMPIInt));
1146: PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);
1148: len_s = merge->len_s;
1149: merge->nsend = 0;
1151: PetscMalloc((size+2)*sizeof(PetscInt),&owners_co);
1152: PetscMemzero(len_s,size*sizeof(PetscMPIInt));
1154: proc = 0;
1155: for (i=0; i<pon; i++) {
1156: while (prmap[i] >= owners[proc+1]) proc++;
1157: len_si[proc]++; /* num of rows in Co to be sent to [proc] */
1158: len_s[proc] += coi[i+1] - coi[i];
1159: }
1161: len = 0; /* total length of buf_s[], i.e. of all outgoing i-structure messages */
1162: owners_co[0] = 0;
1163: for (proc=0; proc<size; proc++) {
1164: owners_co[proc+1] = owners_co[proc] + len_si[proc];
1165: if (len_si[proc]) {
1166: merge->nsend++;
1167: len_si[proc] = 2*(len_si[proc] + 1);
1168: len += len_si[proc];
1169: }
1170: }
1172: /* determine the number and length of messages to receive for coi and coj */
1173: PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);
1174: PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);
1176: /* post the Irecv and Isend of coj */
1177: PetscCommGetNewTag(comm,&tagj);
1178: PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);
1179: PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&swaits);
1180: for (proc=0, k=0; proc<size; proc++) {
1181: if (!len_s[proc]) continue;
1182: i = owners_co[proc];
1183: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1184: k++;
1185: }
1187: /* receives and sends of coj are complete */
1188: PetscMalloc(size*sizeof(MPI_Status),&sstatus);
1189: for (i=0; i<merge->nrecv; i++) {
1190: PetscMPIInt icompleted;
1191: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1192: }
1193: PetscFree(rwaits);
1194: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1196: /* send and recv coi */
1197: /*-------------------*/
1198: PetscCommGetNewTag(comm,&tagi);
1199: PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);
1200: PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);
1201: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1202: for (proc=0,k=0; proc<size; proc++) {
1203: if (!len_s[proc]) continue;
1204: /* form outgoing message for i-structure:
1205: buf_si[0]: nrows to be sent
1206: [1:nrows]: row index (global)
1207: [nrows+1:2*nrows+1]: i-structure index
1208: */
1209: /*-------------------------------------------*/
1210: nrows = len_si[proc]/2 - 1;
1211: buf_si_i = buf_si + nrows+1;
1212: buf_si[0] = nrows;
1213: buf_si_i[0] = 0;
1214: nrows = 0;
1215: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1216: nzi = coi[i+1] - coi[i];
1217: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1218: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
1219: nrows++;
1220: }
1221: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1222: k++;
1223: buf_si += len_si[proc];
1224: }
1225: i = merge->nrecv;
1226: while (i--) {
1227: PetscMPIInt icompleted;
1228: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1229: }
1230: PetscFree(rwaits);
1231: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1232: PetscFree(len_si);
1233: PetscFree(len_ri);
1234: PetscFree(swaits);
1235: PetscFree(sstatus);
1236: PetscFree(buf_s);
1238: /* compute the local portion of C (mpi mat) */
1239: /*------------------------------------------*/
1240: /* allocate bi array and free space for accumulating nonzero column info */
1241: PetscMalloc((pn+1)*sizeof(PetscInt),&bi);
1242: bi[0] = 0;
1244: /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1245: nnz = fill*(pdti[pn] + poti[pon] + ai[am]);
1246: PetscFreeSpaceGet(nnz,&free_space);
1247: current_space = free_space;
1249: PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);
1250: for (k=0; k<merge->nrecv; k++) {
1251: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1252: nrows = *buf_ri_k[k];
1253: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th received i-structure */
1254: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1255: }
1257: MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);
1258: rmax = 0;
1259: for (i=0; i<pn; i++) {
1260: /* add pdt[i,:]*AP into lnk */
1261: pnz = pdti[i+1] - pdti[i];
1262: ptJ = pdtj + pdti[i];
1263: for (j=0; j<pnz; j++) {
1264: row = ptJ[j]; /* row of AP == col of Pt */
1265: anz = ai[row+1] - ai[row];
1266: Jptr = aj + ai[row];
1267: /* add non-zero cols of AP into the sorted linked list lnk */
1268: PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);
1269: }
1271: /* add received col data into lnk */
1272: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1273: if (i == *nextrow[k]) { /* i-th row */
1274: nzi = *(nextci[k]+1) - *nextci[k];
1275: Jptr = buf_rj[k] + *nextci[k];
1276: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
1277: nextrow[k]++; nextci[k]++;
1278: }
1279: }
1280: nnz = lnk[0];
1282: /* if free space is not available, make more free space */
1283: if (current_space->local_remaining<nnz) {
1284: PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);
1285: nspacedouble++;
1286: }
1287: /* copy data into free space, then initialize lnk */
1288: PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);
1289: MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);
1291: current_space->array += nnz;
1292: current_space->local_used += nnz;
1293: current_space->local_remaining -= nnz;
1295: bi[i+1] = bi[i] + nnz;
1296: if (nnz > rmax) rmax = nnz;
1297: }
1298: PetscFree3(buf_ri_k,nextrow,nextci);
1300: PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);
1301: PetscFreeSpaceContiguous(&free_space,bj);
1303: afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1304: if (afill_tmp > afill) afill = afill_tmp;
1305: PetscLLCondensedDestroy(lnk,lnkbt);
1306: MatDestroy(&POt);
1307: MatDestroy(&PDt);
1309: /* create symbolic parallel matrix Cmpi -- why can it not be assembled in the numeric part? */
1310: /*----------------------------------------------------------------------------------*/
1311: PetscMalloc((rmax+1)*sizeof(PetscScalar),&vals);
1312: PetscMemzero(vals,rmax*sizeof(PetscScalar));
1314: MatCreate(comm,&Cmpi);
1315: MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
1316: MatSetBlockSizes(Cmpi,P->cmap->bs,A->cmap->bs);
1317: MatSetType(Cmpi,MATMPIAIJ);
1318: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1319: MatPreallocateFinalize(dnz,onz);
1320: MatSetBlockSize(Cmpi,1);
1321: for (i=0; i<pn; i++) {
1322: row = i + rstart;
1323: nnz = bi[i+1] - bi[i];
1324: Jptr = bj + bi[i];
1325: MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);
1326: }
1327: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
1328: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
1329: PetscFree(vals);
1331: merge->bi = bi;
1332: merge->bj = bj;
1333: merge->coi = coi;
1334: merge->coj = coj;
1335: merge->buf_ri = buf_ri;
1336: merge->buf_rj = buf_rj;
1337: merge->owners_co = owners_co;
1338: merge->destroy = Cmpi->ops->destroy;
1339: merge->duplicate = Cmpi->ops->duplicate;
1341: Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
1342: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1344: /* attach the supporting struct to Cmpi for reuse */
1345: c = (Mat_MPIAIJ*)Cmpi->data;
1346: c->ptap = ptap;
1347: ptap->api = NULL;
1348: ptap->apj = NULL;
1349: ptap->merge = merge;
1350: ptap->rmax = rmax;
1352: *C = Cmpi;
1353: #if defined(PETSC_USE_INFO)
1354: if (bi[pn] != 0) {
1355: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);
1356: PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);
1357: } else {
1358: PetscInfo(Cmpi,"Empty matrix product\n");
1359: }
1360: #endif
1361: return(0);
1362: }
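/*
   A standalone sketch of the outgoing i-structure message assembled above:
     buf_si[0]                    = number of rows being sent
     buf_si[1 .. nrows]           = local row indices at the receiver
     buf_si[nrows+1 .. 2*nrows+1] = prefix offsets into the matching j/value buffer.
   The rows and nonzero counts below are made up.
*/
#include <stdio.h>

int main(void)
{
  int nrows    = 3;
  int rows[]   = {4,7,9};       /* local row indices at the receiver */
  int rownnz[] = {2,1,3};       /* nonzeros contributed to each of those rows */
  int buf_si[2*3+2],*buf_si_i,i;

  buf_si[0] = nrows;
  buf_si_i  = buf_si + nrows + 1;     /* start of the offset (i-structure) part */
  buf_si_i[0] = 0;
  for (i=0; i<nrows; i++) {
    buf_si[i+1]   = rows[i];
    buf_si_i[i+1] = buf_si_i[i] + rownnz[i];
  }

  /* the receiving side walks the message the other way around */
  for (i=0; i<buf_si[0]; i++) {
    printf("row %d receives entries [%d,%d) of the j/value buffer\n",
           buf_si[i+1],buf_si_i[i],buf_si_i[i+1]);
  }
  return 0;
}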
1366: PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable(Mat P,Mat A,Mat C)
1367: {
1368: PetscErrorCode ierr;
1369: Mat_Merge_SeqsToMPI *merge;
1370: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1371: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
1372: Mat_PtAPMPI *ptap;
1373: PetscInt *adj;
1374: PetscInt i,j,k,anz,pnz,row,*cj,nexta;
1375: MatScalar *ada,*ca,valtmp;
1376: PetscInt am =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
1377: MPI_Comm comm;
1378: PetscMPIInt size,rank,taga,*len_s;
1379: PetscInt *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
1380: PetscInt **buf_ri,**buf_rj;
1381: PetscInt cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
1382: MPI_Request *s_waits,*r_waits;
1383: MPI_Status *status;
1384: MatScalar **abuf_r,*ba_i,*pA,*coa,*ba;
1385: PetscInt *ai,*aj,*coi,*coj;
1386: PetscInt *poJ,*pdJ;
1387: Mat A_loc;
1388: Mat_SeqAIJ *a_loc;
1391: PetscObjectGetComm((PetscObject)C,&comm);
1392: MPI_Comm_size(comm,&size);
1393: MPI_Comm_rank(comm,&rank);
1395: ptap = c->ptap;
1396: merge = ptap->merge;
1398: /* 2) compute numeric C_seq = P_loc^T*A_loc */
1399: /*------------------------------------------*/
1400: /* get data from symbolic products */
1401: coi = merge->coi; coj = merge->coj;
1402: PetscMalloc((coi[pon]+1)*sizeof(MatScalar),&coa);
1403: PetscMemzero(coa,coi[pon]*sizeof(MatScalar));
1404: bi = merge->bi; bj = merge->bj;
1405: owners = merge->rowmap->range;
1406: PetscMalloc((bi[cm]+1)*sizeof(MatScalar),&ba);
1407: PetscMemzero(ba,bi[cm]*sizeof(MatScalar));
1409: /* get A_loc by taking all local rows of A */
1410: A_loc = ptap->A_loc;
1411: MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);
1412: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1413: ai = a_loc->i;
1414: aj = a_loc->j;
1416: for (i=0; i<am; i++) {
1417: anz = ai[i+1] - ai[i];
1418: adj = aj + ai[i];
1419: ada = a_loc->a + ai[i];
1421: /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
1422: /*-------------------------------------------------------------*/
1423: /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
1424: pnz = po->i[i+1] - po->i[i];
1425: poJ = po->j + po->i[i];
1426: pA = po->a + po->i[i];
1427: for (j=0; j<pnz; j++) {
1428: row = poJ[j];
1429: cj = coj + coi[row];
1430: ca = coa + coi[row];
1431: /* perform sparse axpy */
1432: nexta = 0;
1433: valtmp = pA[j];
1434: for (k=0; nexta<anz; k++) {
1435: if (cj[k] == adj[nexta]) {
1436: ca[k] += valtmp*ada[nexta];
1437: nexta++;
1438: }
1439: }
1440: PetscLogFlops(2.0*anz);
1441: }
1443: /* put the value into Cd (diagonal part) */
1444: pnz = pd->i[i+1] - pd->i[i];
1445: pdJ = pd->j + pd->i[i];
1446: pA = pd->a + pd->i[i];
1447: for (j=0; j<pnz; j++) {
1448: row = pdJ[j];
1449: cj = bj + bi[row];
1450: ca = ba + bi[row];
1451: /* perform sparse axpy */
1452: nexta = 0;
1453: valtmp = pA[j];
1454: for (k=0; nexta<anz; k++) {
1455: if (cj[k] == adj[nexta]) {
1456: ca[k] += valtmp*ada[nexta];
1457: nexta++;
1458: }
1459: }
1460: PetscLogFlops(2.0*anz);
1461: }
1462: }
1464: /* 3) send and recv matrix values coa */
1465: /*------------------------------------*/
1466: buf_ri = merge->buf_ri;
1467: buf_rj = merge->buf_rj;
1468: len_s = merge->len_s;
1469: PetscCommGetNewTag(comm,&taga);
1470: PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);
1472: PetscMalloc2(merge->nsend+1,MPI_Request,&s_waits,size,MPI_Status,&status);
1473: for (proc=0,k=0; proc<size; proc++) {
1474: if (!len_s[proc]) continue;
1475: i = merge->owners_co[proc];
1476: MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);
1477: k++;
1478: }
1479: if (merge->nrecv) {MPI_Waitall(merge->nrecv,r_waits,status);}
1480: if (merge->nsend) {MPI_Waitall(merge->nsend,s_waits,status);}
1482: PetscFree2(s_waits,status);
1483: PetscFree(r_waits);
1484: PetscFree(coa);
1486: /* 4) insert local Cseq and received values into Cmpi */
1487: /*----------------------------------------------------*/
1488: PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);
1489: for (k=0; k<merge->nrecv; k++) {
1490: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1491: nrows = *(buf_ri_k[k]);
1492: nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th received i-structure */
1493: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1494: }
1496: for (i=0; i<cm; i++) {
1497: row = owners[rank] + i; /* global row index of C_seq */
1498: bj_i = bj + bi[i]; /* col indices of the i-th row of C */
1499: ba_i = ba + bi[i];
1500: bnz = bi[i+1] - bi[i];
1501: /* add received vals into ba */
1502: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1503: /* i-th row */
1504: if (i == *nextrow[k]) {
1505: cnz = *(nextci[k]+1) - *nextci[k];
1506: cj = buf_rj[k] + *(nextci[k]);
1507: ca = abuf_r[k] + *(nextci[k]);
1508: nextcj = 0;
1509: for (j=0; nextcj<cnz; j++) {
1510: if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
1511: ba_i[j] += ca[nextcj++];
1512: }
1513: }
1514: nextrow[k]++; nextci[k]++;
1515: PetscLogFlops(2.0*cnz);
1516: }
1517: }
1518: MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);
1519: }
1520: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
1521: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1523: PetscFree(ba);
1524: PetscFree(abuf_r[0]);
1525: PetscFree(abuf_r);
1526: PetscFree3(buf_ri_k,nextrow,nextci);
1527: return(0);
1528: }
1530: /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
1533: PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(Mat P,Mat A,PetscReal fill,Mat *C)
1534: {
1535: PetscErrorCode ierr;
1536: Mat Cmpi,A_loc,POt,PDt;
1537: Mat_PtAPMPI *ptap;
1538: PetscFreeSpaceList free_space=NULL,current_space=NULL;
1539: Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c;
1540: PetscInt *pdti,*pdtj,*poti,*potj,*ptJ;
1541: PetscInt nnz;
1542: PetscInt *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
1543: PetscInt am =A->rmap->n,pn=P->cmap->n;
1544: MPI_Comm comm;
1545: PetscMPIInt size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
1546: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
1547: PetscInt len,proc,*dnz,*onz,*owners;
1548: PetscInt nzi,*bi,*bj;
1549: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1550: MPI_Request *swaits,*rwaits;
1551: MPI_Status *sstatus,rstatus;
1552: Mat_Merge_SeqsToMPI *merge;
1553: PetscInt *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
1554: PetscReal afill =1.0,afill_tmp;
1555: PetscInt rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
1556: PetscScalar *vals;
1557: Mat_SeqAIJ *a_loc, *pdt,*pot;
1560: PetscObjectGetComm((PetscObject)A,&comm);
1561: /* check if matrix local sizes are compatible */
1562: if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) {
1563: SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
1564: }
1566: MPI_Comm_size(comm,&size);
1567: MPI_Comm_rank(comm,&rank);
1569: /* create struct Mat_PtAPMPI and attach it to C later */
1570: PetscNew(Mat_PtAPMPI,&ptap);
1572: /* get A_loc by taking all local rows of A */
1573: MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);
1575: ptap->A_loc = A_loc;
1576: a_loc = (Mat_SeqAIJ*)(A_loc)->data;
1577: ai = a_loc->i;
1578: aj = a_loc->j;
1580: /* determine symbolic Co=(p->B)^T*A - send to others */
1581: /*----------------------------------------------------*/
1582: MatTransposeSymbolic_SeqAIJ(p->A,&PDt);
1583: pdt = (Mat_SeqAIJ*)PDt->data;
1584: pdti = pdt->i; pdtj = pdt->j;
1586: MatTransposeSymbolic_SeqAIJ(p->B,&POt);
1587: pot = (Mat_SeqAIJ*)POt->data;
1588: poti = pot->i; potj = pot->j;
1590: /* then, compute symbolic Co = (p->B)^T*A */
1591: pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors
1592: >= (num of nonzero rows of C_seq) - pn */
1593: PetscMalloc((pon+1)*sizeof(PetscInt),&coi);
1594: coi[0] = 0;
1596: /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
1597: nnz = fill*(poti[pon] + ai[am]);
1598: PetscFreeSpaceGet(nnz,&free_space);
1599: current_space = free_space;
1601: /* create and initialize a linked list */
1602: i = PetscMax(pdt->rmax,pot->rmax);
1603: Crmax = i*a_loc->rmax*size; /* non-scalable! */
1604: if (!Crmax || Crmax > aN) Crmax = aN;
1605: PetscLLCondensedCreate_Scalable(Crmax,&lnk);
1607: for (i=0; i<pon; i++) {
1608: pnz = poti[i+1] - poti[i];
1609: ptJ = potj + poti[i];
1610: for (j=0; j<pnz; j++) {
1611: row = ptJ[j]; /* row of A_loc == col of Pot */
1612: anz = ai[row+1] - ai[row];
1613: Jptr = aj + ai[row];
1614: /* add non-zero cols of AP into the sorted linked list lnk */
1615: PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);
1616: }
1617: nnz = lnk[0];
1619: /* If free space is not available, double the total space in the list */
1620: if (current_space->local_remaining<nnz) {
1621: PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);
1622: nspacedouble++;
1623: }
1625: /* Copy data into free space, then initialize lnk */
1626: PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);
1628: current_space->array += nnz;
1629: current_space->local_used += nnz;
1630: current_space->local_remaining -= nnz;
1632: coi[i+1] = coi[i] + nnz;
1633: }
1635: PetscMalloc((coi[pon]+1)*sizeof(PetscInt),&coj);
1636: PetscFreeSpaceContiguous(&free_space,coj);
1638: afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1);
1639: if (afill_tmp > afill) afill = afill_tmp;
1641: /* send j-array (coj) of Co to other processors */
1642: /*----------------------------------------------*/
1643: /* determine row ownership */
1644: PetscNew(Mat_Merge_SeqsToMPI,&merge);
1645: PetscLayoutCreate(comm,&merge->rowmap);
1647: merge->rowmap->n = pn;
1648: merge->rowmap->bs = 1;
1650: PetscLayoutSetUp(merge->rowmap);
1651: owners = merge->rowmap->range;
1653: /* determine the number of messages to send, their lengths */
1654: PetscMalloc(size*sizeof(PetscMPIInt),&len_si);
1655: PetscMemzero(len_si,size*sizeof(PetscMPIInt));
1656: PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);
1658: len_s = merge->len_s;
1659: merge->nsend = 0;
1661: PetscMalloc((size+2)*sizeof(PetscInt),&owners_co);
1662: PetscMemzero(len_s,size*sizeof(PetscMPIInt));
1664: proc = 0;
1665: for (i=0; i<pon; i++) {
1666: while (prmap[i] >= owners[proc+1]) proc++;
1667: len_si[proc]++; /* num of rows in Co to be sent to [proc] */
1668: len_s[proc] += coi[i+1] - coi[i];
1669: }
1671: len = 0; /* total length of buf_s[], i.e. of all outgoing i-structure messages */
1672: owners_co[0] = 0;
1673: for (proc=0; proc<size; proc++) {
1674: owners_co[proc+1] = owners_co[proc] + len_si[proc];
1675: if (len_si[proc]) {
1676: merge->nsend++;
1677: len_si[proc] = 2*(len_si[proc] + 1);
1678: len += len_si[proc];
1679: }
1680: }
1682: /* determine the number and length of messages to receive for coi and coj */
1683: PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);
1684: PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);
1686: /* post the Irecv and Isend of coj */
1687: PetscCommGetNewTag(comm,&tagj);
1688: PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);
1689: PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&swaits);
1690: for (proc=0, k=0; proc<size; proc++) {
1691: if (!len_s[proc]) continue;
1692: i = owners_co[proc];
1693: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1694: k++;
1695: }
1697: /* receives and sends of coj are complete */
1698: PetscMalloc(size*sizeof(MPI_Status),&sstatus);
1699: for (i=0; i<merge->nrecv; i++) {
1700: PetscMPIInt icompleted;
1701: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1702: }
1703: PetscFree(rwaits);
1704: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1706: /* send and recv coi */
1707: /*-------------------*/
1708: PetscCommGetNewTag(comm,&tagi);
1709: PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);
1710: PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);
1711: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1712: for (proc=0,k=0; proc<size; proc++) {
1713: if (!len_s[proc]) continue;
1714: /* form outgoing message for i-structure:
1715: buf_si[0]: nrows to be sent
1716: [1:nrows]: row index (global)
1717: [nrows+1:2*nrows+1]: i-structure index
1718: */
1719: /*-------------------------------------------*/
1720: nrows = len_si[proc]/2 - 1;
1721: buf_si_i = buf_si + nrows+1;
1722: buf_si[0] = nrows;
1723: buf_si_i[0] = 0;
1724: nrows = 0;
1725: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1726: nzi = coi[i+1] - coi[i];
1727: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1728: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
1729: nrows++;
1730: }
1731: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1732: k++;
1733: buf_si += len_si[proc];
1734: }
1735: i = merge->nrecv;
1736: while (i--) {
1737: PetscMPIInt icompleted;
1738: MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);
1739: }
1740: PetscFree(rwaits);
1741: if (merge->nsend) {MPI_Waitall(merge->nsend,swaits,sstatus);}
1742: PetscFree(len_si);
1743: PetscFree(len_ri);
1744: PetscFree(swaits);
1745: PetscFree(sstatus);
1746: PetscFree(buf_s);
1748: /* compute the local portion of C (mpi mat) */
1749: /*------------------------------------------*/
1750: /* allocate bi array and free space for accumulating nonzero column info */
1751: PetscMalloc((pn+1)*sizeof(PetscInt),&bi);
1752: bi[0] = 0;
1754: /* set initial free space to be fill*(nnz(P) + nnz(A)) */
1755: nnz = fill*(pdti[pn] + poti[pon] + ai[am]);
1756: PetscFreeSpaceGet(nnz,&free_space);
1757: current_space = free_space;
1759: PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);
1760: for (k=0; k<merge->nrecv; k++) {
1761: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1762: nrows = *buf_ri_k[k];
1763: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th received i-structure */
1764: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1765: }
1767: MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);
1768: rmax = 0;
1769: for (i=0; i<pn; i++) {
1770: /* add pdt[i,:]*AP into lnk */
1771: pnz = pdti[i+1] - pdti[i];
1772: ptJ = pdtj + pdti[i];
1773: for (j=0; j<pnz; j++) {
1774: row = ptJ[j]; /* row of AP == col of Pt */
1775: anz = ai[row+1] - ai[row];
1776: Jptr = aj + ai[row];
1777: /* add non-zero cols of AP into the sorted linked list lnk */
1778: PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);
1779: }
1781: /* add received col data into lnk */
1782: for (k=0; k<merge->nrecv; k++) { /* k-th received message */
1783: if (i == *nextrow[k]) { /* i-th row */
1784: nzi = *(nextci[k]+1) - *nextci[k];
1785: Jptr = buf_rj[k] + *nextci[k];
1786: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
1787: nextrow[k]++; nextci[k]++;
1788: }
1789: }
1790: nnz = lnk[0];
1792: /* if free space is not available, make more free space */
1793: if (current_space->local_remaining<nnz) {
1794: PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);
1795: nspacedouble++;
1796: }
1797: /* copy data into free space, then initialize lnk */
1798: PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);
1799: MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);
1801: current_space->array += nnz;
1802: current_space->local_used += nnz;
1803: current_space->local_remaining -= nnz;
1805: bi[i+1] = bi[i] + nnz;
1806: if (nnz > rmax) rmax = nnz;
1807: }
1808: PetscFree3(buf_ri_k,nextrow,nextci);
1810: PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);
1811: PetscFreeSpaceContiguous(&free_space,bj);
1812: afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1);
1813: if (afill_tmp > afill) afill = afill_tmp;
1814: PetscLLCondensedDestroy_Scalable(lnk);
1815: MatDestroy(&POt);
1816: MatDestroy(&PDt);
1818: /* create symbolic parallel matrix Cmpi -- why can it not be assembled in the numeric part? */
1819: /*----------------------------------------------------------------------------------*/
1820: PetscMalloc((rmax+1)*sizeof(PetscScalar),&vals);
1821: PetscMemzero(vals,rmax*sizeof(PetscScalar));
1823: MatCreate(comm,&Cmpi);
1824: MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
1825: MatSetBlockSizes(Cmpi,P->cmap->bs,A->cmap->bs);
1826: MatSetType(Cmpi,MATMPIAIJ);
1827: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1828: MatPreallocateFinalize(dnz,onz);
1829: MatSetBlockSize(Cmpi,1);
1830: for (i=0; i<pn; i++) {
1831: row = i + rstart;
1832: nnz = bi[i+1] - bi[i];
1833: Jptr = bj + bi[i];
1834: MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);
1835: }
1836: MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);
1837: MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);
1838: PetscFree(vals);
1840: merge->bi = bi;
1841: merge->bj = bj;
1842: merge->coi = coi;
1843: merge->coj = coj;
1844: merge->buf_ri = buf_ri;
1845: merge->buf_rj = buf_rj;
1846: merge->owners_co = owners_co;
1847: merge->destroy = Cmpi->ops->destroy;
1848: merge->duplicate = Cmpi->ops->duplicate;
1850: Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable;
1851: Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP;
1853: /* attach the supporting struct to Cmpi for reuse */
1854: c = (Mat_MPIAIJ*)Cmpi->data;
1856: c->ptap = ptap;
1857: ptap->api = NULL;
1858: ptap->apj = NULL;
1859: ptap->merge = merge;
1860: ptap->rmax = rmax;
1861: ptap->apa = NULL;
1863: *C = Cmpi;
1864: #if defined(PETSC_USE_INFO)
1865: if (bi[pn] != 0) {
1866: PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);
1867: PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);
1868: } else {
1869: PetscInfo(Cmpi,"Empty matrix product\n");
1870: }
1871: #endif
1872: return(0);
1873: }