Actual source code: baijov.c
petsc-3.4.2 2013-07-02
2: /*
3: Routines to compute overlapping regions of a parallel MPI matrix
4:    and to find submatrices that are shared across processors.
5: */
6: #include <../src/mat/impls/baij/mpi/mpibaij.h>
7: #include <petscbt.h>
9: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
10: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
11: extern PetscErrorCode MatGetRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
12: extern PetscErrorCode MatRestoreRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
16: PetscErrorCode MatIncreaseOverlap_MPIBAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
17: {
19: PetscInt i,N=C->cmap->N, bs=C->rmap->bs;
20: IS *is_new;
23: PetscMalloc(imax*sizeof(IS),&is_new);
24: /* Convert the indices into block format */
25: ISCompressIndicesGeneral(N,C->rmap->n,bs,imax,is,is_new);
26: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");
27: for (i=0; i<ov; ++i) {
28: MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);
29: }
30: for (i=0; i<imax; i++) {ISDestroy(&is[i]);}
31: ISExpandIndicesGeneral(N,N,bs,imax,is_new,is);
32: for (i=0; i<imax; i++) {ISDestroy(&is_new[i]);}
33: PetscFree(is_new);
34: return(0);
35: }
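/*
   Illustrative usage sketch (added comment, not part of the original source):
   the routine above is normally reached through the public MatIncreaseOverlap()
   interface, for example when an additive Schwarz preconditioner enlarges its
   subdomains.  The matrix A, the index sets, and the overlap value below are
   hypothetical.

     Mat A;        an assembled MATMPIBAIJ matrix
     IS  is[2];    index sets of matrix rows owned by this process
     MatIncreaseOverlap(A,2,is,1);  each is[i] is replaced by an IS enlarged
                                    by one level of overlap

   Internally the point indices are first compressed to block indices, the
   _Once() routine is applied ov times, and the result is expanded back to
   point indices.
*/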
37: /*
38: Sample message format:
39: If a processor A wants processor B to process some elements corresponding
40: to index sets is[1], is[5]
41: mesg [0] = 2 (no of index sets in the mesg)
42: -----------
43: mesg [1] = 1 => is[1]
44: mesg [2] = sizeof(is[1]);
45: -----------
46:    mesg [3] = 5 => is[5]
47:    mesg [4] = sizeof(is[5]);
48:    -----------
49:    mesg [5]
50: mesg [n] data(is[1])
51: -----------
52: mesg[n+1]
53: mesg[m] data(is[5])
54: -----------
56: Notes:
57: nrqs - no of requests sent (or to be sent out)
58:   nrqr - no of requests received (which have to be or which have been processed)
59: */
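/*
   Worked example (hypothetical numbers, added for clarity): a request asking the
   receiver to process 3 rows of is[1] and 2 rows of is[5] would be laid out as

     mesg = { 2,  1, 3,  5, 2,  10, 11, 12,  40, 41 }

   i.e. mesg[0] = 2 index sets, the pairs (1,3) and (5,2) identify each IS and its
   length, and the row data follows.  The header therefore occupies
   1 + 2*(no of index sets) entries, which is the formula used in the code below.
*/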
62: PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat C,PetscInt imax,IS is[])
63: {
64: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
65: const PetscInt **idx,*idx_i;
66: PetscInt *n,*w3,*w4,**data,len;
68: PetscMPIInt size,rank,tag1,tag2,*w2,*w1,nrqr;
69: PetscInt Mbs,i,j,k,**rbuf,row,proc=-1,nrqs,msz,**outdat,**ptr;
70: PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
71: PetscMPIInt *onodes1,*olengths1,*onodes2,*olengths2;
72: PetscBT *table;
73: MPI_Comm comm;
74: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
75: MPI_Status *s_status,*recv_status;
76: char *t_p;
79: PetscObjectGetComm((PetscObject)C,&comm);
80: size = c->size;
81: rank = c->rank;
82: Mbs = c->Mbs;
84: PetscObjectGetNewTag((PetscObject)C,&tag1);
85: PetscObjectGetNewTag((PetscObject)C,&tag2);
87: PetscMalloc2(imax+1,const PetscInt*,&idx,imax,PetscInt,&n);
89: for (i=0; i<imax; i++) {
90: ISGetIndices(is[i],&idx[i]);
91: ISGetLocalSize(is[i],&n[i]);
92: }
94:   /* evaluate communication - mesg to who, length of mesg, and buffer space
95: required. Based on this, buffers are allocated, and data copied into them*/
96: PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscInt,&w3,size,PetscInt,&w4);
97: PetscMemzero(w1,size*sizeof(PetscMPIInt));
98: PetscMemzero(w2,size*sizeof(PetscMPIInt));
99: PetscMemzero(w3,size*sizeof(PetscInt));
100: for (i=0; i<imax; i++) {
101: PetscMemzero(w4,size*sizeof(PetscInt)); /* initialise work vector*/
102: idx_i = idx[i];
103: len = n[i];
104: for (j=0; j<len; j++) {
105: row = idx_i[j];
106: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
107: PetscLayoutFindOwner(C->rmap,row*C->rmap->bs,&proc);
108: w4[proc]++;
109: }
110: for (j=0; j<size; j++) {
111: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
112: }
113: }
115: nrqs = 0; /* no of outgoing messages */
116:   msz      = 0;              /* total mesg length (for all procs) */
117: w1[rank] = 0; /* no mesg sent to itself */
118: w3[rank] = 0;
119: for (i=0; i<size; i++) {
120: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
121: }
122: /* pa - is list of processors to communicate with */
123: PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);
124: for (i=0,j=0; i<size; i++) {
125: if (w1[i]) {pa[j] = i; j++;}
126: }
128: /* Each message would have a header = 1 + 2*(no of IS) + data */
129: for (i=0; i<nrqs; i++) {
130: j = pa[i];
131: w1[j] += w2[j] + 2*w3[j];
132: msz += w1[j];
133: }
135:   /* Determine the number of messages to expect, their lengths, and from-ids */
136: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
137: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
139: /* Now post the Irecvs corresponding to these messages */
140: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
142: /* Allocate Memory for outgoing messages */
143: PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);
144: PetscMemzero(outdat,size*sizeof(PetscInt*));
145: PetscMemzero(ptr,size*sizeof(PetscInt*));
146: {
147: PetscInt *iptr = tmp,ict = 0;
148: for (i=0; i<nrqs; i++) {
149: j = pa[i];
150: iptr += ict;
151: outdat[j] = iptr;
152: ict = w1[j];
153: }
154: }
156: /* Form the outgoing messages */
157: /*plug in the headers*/
158: for (i=0; i<nrqs; i++) {
159: j = pa[i];
160: outdat[j][0] = 0;
161: PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));
162: ptr[j] = outdat[j] + 2*w3[j] + 1;
163: }
165: /* Memory for doing local proc's work*/
166: {
167: PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz,
168: Mbs*imax,PetscInt,&d_p, (Mbs/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);
169: PetscMemzero(table,imax*sizeof(PetscBT));
170: PetscMemzero(data,imax*sizeof(PetscInt*));
171: PetscMemzero(isz,imax*sizeof(PetscInt));
172: PetscMemzero(d_p,Mbs*imax*sizeof(PetscInt));
173: PetscMemzero(t_p,(Mbs/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));
175: for (i=0; i<imax; i++) {
176: table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
177: data[i] = d_p + (Mbs)*i;
178: }
179: }
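  /* Added note: at this point table[i] is a bit table with one bit per block row
     (Mbs bits) recording which block rows already belong to IS i, data[i] is the
     corresponding list of block-row indices, and isz[i] counts how many entries of
     data[i] are in use. */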
181: /* Parse the IS and update local tables and the outgoing buf with the data*/
182: {
183: PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
184: PetscBT table_i;
186: for (i=0; i<imax; i++) {
187: PetscMemzero(ctr,size*sizeof(PetscInt));
188: n_i = n[i];
189: table_i = table[i];
190: idx_i = idx[i];
191: data_i = data[i];
192: isz_i = isz[i];
193: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
194: row = idx_i[j];
195: PetscLayoutFindOwner(C->rmap,row*C->rmap->bs,&proc);
196: if (proc != rank) { /* copy to the outgoing buffer */
197: ctr[proc]++;
198: *ptr[proc] = row;
199: ptr[proc]++;
200: } else { /* Update the local table */
201: if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
202: }
203: }
204: /* Update the headers for the current IS */
205: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
206: if ((ctr_j = ctr[j])) {
207: outdat_j = outdat[j];
208: k = ++outdat_j[0];
209: outdat_j[2*k] = ctr_j;
210: outdat_j[2*k-1] = i;
211: }
212: }
213: isz[i] = isz_i;
214: }
215: }
217: /* Now post the sends */
218: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
219: for (i=0; i<nrqs; ++i) {
220: j = pa[i];
221: MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
222: }
224: /* No longer need the original indices*/
225: for (i=0; i<imax; ++i) {
226: ISRestoreIndices(is[i],idx+i);
227: }
228: PetscFree2(idx,n);
230: for (i=0; i<imax; ++i) {
231: ISDestroy(&is[i]);
232: }
234: /* Do Local work*/
235: MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);
237: /* Receive messages*/
238: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);
239: if (nrqr) {MPI_Waitall(nrqr,r_waits1,recv_status);}
241: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);
242: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status);}
244: /* Phase 1 sends are complete - deallocate buffers */
245: PetscFree4(outdat,ptr,tmp,ctr);
246: PetscFree4(w1,w2,w3,w4);
248: PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);
249: PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);
250: MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
251: PetscFree(rbuf[0]);
252: PetscFree(rbuf);
254: /* Send the data back*/
255: /* Do a global reduction to know the buffer space req for incoming messages*/
256: {
257: PetscMPIInt *rw1;
259: PetscMalloc(size*sizeof(PetscInt),&rw1);
260: PetscMemzero(rw1,size*sizeof(PetscInt));
262: for (i=0; i<nrqr; ++i) {
263: proc = recv_status[i].MPI_SOURCE;
264: if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
265: rw1[proc] = isz1[i];
266: }
268: PetscFree(onodes1);
269: PetscFree(olengths1);
271:     /* Determine the number of messages to expect, their lengths, and from-ids */
272: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
273: PetscFree(rw1);
274: }
275: /* Now post the Irecvs corresponding to these messages */
276: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
278: /* Now post the sends */
279: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
280: for (i=0; i<nrqr; ++i) {
281: j = recv_status[i].MPI_SOURCE;
282: MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
283: }
285: /* receive work done on other processors*/
286: {
287: PetscMPIInt idex;
288: PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
289: PetscBT table_i;
290: MPI_Status *status2;
292: PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);
293: for (i=0; i<nrqs; ++i) {
294: MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
295: /* Process the message*/
296: rbuf2_i = rbuf2[idex];
297: ct1 = 2*rbuf2_i[0]+1;
298: jmax = rbuf2[idex][0];
299: for (j=1; j<=jmax; j++) {
300: max = rbuf2_i[2*j];
301: is_no = rbuf2_i[2*j-1];
302: isz_i = isz[is_no];
303: data_i = data[is_no];
304: table_i = table[is_no];
305: for (k=0; k<max; k++,ct1++) {
306: row = rbuf2_i[ct1];
307: if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
308: }
309: isz[is_no] = isz_i;
310: }
311: }
312: if (nrqr) {MPI_Waitall(nrqr,s_waits2,status2);}
313: PetscFree(status2);
314: }
316: for (i=0; i<imax; ++i) {
317: ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);
318: }
321: PetscFree(onodes2);
322: PetscFree(olengths2);
324: PetscFree(pa);
325: PetscFree(rbuf2[0]);
326: PetscFree(rbuf2);
327: PetscFree(s_waits1);
328: PetscFree(r_waits1);
329: PetscFree(s_waits2);
330: PetscFree(r_waits2);
331: PetscFree5(table,data,isz,d_p,t_p);
332: PetscFree(s_status);
333: PetscFree(recv_status);
334: PetscFree(xdata[0]);
335: PetscFree(xdata);
336: PetscFree(isz1);
337: return(0);
338: }
342: /*
343:    MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap, to do
344: the work on the local processor.
346: Inputs:
347: C - MAT_MPIBAIJ;
348: imax - total no of index sets processed at a time;
349: table - an array of char - size = Mbs bits.
351: Output:
352: isz - array containing the count of the solution elements corresponding
353: to each index set;
354: data - pointer to the solutions
355: */
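/*
   Informal sketch (added comment): for every locally owned block row already
   recorded in data[i], the routine below visits that row in the diagonal block A
   and in the off-diagonal block B and adds the global block-column indices it
   finds to the set, using the bit table to avoid duplicates.  For example, if
   data[i] initially holds block row r and row r has nonzero block columns {r, s},
   then after the call data[i] also contains s (if it was not already present).
*/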
356: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
357: {
358: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
359: Mat A = c->A,B = c->B;
360: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
361: PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
362: PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
363: PetscBT table_i;
366: rstart = c->rstartbs;
367: cstart = c->cstartbs;
368: ai = a->i;
369: aj = a->j;
370: bi = b->i;
371: bj = b->j;
372: garray = c->garray;
375: for (i=0; i<imax; i++) {
376: data_i = data[i];
377: table_i = table[i];
378: isz_i = isz[i];
379: for (j=0,max=isz[i]; j<max; j++) {
380: row = data_i[j] - rstart;
381: start = ai[row];
382: end = ai[row+1];
383: for (k=start; k<end; k++) { /* Amat */
384: val = aj[k] + cstart;
385: if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
386: }
387: start = bi[row];
388: end = bi[row+1];
389: for (k=start; k<end; k++) { /* Bmat */
390: val = garray[bj[k]];
391: if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
392: }
393: }
394: isz[i] = isz_i;
395: }
396: return(0);
397: }
400: /*
401:      MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
402: and return the output
404: Input:
405: C - the matrix
406: nrqr - no of messages being processed.
407:        rbuf - an array of pointers to the received requests
409: Output:
410: xdata - array of messages to be sent back
411: isz1 - size of each message
413:   For better efficiency perhaps we should malloc each xdata[i] separately;
414:   then if a realloc is required we need only copy the data for that one row
415:   rather than for all previous rows, as happens now where a single large chunk of
416:   memory is used.
418: */
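/*
   Added note: the reply messages assembled in xdata[] reuse the same layout as the
   incoming requests; xdata[i][0] holds the number of index sets, followed by
   (IS number, count) pairs and then the grown row lists, so the original sender can
   parse them with the same header arithmetic used for the outgoing messages.
*/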
419: static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
420: {
421: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
422: Mat A = c->A,B = c->B;
423: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
425: PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
426: PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
427: PetscInt val,max1,max2,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
428: PetscInt *rbuf_i,kmax,rbuf_0;
429: PetscBT xtable;
432: Mbs = c->Mbs;
433: rstart = c->rstartbs;
434: cstart = c->cstartbs;
435: ai = a->i;
436: aj = a->j;
437: bi = b->i;
438: bj = b->j;
439: garray = c->garray;
442: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
443: rbuf_i = rbuf[i];
444: rbuf_0 = rbuf_i[0];
445: ct += rbuf_0;
446: for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
447: }
449: if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs;
450: else max1 = 1;
451: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
452: PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);
453: ++no_malloc;
454: PetscBTCreate(Mbs,&xtable);
455: PetscMemzero(isz1,nrqr*sizeof(PetscInt));
457: ct3 = 0;
458:   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
459: rbuf_i = rbuf[i];
460: rbuf_0 = rbuf_i[0];
461: ct1 = 2*rbuf_0+1;
462: ct2 = ct1;
463: ct3 += ct1;
464: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
465: PetscBTMemzero(Mbs,xtable);
466: oct2 = ct2;
467: kmax = rbuf_i[2*j];
468: for (k=0; k<kmax; k++,ct1++) {
469: row = rbuf_i[ct1];
470: if (!PetscBTLookupSet(xtable,row)) {
471: if (!(ct3 < mem_estimate)) {
472: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
473: PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);
474: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
475: PetscFree(xdata[0]);
476: xdata[0] = tmp;
477: mem_estimate = new_estimate; ++no_malloc;
478: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
479: }
480: xdata[i][ct2++] = row;
481: ct3++;
482: }
483: }
484: for (k=oct2,max2=ct2; k<max2; k++) {
485: row = xdata[i][k] - rstart;
486: start = ai[row];
487: end = ai[row+1];
488: for (l=start; l<end; l++) {
489: val = aj[l] + cstart;
490: if (!PetscBTLookupSet(xtable,val)) {
491: if (!(ct3 < mem_estimate)) {
492: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
493: PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);
494: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
495: PetscFree(xdata[0]);
496: xdata[0] = tmp;
497: mem_estimate = new_estimate; ++no_malloc;
498: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
499: }
500: xdata[i][ct2++] = val;
501: ct3++;
502: }
503: }
504: start = bi[row];
505: end = bi[row+1];
506: for (l=start; l<end; l++) {
507: val = garray[bj[l]];
508: if (!PetscBTLookupSet(xtable,val)) {
509: if (!(ct3 < mem_estimate)) {
510: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
511: PetscMalloc(new_estimate * sizeof(PetscInt),&tmp);
512: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
513: PetscFree(xdata[0]);
514: xdata[0] = tmp;
515: mem_estimate = new_estimate; ++no_malloc;
516: for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
517: }
518: xdata[i][ct2++] = val;
519: ct3++;
520: }
521: }
522: }
523: /* Update the header*/
524: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
525: xdata[i][2*j-1] = rbuf_i[2*j-1];
526: }
527: xdata[i][0] = rbuf_0;
528: xdata[i+1] = xdata[i] + ct2;
529: isz1[i] = ct2; /* size of each message */
530: }
531: PetscBTDestroy(&xtable);
532: PetscInfo3(C,"Allocated %D bytes, required %D, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);
533: return(0);
534: }
538: PetscErrorCode MatGetSubMatrices_MPIBAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
539: {
540: IS *isrow_new,*iscol_new;
541: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
543: PetscInt nmax,nstages_local,nstages,i,pos,max_no,ncol,nrow,N=C->cmap->N,bs=C->rmap->bs;
544: PetscBool colflag,*allcolumns,*allrows;
547: /* Currently, unsorted column indices will result in inverted column indices in the resulting submatrices. */
548: for (i = 0; i < ismax; ++i) {
549: PetscBool sorted;
550: ISSorted(iscol[i], &sorted);
551: if (!sorted) SETERRQ1(((PetscObject)iscol[i])->comm, PETSC_ERR_SUP, "Column index set %D not sorted", i);
552: }
553: /* The compression and expansion should be avoided. Doesn't point
554:      out errors, might change the indices, hence buggy */
555: PetscMalloc2(ismax+1,IS,&isrow_new,ismax+1,IS,&iscol_new);
556: ISCompressIndicesGeneral(N,C->rmap->n,bs,ismax,isrow,isrow_new);
557: ISCompressIndicesGeneral(N,C->cmap->n,bs,ismax,iscol,iscol_new);
559: /* Check for special case: each processor gets entire matrix columns */
560: PetscMalloc2(ismax+1,PetscBool,&allcolumns,ismax+1,PetscBool,&allrows);
561: for (i=0; i<ismax; i++) {
562: ISIdentity(iscol[i],&colflag);
563: ISGetLocalSize(iscol[i],&ncol);
564: if (colflag && ncol == C->cmap->N) allcolumns[i] = PETSC_TRUE;
565: else allcolumns[i] = PETSC_FALSE;
567: ISIdentity(isrow[i],&colflag);
568: ISGetLocalSize(isrow[i],&nrow);
569: if (colflag && nrow == C->rmap->N) allrows[i] = PETSC_TRUE;
570: else allrows[i] = PETSC_FALSE;
571: }
573: /* Allocate memory to hold all the submatrices */
574: if (scall != MAT_REUSE_MATRIX) {
575: PetscMalloc((ismax+1)*sizeof(Mat),submat);
576: }
577: /* Determine the number of stages through which submatrices are done */
578: nmax = 20*1000000 / (c->Nbs * sizeof(PetscInt));
579: if (!nmax) nmax = 1;
580: nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);
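  /* Hypothetical example of the staging arithmetic above (assuming 4-byte PetscInt):
     with c->Nbs = 1,000,000 block columns, nmax = 20*1000000/(1000000*4) = 5, so a
     request for ismax = 12 submatrices is processed in nstages_local = 12/5 + 1 = 3
     stages. */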
582: /* Make sure every processor loops through the nstages */
583: MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
584: for (i=0,pos=0; i<nstages; i++) {
585: if (pos+nmax <= ismax) max_no = nmax;
586: else if (pos == ismax) max_no = 0;
587: else max_no = ismax-pos;
588: MatGetSubMatrices_MPIBAIJ_local(C,max_no,isrow_new+pos,iscol_new+pos,scall,allrows+pos,allcolumns+pos,*submat+pos);
589: pos += max_no;
590: }
592: for (i=0; i<ismax; i++) {
593: ISDestroy(&isrow_new[i]);
594: ISDestroy(&iscol_new[i]);
595: }
596: PetscFree2(isrow_new,iscol_new);
597: PetscFree2(allcolumns,allrows);
598: return(0);
599: }
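/*
   Illustrative usage sketch (added comment, not part of the original source): the
   routine above is reached through the public MatGetSubMatrices() interface.  The
   names below are hypothetical; note that each iscol[i] must be sorted, as checked
   above.

     Mat *submat;
     MatGetSubMatrices(C,nis,isrow,iscol,MAT_INITIAL_MATRIX,&submat);
     ...
     MatGetSubMatrices(C,nis,isrow,iscol,MAT_REUSE_MATRIX,&submat);   refill
     MatDestroyMatrices(nis,&submat);

   Each process receives one sequential matrix per (isrow[i],iscol[i]) pair.
*/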
601: #if defined(PETSC_USE_CTABLE)
604: PetscErrorCode PetscGetProc(const PetscInt row, const PetscMPIInt size, const PetscInt proc_gnode[], PetscMPIInt *rank)
605: {
606: PetscInt nGlobalNd = proc_gnode[size];
607: PetscMPIInt fproc;
611: PetscMPIIntCast((PetscInt)(((float)row * (float)size / (float)nGlobalNd + 0.5)),&fproc);
612: if (fproc > size) fproc = size;
613: while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) {
614: if (row < proc_gnode[fproc]) fproc--;
615: else fproc++;
616: }
617: *rank = fproc;
618: return(0);
619: }
620: #endif
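/* Hypothetical example of the search above: with size = 4 and
   proc_gnode = {0,10,20,30,40}, row = 23 gives the initial guess
   fproc = (int)(23*4/40.0 + 0.5) = 2; since proc_gnode[2] <= 23 < proc_gnode[3],
   the guess is accepted and *rank = 2. */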
622: /* -------------------------------------------------------------------------*/
623: /* This code is used for BAIJ and SBAIJ matrices (unfortunate dependency) */
626: PetscErrorCode MatGetSubMatrices_MPIBAIJ_local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allrows,PetscBool *allcolumns,Mat *submats)
627: {
628: Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
629: Mat A = c->A;
630: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*mat;
631: const PetscInt **irow,**icol,*irow_i;
632: PetscInt *nrow,*ncol,*w3,*w4,start;
634: PetscMPIInt size,tag0,tag1,tag2,tag3,*w1,*w2,nrqr,idex,end,proc;
635: PetscInt **sbuf1,**sbuf2,rank,i,j,k,l,ct1,ct2,**rbuf1,row;
636: PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
637: PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
638: PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
639: PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
640: PetscInt bs =C->rmap->bs,bs2=c->bs2,*a_j=a->j,*b_j=b->j,*cworkA,*cworkB;
641: PetscInt cstart = c->cstartbs,nzA,nzB,*a_i=a->i,*b_i=b->i,imark;
642: PetscInt *bmap = c->garray,ctmp,rstart=c->rstartbs;
643: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3,*s_waits3;
644: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2,*r_status3;
645: MPI_Comm comm;
646: PetscBool flag;
647: PetscMPIInt *onodes1,*olengths1;
648: PetscBool ijonly=c->ijonly; /* private flag indicates only matrix data structures are requested */
650: /* variables below are used for the matrix numerical values - case of !ijonly */
651: MPI_Request *r_waits4,*s_waits4;
652: MPI_Status *r_status4,*s_status4;
653: MatScalar **rbuf4,**sbuf_aa,*vals,*mat_a = NULL,*sbuf_aa_i,*vworkA = NULL,*vworkB = NULL;
654: MatScalar *a_a=a->a,*b_a=b->a;
656: #if defined(PETSC_USE_CTABLE)
657: PetscInt tt;
658: PetscTable *rmap,*cmap,rmap_i,cmap_i=NULL;
659: #else
660: PetscInt **cmap,*cmap_i=NULL,*rtable,*rmap_i,**rmap, Mbs = c->Mbs;
661: #endif
664: PetscObjectGetComm((PetscObject)C,&comm);
665: tag0 = ((PetscObject)C)->tag;
666: size = c->size;
667: rank = c->rank;
669: /* Get some new tags to keep the communication clean */
670: PetscObjectGetNewTag((PetscObject)C,&tag1);
671: PetscObjectGetNewTag((PetscObject)C,&tag2);
672: PetscObjectGetNewTag((PetscObject)C,&tag3);
674: #if defined(PETSC_USE_CTABLE)
675: PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);
676: #else
677: PetscMalloc5(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol,Mbs+1,PetscInt,&rtable);
678: /* Create hash table for the mapping :row -> proc*/
679: for (i=0,j=0; i<size; i++) {
680: jmax = C->rmap->range[i+1]/bs;
681: for (; j<jmax; j++) rtable[j] = i;
682: }
683: #endif
685: for (i=0; i<ismax; i++) {
686: if (allrows[i]) {
687: irow[i] = NULL;
688: nrow[i] = C->rmap->N/bs;
689: } else {
690: ISGetIndices(isrow[i],&irow[i]);
691: ISGetLocalSize(isrow[i],&nrow[i]);
692: }
694: if (allcolumns[i]) {
695: icol[i] = NULL;
696: ncol[i] = C->cmap->N/bs;
697: } else {
698: ISGetIndices(iscol[i],&icol[i]);
699: ISGetLocalSize(iscol[i],&ncol[i]);
700: }
701: }
703:   /* evaluate communication - mesg to who, length of mesg, and buffer space
704: required. Based on this, buffers are allocated, and data copied into them*/
705: PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscInt,&w3,size,PetscInt,&w4);
706: PetscMemzero(w1,size*sizeof(PetscMPIInt));
707: PetscMemzero(w2,size*sizeof(PetscMPIInt));
708: PetscMemzero(w3,size*sizeof(PetscInt));
709: for (i=0; i<ismax; i++) {
710: PetscMemzero(w4,size*sizeof(PetscInt)); /* initialise work vector*/
711: jmax = nrow[i];
712: irow_i = irow[i];
713: for (j=0; j<jmax; j++) {
714: if (allrows[i]) row = j;
715: else row = irow_i[j];
717: #if defined(PETSC_USE_CTABLE)
718: PetscGetProc(row,size,c->rangebs,&proc);
719: #else
720: proc = rtable[row];
721: #endif
722: w4[proc]++;
723: }
724: for (j=0; j<size; j++) {
725: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
726: }
727: }
729: nrqs = 0; /* no of outgoing messages */
730:   msz      = 0; /* total mesg length for all procs */
731:   w1[rank] = 0; /* no mesg sent to itself */
732: w3[rank] = 0;
733: for (i=0; i<size; i++) {
734: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
735: }
736: PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa); /*(proc -array)*/
737: for (i=0,j=0; i<size; i++) {
738: if (w1[i]) { pa[j] = i; j++; }
739: }
741: /* Each message would have a header = 1 + 2*(no of IS) + data */
742: for (i=0; i<nrqs; i++) {
743: j = pa[i];
744: w1[j] += w2[j] + 2* w3[j];
745: msz += w1[j];
746: }
748:   /* Determine the number of messages to expect, their lengths, and from-ids */
749: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
750: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
752: /* Now post the Irecvs corresponding to these messages */
753: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
755: PetscFree(onodes1);
756: PetscFree(olengths1);
758: /* Allocate Memory for outgoing messages */
759: PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);
760: PetscMemzero(sbuf1,size*sizeof(PetscInt*));
761: PetscMemzero(ptr,size*sizeof(PetscInt*));
762: {
763: PetscInt *iptr = tmp,ict = 0;
764: for (i=0; i<nrqs; i++) {
765: j = pa[i];
766: iptr += ict;
767: sbuf1[j] = iptr;
768: ict = w1[j];
769: }
770: }
772: /* Form the outgoing messages */
773: /* Initialise the header space */
774: for (i=0; i<nrqs; i++) {
775: j = pa[i];
776: sbuf1[j][0] = 0;
777: PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));
778: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
779: }
781: /* Parse the isrow and copy data into outbuf */
782: for (i=0; i<ismax; i++) {
783: PetscMemzero(ctr,size*sizeof(PetscInt));
784: irow_i = irow[i];
785: jmax = nrow[i];
786: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
787: if (allrows[i]) row = j;
788: else row = irow_i[j];
790: #if defined(PETSC_USE_CTABLE)
791: PetscGetProc(row,size,c->rangebs,&proc);
792: #else
793: proc = rtable[row];
794: #endif
795: if (proc != rank) { /* copy to the outgoing buf*/
796: ctr[proc]++;
797: *ptr[proc] = row;
798: ptr[proc]++;
799: }
800: }
801: /* Update the headers for the current IS */
802: for (j=0; j<size; j++) { /* Can Optimise this loop too */
803: if ((ctr_j = ctr[j])) {
804: sbuf1_j = sbuf1[j];
805: k = ++sbuf1_j[0];
806: sbuf1_j[2*k] = ctr_j;
807: sbuf1_j[2*k-1] = i;
808: }
809: }
810: }
812: /* Now post the sends */
813: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
814: for (i=0; i<nrqs; ++i) {
815: j = pa[i];
816: MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
817: }
819:   /* Post Receives to capture the buffer size */
820: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);
821: PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);
822: rbuf2[0] = tmp + msz;
823: for (i=1; i<nrqs; ++i) {
824: j = pa[i];
825: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
826: }
827: for (i=0; i<nrqs; ++i) {
828: j = pa[i];
829: MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);
830: }
832: /* Send to other procs the buf size they should allocate */
834: /* Receive messages*/
835: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
836: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);
837: PetscMalloc3(nrqr+1,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);
838: {
839: Mat_SeqBAIJ *sA = (Mat_SeqBAIJ*)c->A->data,*sB = (Mat_SeqBAIJ*)c->B->data;
840: PetscInt *sAi = sA->i,*sBi = sB->i,id,*sbuf2_i;
842: for (i=0; i<nrqr; ++i) {
843: MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);
845: req_size[idex] = 0;
846: rbuf1_i = rbuf1[idex];
847: start = 2*rbuf1_i[0] + 1;
848: MPI_Get_count(r_status1+i,MPIU_INT,&end);
849: PetscMalloc(end*sizeof(PetscInt),&sbuf2[idex]);
850: sbuf2_i = sbuf2[idex];
851: for (j=start; j<end; j++) {
852: id = rbuf1_i[j] - rstart;
853: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
854: sbuf2_i[j] = ncols;
855: req_size[idex] += ncols;
856: }
857: req_source[idex] = r_status1[i].MPI_SOURCE;
858: /* form the header */
859: sbuf2_i[0] = req_size[idex];
860: for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
861: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);
862: }
863: }
864: PetscFree(r_status1);
865: PetscFree(r_waits1);
867: /* recv buffer sizes */
868: /* Receive messages*/
869: PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);
870: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);
871: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);
872: if (!ijonly) {
873: PetscMalloc((nrqs+1)*sizeof(MatScalar*),&rbuf4);
874: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);
875: }
877: for (i=0; i<nrqs; ++i) {
878: MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);
879: PetscMalloc(rbuf2[idex][0]*sizeof(PetscInt),&rbuf3[idex]);
880: MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);
881: if (!ijonly) {
882: PetscMalloc(rbuf2[idex][0]*bs2*sizeof(MatScalar),&rbuf4[idex]);
883: MPI_Irecv(rbuf4[idex],rbuf2[idex][0]*bs2,MPIU_MATSCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);
884: }
885: }
886: PetscFree(r_status2);
887: PetscFree(r_waits2);
889: /* Wait on sends1 and sends2 */
890: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);
891: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);
893: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status1);}
894: if (nrqr) {MPI_Waitall(nrqr,s_waits2,s_status2);}
895: PetscFree(s_status1);
896: PetscFree(s_status2);
897: PetscFree(s_waits1);
898: PetscFree(s_waits2);
900: /* Now allocate buffers for a->j, and send them off */
901: PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);
902: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
903: PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);
904: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
906: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);
907: {
908: for (i=0; i<nrqr; i++) {
909: rbuf1_i = rbuf1[i];
910: sbuf_aj_i = sbuf_aj[i];
911: ct1 = 2*rbuf1_i[0] + 1;
912: ct2 = 0;
913: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
914: kmax = rbuf1[i][2*j];
915: for (k=0; k<kmax; k++,ct1++) {
916: row = rbuf1_i[ct1] - rstart;
917: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
918: ncols = nzA + nzB;
919: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
921: /* load the column indices for this row into cols*/
922: cols = sbuf_aj_i + ct2;
923: for (l=0; l<nzB; l++) {
924: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
925: else break;
926: }
927: imark = l;
928: for (l=0; l<nzA; l++) cols[imark+l] = cstart + cworkA[l];
929: for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
930: ct2 += ncols;
931: }
932: }
933: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);
934: }
935: }
936: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);
937: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);
939: /* Allocate buffers for a->a, and send them off */
940: if (!ijonly) {
941: PetscMalloc((nrqr+1)*sizeof(MatScalar*),&sbuf_aa);
942: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
943: PetscMalloc((j+1)*bs2*sizeof(MatScalar),&sbuf_aa[0]);
944: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
946: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);
947: {
948: for (i=0; i<nrqr; i++) {
949: rbuf1_i = rbuf1[i];
950: sbuf_aa_i = sbuf_aa[i];
951: ct1 = 2*rbuf1_i[0]+1;
952: ct2 = 0;
953: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
954: kmax = rbuf1_i[2*j];
955: for (k=0; k<kmax; k++,ct1++) {
956: row = rbuf1_i[ct1] - rstart;
957: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
958: ncols = nzA + nzB;
959: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
960: vworkA = a_a + a_i[row]*bs2; vworkB = b_a + b_i[row]*bs2;
962: /* load the column values for this row into vals*/
963: vals = sbuf_aa_i+ct2*bs2;
964: for (l=0; l<nzB; l++) {
965: if ((bmap[cworkB[l]]) < cstart) {
966: PetscMemcpy(vals+l*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
967: } else break;
968: }
969: imark = l;
970: for (l=0; l<nzA; l++) {
971: PetscMemcpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2*sizeof(MatScalar));
972: }
973: for (l=imark; l<nzB; l++) {
974: PetscMemcpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2*sizeof(MatScalar));
975: }
976: ct2 += ncols;
977: }
978: }
979: MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_MATSCALAR,req_source[i],tag3,comm,s_waits4+i);
980: }
981: }
982: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);
983: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);
984: }
985: PetscFree(rbuf1[0]);
986: PetscFree(rbuf1);
988: /* Form the matrix */
989: /* create col map: global col of C -> local col of submatrices */
990: {
991: const PetscInt *icol_i;
992: #if defined(PETSC_USE_CTABLE)
993: PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);
994: for (i=0; i<ismax; i++) {
995: if (!allcolumns[i]) {
996: PetscTableCreate(ncol[i]+1,c->Nbs+1,&cmap[i]);
997: jmax = ncol[i];
998: icol_i = icol[i];
999: cmap_i = cmap[i];
1000: for (j=0; j<jmax; j++) {
1001: PetscTableAdd(cmap_i,icol_i[j]+1,j+1,INSERT_VALUES);
1002: }
1003: } else {
1004: cmap[i] = NULL;
1005: }
1006: }
1007: #else
1008: PetscMalloc(ismax*sizeof(PetscInt*),&cmap);
1009: for (i=0; i<ismax; i++) {
1010: if (!allcolumns[i]) {
1011: PetscMalloc(c->Nbs*sizeof(PetscInt),&cmap[i]);
1012: PetscMemzero(cmap[i],c->Nbs*sizeof(PetscInt));
1013: jmax = ncol[i];
1014: icol_i = icol[i];
1015: cmap_i = cmap[i];
1016: for (j=0; j<jmax; j++) {
1017: cmap_i[icol_i[j]] = j+1;
1018: }
1019: } else { /* allcolumns[i] */
1020: cmap[i] = NULL;
1021: }
1022: }
1023: #endif
1024: }
1026: /* Create lens which is required for MatCreate... */
1027: for (i=0,j=0; i<ismax; i++) j += nrow[i];
1028: PetscMalloc((1+ismax)*sizeof(PetscInt*)+ j*sizeof(PetscInt),&lens);
1029: lens[0] = (PetscInt*)(lens + ismax);
1030: PetscMemzero(lens[0],j*sizeof(PetscInt));
1031: for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
1033: /* Update lens from local data */
1034: for (i=0; i<ismax; i++) {
1035: jmax = nrow[i];
1036: if (!allcolumns[i]) cmap_i = cmap[i];
1037: irow_i = irow[i];
1038: lens_i = lens[i];
1039: for (j=0; j<jmax; j++) {
1040: if (allrows[i]) row = j;
1041: else row = irow_i[j];
1043: #if defined(PETSC_USE_CTABLE)
1044: PetscGetProc(row,size,c->rangebs,&proc);
1045: #else
1046: proc = rtable[row];
1047: #endif
1048: if (proc == rank) {
1049: /* Get indices from matA and then from matB */
1050: row = row - rstart;
1051: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1052: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1053: if (!allcolumns[i]) {
1054: #if defined(PETSC_USE_CTABLE)
1055: for (k=0; k<nzA; k++) {
1056: PetscTableFind(cmap_i,cstart+cworkA[k]+1,&tt);
1057: if (tt) lens_i[j]++;
1058: }
1059: for (k=0; k<nzB; k++) {
1060: PetscTableFind(cmap_i,bmap[cworkB[k]]+1,&tt);
1061: if (tt) lens_i[j]++;
1062: }
1064: #else
1065: for (k=0; k<nzA; k++) {
1066: if (cmap_i[cstart + cworkA[k]]) lens_i[j]++;
1067: }
1068: for (k=0; k<nzB; k++) {
1069: if (cmap_i[bmap[cworkB[k]]]) lens_i[j]++;
1070: }
1071: #endif
1072: } else { /* allcolumns */
1073: lens_i[j] = nzA + nzB;
1074: }
1075: }
1076: }
1077: }
1078: #if defined(PETSC_USE_CTABLE)
1079: /* Create row map*/
1080: PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);
1081: for (i=0; i<ismax; i++) {
1082: PetscTableCreate(nrow[i]+1,c->Mbs+1,&rmap[i]);
1083: }
1084: #else
1085: /* Create row map*/
1086: PetscMalloc((1+ismax)*sizeof(PetscInt*)+ ismax*Mbs*sizeof(PetscInt),&rmap);
1087: rmap[0] = (PetscInt*)(rmap + ismax);
1088: PetscMemzero(rmap[0],ismax*Mbs*sizeof(PetscInt));
1089: for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + Mbs;
1090: #endif
1091: for (i=0; i<ismax; i++) {
1092: irow_i = irow[i];
1093: jmax = nrow[i];
1094: #if defined(PETSC_USE_CTABLE)
1095: rmap_i = rmap[i];
1096: for (j=0; j<jmax; j++) {
1097: if (allrows[i]) {
1098: PetscTableAdd(rmap_i,j+1,j+1,INSERT_VALUES);
1099: } else {
1100: PetscTableAdd(rmap_i,irow_i[j]+1,j+1,INSERT_VALUES);
1101: }
1102: }
1103: #else
1104: rmap_i = rmap[i];
1105: for (j=0; j<jmax; j++) {
1106: if (allrows[i]) rmap_i[j] = j;
1107: else rmap_i[irow_i[j]] = j;
1108: }
1109: #endif
1110: }
1112: /* Update lens from offproc data */
1113: {
1114: PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1115: PetscMPIInt ii;
1117: for (tmp2=0; tmp2<nrqs; tmp2++) {
1118: MPI_Waitany(nrqs,r_waits3,&ii,r_status3+tmp2);
1119: idex = pa[ii];
1120: sbuf1_i = sbuf1[idex];
1121: jmax = sbuf1_i[0];
1122: ct1 = 2*jmax+1;
1123: ct2 = 0;
1124: rbuf2_i = rbuf2[ii];
1125: rbuf3_i = rbuf3[ii];
1126: for (j=1; j<=jmax; j++) {
1127: is_no = sbuf1_i[2*j-1];
1128: max1 = sbuf1_i[2*j];
1129: lens_i = lens[is_no];
1130: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1131: rmap_i = rmap[is_no];
1132: for (k=0; k<max1; k++,ct1++) {
1133: #if defined(PETSC_USE_CTABLE)
1134: PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);
1135: row--;
1136: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1137: #else
1138: row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1139: #endif
1140: max2 = rbuf2_i[ct1];
1141: for (l=0; l<max2; l++,ct2++) {
1142: if (!allcolumns[is_no]) {
1143: #if defined(PETSC_USE_CTABLE)
1144: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tt);
1145: if (tt) lens_i[row]++;
1146: #else
1147: if (cmap_i[rbuf3_i[ct2]]) lens_i[row]++;
1148: #endif
1149: } else { /* allcolumns */
1150: lens_i[row]++;
1151: }
1152: }
1153: }
1154: }
1155: }
1156: }
1157: PetscFree(r_status3);
1158: PetscFree(r_waits3);
1159: if (nrqr) {MPI_Waitall(nrqr,s_waits3,s_status3);}
1160: PetscFree(s_status3);
1161: PetscFree(s_waits3);
1163: /* Create the submatrices */
1164: if (scall == MAT_REUSE_MATRIX) {
1165: if (ijonly) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP," MAT_REUSE_MATRIX and ijonly is not supported yet");
1166: /*
1167: Assumes new rows are same length as the old rows, hence bug!
1168: */
1169: for (i=0; i<ismax; i++) {
1170: mat = (Mat_SeqBAIJ*)(submats[i]->data);
1171: if ((mat->mbs != nrow[i]) || (mat->nbs != ncol[i] || C->rmap->bs != bs)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1172: PetscMemcmp(mat->ilen,lens[i],mat->mbs *sizeof(PetscInt),&flag);
1173: if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Cannot reuse matrix. wrong no of nonzeros");
1174:       /* Initialize matrix as if empty */
1175: PetscMemzero(mat->ilen,mat->mbs*sizeof(PetscInt));
1177: submats[i]->factortype = C->factortype;
1178: }
1179: } else {
1180: PetscInt bs_tmp;
1181: if (ijonly) bs_tmp = 1;
1182: else bs_tmp = bs;
1183: for (i=0; i<ismax; i++) {
1184: MatCreate(PETSC_COMM_SELF,submats+i);
1185: MatSetSizes(submats[i],nrow[i]*bs_tmp,ncol[i]*bs_tmp,nrow[i]*bs_tmp,ncol[i]*bs_tmp);
1186: MatSetType(submats[i],((PetscObject)A)->type_name);
1187: MatSeqBAIJSetPreallocation(submats[i],bs_tmp,0,lens[i]);
1188: MatSeqSBAIJSetPreallocation(submats[i],bs_tmp,0,lens[i]); /* this subroutine is used by SBAIJ routines */
1189: }
1190: }
1192: /* Assemble the matrices */
1193: /* First assemble the local rows */
1194: {
1195: PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i;
1196: MatScalar *imat_a = NULL;
1198: for (i=0; i<ismax; i++) {
1199: mat = (Mat_SeqBAIJ*)submats[i]->data;
1200: imat_ilen = mat->ilen;
1201: imat_j = mat->j;
1202: imat_i = mat->i;
1203: if (!ijonly) imat_a = mat->a;
1204: if (!allcolumns[i]) cmap_i = cmap[i];
1205: rmap_i = rmap[i];
1206: irow_i = irow[i];
1207: jmax = nrow[i];
1208: for (j=0; j<jmax; j++) {
1209: if (allrows[i]) row = j;
1210: else row = irow_i[j];
1212: #if defined(PETSC_USE_CTABLE)
1213: PetscGetProc(row,size,c->rangebs,&proc);
1214: #else
1215: proc = rtable[row];
1216: #endif
1217: if (proc == rank) {
1218: row = row - rstart;
1219: nzA = a_i[row+1] - a_i[row];
1220: nzB = b_i[row+1] - b_i[row];
1221: cworkA = a_j + a_i[row];
1222: cworkB = b_j + b_i[row];
1223: if (!ijonly) {
1224: vworkA = a_a + a_i[row]*bs2;
1225: vworkB = b_a + b_i[row]*bs2;
1226: }
1227: #if defined(PETSC_USE_CTABLE)
1228: PetscTableFind(rmap_i,row+rstart+1,&row);
1229: row--;
1230: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1231: #else
1232: row = rmap_i[row + rstart];
1233: #endif
1234: mat_i = imat_i[row];
1235: if (!ijonly) mat_a = imat_a + mat_i*bs2;
1236: mat_j = imat_j + mat_i;
1237: ilen_row = imat_ilen[row];
1239: /* load the column indices for this row into cols*/
1240: if (!allcolumns[i]) {
1241: for (l=0; l<nzB; l++) {
1242: if ((ctmp = bmap[cworkB[l]]) < cstart) {
1243: #if defined(PETSC_USE_CTABLE)
1244: PetscTableFind(cmap_i,ctmp+1,&tcol);
1245: if (tcol) {
1246: #else
1247: if ((tcol = cmap_i[ctmp])) {
1248: #endif
1249: *mat_j++ = tcol - 1;
1250: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1251: mat_a += bs2;
1252: ilen_row++;
1253: }
1254: } else break;
1255: }
1256: imark = l;
1257: for (l=0; l<nzA; l++) {
1258: #if defined(PETSC_USE_CTABLE)
1259: PetscTableFind(cmap_i,cstart+cworkA[l]+1,&tcol);
1260: if (tcol) {
1261: #else
1262: if ((tcol = cmap_i[cstart + cworkA[l]])) {
1263: #endif
1264: *mat_j++ = tcol - 1;
1265: if (!ijonly) {
1266: PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));
1267: mat_a += bs2;
1268: }
1269: ilen_row++;
1270: }
1271: }
1272: for (l=imark; l<nzB; l++) {
1273: #if defined(PETSC_USE_CTABLE)
1274: PetscTableFind(cmap_i,bmap[cworkB[l]]+1,&tcol);
1275: if (tcol) {
1276: #else
1277: if ((tcol = cmap_i[bmap[cworkB[l]]])) {
1278: #endif
1279: *mat_j++ = tcol - 1;
1280: if (!ijonly) {
1281: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1282: mat_a += bs2;
1283: }
1284: ilen_row++;
1285: }
1286: }
1287: } else { /* allcolumns */
1288: for (l=0; l<nzB; l++) {
1289: if ((ctmp = bmap[cworkB[l]]) < cstart) {
1290: *mat_j++ = ctmp;
1291: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1292: mat_a += bs2;
1293: ilen_row++;
1294: } else break;
1295: }
1296: imark = l;
1297: for (l=0; l<nzA; l++) {
1298: *mat_j++ = cstart+cworkA[l];
1299: if (!ijonly) {
1300: PetscMemcpy(mat_a,vworkA+l*bs2,bs2*sizeof(MatScalar));
1301: mat_a += bs2;
1302: }
1303: ilen_row++;
1304: }
1305: for (l=imark; l<nzB; l++) {
1306: *mat_j++ = bmap[cworkB[l]];
1307: if (!ijonly) {
1308: PetscMemcpy(mat_a,vworkB+l*bs2,bs2*sizeof(MatScalar));
1309: mat_a += bs2;
1310: }
1311: ilen_row++;
1312: }
1313: }
1314: imat_ilen[row] = ilen_row;
1315: }
1316: }
1317: }
1318: }
1320: /* Now assemble the off proc rows*/
1321: {
1322: PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1323: PetscInt *imat_j,*imat_i;
1324: MatScalar *imat_a = NULL,*rbuf4_i = NULL;
1325: PetscMPIInt ii;
1327: for (tmp2=0; tmp2<nrqs; tmp2++) {
1328: if (ijonly) ii = tmp2;
1329: else {
1330: MPI_Waitany(nrqs,r_waits4,&ii,r_status4+tmp2);
1331: }
1332: idex = pa[ii];
1333: sbuf1_i = sbuf1[idex];
1334: jmax = sbuf1_i[0];
1335: ct1 = 2*jmax + 1;
1336: ct2 = 0;
1337: rbuf2_i = rbuf2[ii];
1338: rbuf3_i = rbuf3[ii];
1339: if (!ijonly) rbuf4_i = rbuf4[ii];
1340: for (j=1; j<=jmax; j++) {
1341: is_no = sbuf1_i[2*j-1];
1342: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1343: rmap_i = rmap[is_no];
1344: mat = (Mat_SeqBAIJ*)submats[is_no]->data;
1345: imat_ilen = mat->ilen;
1346: imat_j = mat->j;
1347: imat_i = mat->i;
1348: if (!ijonly) imat_a = mat->a;
1349: max1 = sbuf1_i[2*j];
1350: for (k=0; k<max1; k++,ct1++) {
1351: row = sbuf1_i[ct1];
1352: #if defined(PETSC_USE_CTABLE)
1353: PetscTableFind(rmap_i,row+1,&row);
1354: row--;
1355: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1356: #else
1357: row = rmap_i[row];
1358: #endif
1359: ilen = imat_ilen[row];
1360: mat_i = imat_i[row];
1361: if (!ijonly) mat_a = imat_a + mat_i*bs2;
1362: mat_j = imat_j + mat_i;
1363: max2 = rbuf2_i[ct1];
1365: if (!allcolumns[is_no]) {
1366: for (l=0; l<max2; l++,ct2++) {
1367: #if defined(PETSC_USE_CTABLE)
1368: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
1369: if (tcol) {
1370: #else
1371: if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1372: #endif
1373: *mat_j++ = tcol - 1;
1374: if (!ijonly) {
1375: PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));
1376: mat_a += bs2;
1377: }
1378: ilen++;
1379: }
1380: }
1381: } else { /* allcolumns */
1382: for (l=0; l<max2; l++,ct2++) {
1383: *mat_j++ = rbuf3_i[ct2];
1384: if (!ijonly) {
1385: PetscMemcpy(mat_a,rbuf4_i+ct2*bs2,bs2*sizeof(MatScalar));
1386: mat_a += bs2;
1387: }
1388: ilen++;
1389: }
1390: }
1391: imat_ilen[row] = ilen;
1392: }
1393: }
1394: }
1395: }
1396: if (!ijonly) {
1397: PetscFree(r_status4);
1398: PetscFree(r_waits4);
1399: if (nrqr) {MPI_Waitall(nrqr,s_waits4,s_status4);}
1400: PetscFree(s_waits4);
1401: PetscFree(s_status4);
1402: }
1404: /* Restore the indices */
1405: for (i=0; i<ismax; i++) {
1406: if (!allrows[i]) {
1407: ISRestoreIndices(isrow[i],irow+i);
1408: }
1409: if (!allcolumns[i]) {
1410: ISRestoreIndices(iscol[i],icol+i);
1411: }
1412: }
1414: /* Destroy allocated memory */
1415: #if defined(PETSC_USE_CTABLE)
1416: PetscFree4(irow,icol,nrow,ncol);
1417: #else
1418: PetscFree5(irow,icol,nrow,ncol,rtable);
1419: #endif
1420: PetscFree4(w1,w2,w3,w4);
1421: PetscFree(pa);
1423: PetscFree4(sbuf1,ptr,tmp,ctr);
1424: PetscFree(sbuf1);
1425: PetscFree(rbuf2);
1426: for (i=0; i<nrqr; ++i) {
1427: PetscFree(sbuf2[i]);
1428: }
1429: for (i=0; i<nrqs; ++i) {
1430: PetscFree(rbuf3[i]);
1431: }
1432: PetscFree3(sbuf2,req_size,req_source);
1433: PetscFree(rbuf3);
1434: PetscFree(sbuf_aj[0]);
1435: PetscFree(sbuf_aj);
1436: if (!ijonly) {
1437: for (i=0; i<nrqs; ++i) {PetscFree(rbuf4[i]);}
1438: PetscFree(rbuf4);
1439: PetscFree(sbuf_aa[0]);
1440: PetscFree(sbuf_aa);
1441: }
1443: #if defined(PETSC_USE_CTABLE)
1444: for (i=0; i<ismax; i++) {
1445: PetscTableDestroy((PetscTable*)&rmap[i]);
1446: }
1447: #endif
1448: PetscFree(rmap);
1450: for (i=0; i<ismax; i++) {
1451: if (!allcolumns[i]) {
1452: #if defined(PETSC_USE_CTABLE)
1453: PetscTableDestroy((PetscTable*)&cmap[i]);
1454: #else
1455: PetscFree(cmap[i]);
1456: #endif
1457: }
1458: }
1459: PetscFree(cmap);
1460: PetscFree(lens);
1462: for (i=0; i<ismax; i++) {
1463: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1464: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1465: }
1467: c->ijonly = PETSC_FALSE; /* set back to the default */
1468: return(0);
1469: }