Actual source code: mpiov.c
1: #define PETSCMAT_DLL
3: /*
4: Routines to compute overlapping regions of a parallel MPI matrix
5: and to find submatrices that are shared across processors.
6: */
7: #include "src/mat/impls/aij/mpi/mpiaij.h"
8: #include "petscbt.h"
10: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
13: EXTERN PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14: EXTERN PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
18: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
19: {
21: PetscInt i;
24: if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
25: for (i=0; i<ov; ++i) {
26: MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);
27: }
28: return(0);
29: }
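A minimal usage sketch of the public entry point that ends up in the routine above: ExampleIncreaseOverlap() is a hypothetical helper, not part of this file, and it assumes the calling sequences of the PETSc version this source belongs to (e.g. ISDestroy() taking an IS).

#include "petscmat.h"

PetscErrorCode ExampleIncreaseOverlap(Mat A)
{
  IS             is;
  PetscInt       rstart,rend;
  PetscErrorCode ierr;

  ierr = MatGetOwnershipRange(A,&rstart,&rend);CHKERRQ(ierr);
  /* one index set per process containing its locally owned rows */
  ierr = ISCreateStride(PETSC_COMM_SELF,rend-rstart,rstart,1,&is);CHKERRQ(ierr);
  /* ov = 2 applies MatIncreaseOverlap_MPIAIJ_Once() twice; is is replaced in place */
  ierr = MatIncreaseOverlap(A,1,&is,2);CHKERRQ(ierr);
  ierr = ISDestroy(is);CHKERRQ(ierr);
  return(0);
}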
31: /*
32: Sample message format:
33: If a processor A wants processor B to process some elements corresponding
34: to index sets is[1],is[5]
35: mesg [0] = 2 (no of index sets in the mesg)
36: -----------
37: mesg [1] = 1 => is[1]
38: mesg [2] = sizeof(is[1]);
39: -----------
40: mesg [3] = 5 => is[5]
41: mesg [4] = sizeof(is[5]);
42: -----------
43: mesg [5]
44: mesg [n] data(is[1])
45: -----------
46: mesg[n+1]
47: mesg[m] data(is[5])
48: -----------
49:
50: Notes:
51: nrqs - no of requests sent (or to be sent out)
52: nrqr - no of requests received (which have to be or which have been processed)
53: */
56: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
57: {
58: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
59: PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
60: PetscInt **idx,*n,*rtable,**data,len,*idx_i;
62: PetscMPIInt size,rank,tag1,tag2;
63: PetscInt m,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
64: PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
65: PetscBT *table;
66: MPI_Comm comm;
67: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
68: MPI_Status *s_status,*recv_status;
71: comm = C->comm;
72: size = c->size;
73: rank = c->rank;
74: m = C->rmap.N;
76: PetscObjectGetNewTag((PetscObject)C,&tag1);
77: PetscObjectGetNewTag((PetscObject)C,&tag2);
78:
79: len = (imax+1)*sizeof(PetscInt*)+ (imax + m)*sizeof(PetscInt);
80: PetscMalloc(len,&idx);
81: n = (PetscInt*)(idx + imax);
82: rtable = n + imax;
83:
84: for (i=0; i<imax; i++) {
85: ISGetIndices(is[i],&idx[i]);
86: ISGetLocalSize(is[i],&n[i]);
87: }
88:
89: /* Create hash table for the mapping :row -> proc*/
90: for (i=0,j=0; i<size; i++) {
91: len = C->rmap.range[i+1];
92: for (; j<len; j++) {
93: rtable[j] = i;
94: }
95: }
97: /* evaluate communication - mesg to who, length of mesg, and buffer space
98: required. Based on this, buffers are allocated, and data copied into them*/
99: PetscMalloc(size*4*sizeof(PetscMPIInt),&w1);/* mesg size */
100: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
101: w3 = w2 + size; /* no of IS that needs to be sent to proc i */
102: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
103: PetscMemzero(w1,size*3*sizeof(PetscMPIInt)); /* initialise work vector*/
104: for (i=0; i<imax; i++) {
105: PetscMemzero(w4,size*sizeof(PetscMPIInt)); /* initialise work vector*/
106: idx_i = idx[i];
107: len = n[i];
108: for (j=0; j<len; j++) {
109: row = idx_i[j];
110: if (row < 0) {
111: SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
112: }
113: proc = rtable[row];
114: w4[proc]++;
115: }
116: for (j=0; j<size; j++){
117: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
118: }
119: }
121: nrqs = 0; /* no of outgoing messages */
122: msz = 0; /* total mesg length (for all procs) */
123: w1[rank] = 0; /* no mesg sent to itself */
124: w3[rank] = 0;
125: for (i=0; i<size; i++) {
126: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
127: }
128: /* pa - is list of processors to communicate with */
129: PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);
130: for (i=0,j=0; i<size; i++) {
131: if (w1[i]) {pa[j] = i; j++;}
132: }
134: /* Each message would have a header = 1 + 2*(no of IS) + data */
135: for (i=0; i<nrqs; i++) {
136: j = pa[i];
137: w1[j] += w2[j] + 2*w3[j];
138: msz += w1[j];
139: }
141: /* Determine the number of messages to expect, their lengths, and their from-ids */
142: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
143: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
145: /* Now post the Irecvs corresponding to these messages */
146: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
148: /* Allocate Memory for outgoing messages */
149: len = 2*size*sizeof(PetscInt*) + (size+msz)*sizeof(PetscInt);
150: PetscMalloc(len,&outdat);
151: ptr = outdat + size; /* Pointers to the data in outgoing buffers */
152: PetscMemzero(outdat,2*size*sizeof(PetscInt*));
153: tmp = (PetscInt*)(outdat + 2*size);
154: ctr = tmp + msz;
156: {
157: PetscInt *iptr = tmp,ict = 0;
158: for (i=0; i<nrqs; i++) {
159: j = pa[i];
160: iptr += ict;
161: outdat[j] = iptr;
162: ict = w1[j];
163: }
164: }
166: /* Form the outgoing messages */
167: /*plug in the headers*/
168: for (i=0; i<nrqs; i++) {
169: j = pa[i];
170: outdat[j][0] = 0;
171: PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));
172: ptr[j] = outdat[j] + 2*w3[j] + 1;
173: }
174:
175: /* Memory for doing local proc's work*/
176: {
177: PetscInt *d_p;
178: char *t_p;
180: len = (imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
181: (m)*imax*sizeof(PetscInt) + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1;
182: PetscMalloc(len,&table);
183: PetscMemzero(table,len);
184: data = (PetscInt **)(table + imax);
185: isz = (PetscInt *)(data + imax);
186: d_p = (PetscInt *)(isz + imax);
187: t_p = (char *)(d_p + m*imax);
188: for (i=0; i<imax; i++) {
189: table[i] = t_p + (m/PETSC_BITS_PER_BYTE+1)*i;
190: data[i] = d_p + (m)*i;
191: }
192: }
194: /* Parse the IS and update local tables and the outgoing buf with the data*/
195: {
196: PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
197: PetscBT table_i;
199: for (i=0; i<imax; i++) {
200: PetscMemzero(ctr,size*sizeof(PetscInt));
201: n_i = n[i];
202: table_i = table[i];
203: idx_i = idx[i];
204: data_i = data[i];
205: isz_i = isz[i];
206: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
207: row = idx_i[j];
208: proc = rtable[row];
209: if (proc != rank) { /* copy to the outgoing buffer */
210: ctr[proc]++;
211: *ptr[proc] = row;
212: ptr[proc]++;
213: } else { /* Update the local table */
214: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
215: }
216: }
217: /* Update the headers for the current IS */
218: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
219: if ((ctr_j = ctr[j])) {
220: outdat_j = outdat[j];
221: k = ++outdat_j[0];
222: outdat_j[2*k] = ctr_j;
223: outdat_j[2*k-1] = i;
224: }
225: }
226: isz[i] = isz_i;
227: }
228: }
229:
232: /* Now post the sends */
233: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
234: for (i=0; i<nrqs; ++i) {
235: j = pa[i];
236: MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
237: }
238:
239: /* No longer need the original indices*/
240: for (i=0; i<imax; ++i) {
241: ISRestoreIndices(is[i],idx+i);
242: }
243: PetscFree(idx);
245: for (i=0; i<imax; ++i) {
246: ISDestroy(is[i]);
247: }
248:
249: /* Do Local work*/
250: MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);
252: /* Receive messages*/
253: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);
254: if (nrqr) {MPI_Waitall(nrqr,r_waits1,recv_status);}
255:
256: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);
257: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status);}
259: /* Phase 1 sends are complete - deallocate buffers */
260: PetscFree(outdat);
261: PetscFree(w1);
263: PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);
264: PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);
265: MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
266: PetscFree(rbuf);
268:
269: /* Send the data back*/
270: /* Do a global reduction to know the buffer space req for incoming messages*/
271: {
272: PetscMPIInt *rw1;
273:
274: PetscMalloc(size*sizeof(PetscMPIInt),&rw1);
275: PetscMemzero(rw1,size*sizeof(PetscMPIInt));
277: for (i=0; i<nrqr; ++i) {
278: proc = recv_status[i].MPI_SOURCE;
279: if (proc != onodes1[i]) SETERRQ(PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
280: rw1[proc] = isz1[i];
281: }
282: PetscFree(onodes1);
283: PetscFree(olengths1);
285: /* Determine the number of messages to expect, their lengths, and their from-ids */
286: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
287: PetscFree(rw1);
288: }
289: /* Now post the Irecvs corresponding to these messages */
290: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
292: /* Now post the sends */
293: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
294: for (i=0; i<nrqr; ++i) {
295: j = recv_status[i].MPI_SOURCE;
296: MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
297: }
299: /* receive work done on other processors*/
300: {
301: PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
302: PetscMPIInt idex;
303: PetscBT table_i;
304: MPI_Status *status2;
305:
306: PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);
307: for (i=0; i<nrqs; ++i) {
308: MPI_Waitany(nrqs,r_waits2,&idex,status2+i);
309: /* Process the message*/
310: rbuf2_i = rbuf2[idex];
311: ct1 = 2*rbuf2_i[0]+1;
312: jmax = rbuf2[idex][0];
313: for (j=1; j<=jmax; j++) {
314: max = rbuf2_i[2*j];
315: is_no = rbuf2_i[2*j-1];
316: isz_i = isz[is_no];
317: data_i = data[is_no];
318: table_i = table[is_no];
319: for (k=0; k<max; k++,ct1++) {
320: row = rbuf2_i[ct1];
321: if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
322: }
323: isz[is_no] = isz_i;
324: }
325: }
327: if (nrqr) {MPI_Waitall(nrqr,s_waits2,status2);}
328: PetscFree(status2);
329: }
330:
331: for (i=0; i<imax; ++i) {
332: ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);
333: }
334:
335: PetscFree(onodes2);
336: PetscFree(olengths2);
338: PetscFree(pa);
339: PetscFree(rbuf2);
340: PetscFree(s_waits1);
341: PetscFree(r_waits1);
342: PetscFree(s_waits2);
343: PetscFree(r_waits2);
344: PetscFree(table);
345: PetscFree(s_status);
346: PetscFree(recv_status);
347: PetscFree(xdata[0]);
348: PetscFree(xdata);
349: PetscFree(isz1);
350: return(0);
351: }
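To make the message layout sketched before MatIncreaseOverlap_MPIAIJ_Once() concrete, here is a hypothetical, standalone walker over one received buffer; WalkOverlapMessage() is not a PETSc routine, it only mirrors how rbuf_i is decoded in MatIncreaseOverlap_MPIAIJ_Receive().

#include "petsc.h"

void WalkOverlapMessage(const PetscInt *mesg)
{
  PetscInt nis = mesg[0];           /* number of index sets packed in this message    */
  PetscInt off = 2*nis + 1;         /* row data starts right after the header         */
  PetscInt i,j;

  for (i=1; i<=nis; i++) {
    PetscInt is_no = mesg[2*i-1];   /* which IS this block belongs to                 */
    PetscInt len   = mesg[2*i];     /* number of rows sent for that IS                */
    for (j=0; j<len; j++) {
      PetscInt row = mesg[off+j];   /* a global row index that B should process       */
      (void)row;(void)is_no;        /* real code looks the row up in the local tables */
    }
    off += len;
  }
}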
355: /*
356: MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
357: the work on the local processor.
359: Inputs:
360: C - MAT_MPIAIJ;
361: imax - total no of index sets processed at a time;
362: table - an array of char - size = m bits.
363:
364: Output:
365: isz - array containing the count of the solution elements corresponding
366: to each index set;
367: data - pointer to the solutions
368: */
369: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
370: {
371: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
372: Mat A = c->A,B = c->B;
373: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
374: PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
375: PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
376: PetscBT table_i;
379: rstart = C->rmap.rstart;
380: cstart = C->cmap.rstart;
381: ai = a->i;
382: aj = a->j;
383: bi = b->i;
384: bj = b->j;
385: garray = c->garray;
387:
388: for (i=0; i<imax; i++) {
389: data_i = data[i];
390: table_i = table[i];
391: isz_i = isz[i];
392: for (j=0,max=isz[i]; j<max; j++) {
393: row = data_i[j] - rstart;
394: start = ai[row];
395: end = ai[row+1];
396: for (k=start; k<end; k++) { /* Amat */
397: val = aj[k] + cstart;
398: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
399: }
400: start = bi[row];
401: end = bi[row+1];
402: for (k=start; k<end; k++) { /* Bmat */
403: val = garray[bj[k]];
404: if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
405: }
406: }
407: isz[i] = isz_i;
408: }
409: return(0);
410: }
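The loop above is the core of one overlap level: every row already in an index set contributes the column indices of its diagonal block (shifted by cstart) and of its off-diagonal block (mapped back to global numbering through garray). A hypothetical standalone version on raw CSR arrays, using a plain char array of global length instead of a PetscBT bit table; ExpandOnce() and its arguments are illustrative only, and the first nset entries of set are assumed to be globally numbered rows owned by this process.

#include "petsc.h"

static PetscInt ExpandOnce(PetscInt nset,PetscInt *set,char *seen,PetscInt rstart,PetscInt cstart,
                           const PetscInt *ai,const PetscInt *aj,
                           const PetscInt *bi,const PetscInt *bj,const PetscInt *garray)
{
  PetscInt i,k,row,col,n = nset;

  for (i=0; i<nset; i++) {
    row = set[i] - rstart;               /* global row -> local row of the diagonal block */
    for (k=ai[row]; k<ai[row+1]; k++) {  /* diagonal block: local column + cstart         */
      col = aj[k] + cstart;
      if (!seen[col]) { seen[col] = 1; set[n++] = col; }
    }
    for (k=bi[row]; k<bi[row+1]; k++) {  /* off-diagonal block: garray gives global cols  */
      col = garray[bj[k]];
      if (!seen[col]) { seen[col] = 1; set[n++] = col; }
    }
  }
  return n;                              /* new (unsorted) size of the set                */
}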
414: /*
415: MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
416: and return the output
418: Input:
419: C - the matrix
420: nrqr - no of messages being processed.
421: rbuf - an array of pointers to the received requests
422:
423: Output:
424: xdata - array of messages to be sent back
425: isz1 - size of each message
427: For better efficiency perhaps we should malloc separately each xdata[i],
428: then if a re-malloc is required we need only copy the data for that one row
429: rather than all previous rows as it is now, where a single large chunk of
430: memory is used.
432: */
433: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
434: {
435: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
436: Mat A = c->A,B = c->B;
437: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
439: PetscMPIInt rank;
440: PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
441: PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
442: PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
443: PetscInt *rbuf_i,kmax,rbuf_0;
444: PetscBT xtable;
447: rank = c->rank;
448: m = C->rmap.N;
449: rstart = C->rmap.rstart;
450: cstart = C->cmap.rstart;
451: ai = a->i;
452: aj = a->j;
453: bi = b->i;
454: bj = b->j;
455: garray = c->garray;
456:
457:
458: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
459: rbuf_i = rbuf[i];
460: rbuf_0 = rbuf_i[0];
461: ct += rbuf_0;
462: for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
463: }
464:
465: if (C->rmap.n) max1 = ct*(a->nz + b->nz)/C->rmap.n;
466: else max1 = 1;
467: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
468: PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);
469: ++no_malloc;
470: PetscBTCreate(m,xtable);
471: PetscMemzero(isz1,nrqr*sizeof(PetscInt));
472:
473: ct3 = 0;
474: for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
475: rbuf_i = rbuf[i];
476: rbuf_0 = rbuf_i[0];
477: ct1 = 2*rbuf_0+1;
478: ct2 = ct1;
479: ct3 += ct1;
480: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
481: PetscBTMemzero(m,xtable);
482: oct2 = ct2;
483: kmax = rbuf_i[2*j];
484: for (k=0; k<kmax; k++,ct1++) {
485: row = rbuf_i[ct1];
486: if (!PetscBTLookupSet(xtable,row)) {
487: if (!(ct3 < mem_estimate)) {
488: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
489: PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);
490: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
491: PetscFree(xdata[0]);
492: xdata[0] = tmp;
493: mem_estimate = new_estimate; ++no_malloc;
494: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
495: }
496: xdata[i][ct2++] = row;
497: ct3++;
498: }
499: }
500: for (k=oct2,max2=ct2; k<max2; k++) {
501: row = xdata[i][k] - rstart;
502: start = ai[row];
503: end = ai[row+1];
504: for (l=start; l<end; l++) {
505: val = aj[l] + cstart;
506: if (!PetscBTLookupSet(xtable,val)) {
507: if (!(ct3 < mem_estimate)) {
508: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
509: PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);
510: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
511: PetscFree(xdata[0]);
512: xdata[0] = tmp;
513: mem_estimate = new_estimate; ++no_malloc;
514: for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
515: }
516: xdata[i][ct2++] = val;
517: ct3++;
518: }
519: }
520: start = bi[row];
521: end = bi[row+1];
522: for (l=start; l<end; l++) {
523: val = garray[bj[l]];
524: if (!PetscBTLookupSet(xtable,val)) {
525: if (!(ct3 < mem_estimate)) {
526: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
527: PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);
528: PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));
529: PetscFree(xdata[0]);
530: xdata[0] = tmp;
531: mem_estimate = new_estimate; ++no_malloc;
532: for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
533: }
534: xdata[i][ct2++] = val;
535: ct3++;
536: }
537: }
538: }
539: /* Update the header*/
540: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
541: xdata[i][2*j-1] = rbuf_i[2*j-1];
542: }
543: xdata[i][0] = rbuf_0;
544: xdata[i+1] = xdata[i] + ct2;
545: isz1[i] = ct2; /* size of each message */
546: }
547: PetscBTDestroy(xtable);
548: PetscInfo4(0,"[%d] Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);
549: return(0);
550: }
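The three identical reallocation branches above grow the single xdata[0] chunk by roughly a factor of 1.5 whenever mem_estimate is exceeded and then re-anchor the per-message pointers xdata[1..i] inside the new chunk. A sketch that factors the pattern out (GrowXdata() is hypothetical, not part of PETSc):

#include "petsc.h"

static PetscErrorCode GrowXdata(PetscInt **xdata,const PetscInt *isz1,PetscInt i,PetscInt *mem_estimate)
{
  PetscInt       *tmp,ctr,new_estimate = (PetscInt)(1.5*(*mem_estimate)) + 1;
  PetscErrorCode ierr;

  ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
  ierr = PetscMemcpy(tmp,xdata[0],(*mem_estimate)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  xdata[0]      = tmp;
  *mem_estimate = new_estimate;
  /* re-anchor the already-filled messages inside the new chunk */
  for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
  return(0);
}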
551: /* -------------------------------------------------------------------------*/
552: EXTERN PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
553: EXTERN PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
554: /*
555: Every processor gets the entire matrix
556: */
559: PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatReuse scall,Mat *Bin[])
560: {
561: Mat B;
562: Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
563: Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
565: PetscMPIInt size,rank,*recvcounts = 0,*displs = 0;
566: PetscInt sendcount,i,*rstarts = A->rmap.range,n,cnt,j;
567: PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
568: PetscScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
571: MPI_Comm_size(A->comm,&size);
572: MPI_Comm_rank(A->comm,&rank);
574: if (scall == MAT_INITIAL_MATRIX) {
575: /* ----------------------------------------------------------------
576: Tell every processor the number of nonzeros per row
577: */
578: PetscMalloc(A->rmap.N*sizeof(PetscInt),&lens);
579: for (i=A->rmap.rstart; i<A->rmap.rend; i++) {
580: lens[i] = ad->i[i-A->rmap.rstart+1] - ad->i[i-A->rmap.rstart] + bd->i[i-A->rmap.rstart+1] - bd->i[i-A->rmap.rstart];
581: }
582: sendcount = A->rmap.rend - A->rmap.rstart;
583: PetscMalloc(2*size*sizeof(PetscMPIInt),&recvcounts);
584: displs = recvcounts + size;
585: for (i=0; i<size; i++) {
586: recvcounts[i] = A->rmap.range[i+1] - A->rmap.range[i];
587: displs[i] = A->rmap.range[i];
588: }
589: MPI_Allgatherv(lens+A->rmap.rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,A->comm);
591: /* ---------------------------------------------------------------
592: Create the sequential matrix of the same type as the local block diagonal
593: */
594: MatCreate(PETSC_COMM_SELF,&B);
595: MatSetSizes(B,A->rmap.N,A->cmap.N,PETSC_DETERMINE,PETSC_DETERMINE);
596: MatSetType(B,a->A->type_name);
597: MatSeqAIJSetPreallocation(B,0,lens);
598: PetscMalloc(sizeof(Mat),Bin);
599: **Bin = B;
600: b = (Mat_SeqAIJ *)B->data;
602: /*--------------------------------------------------------------------
603: Copy my part of matrix column indices over
604: */
605: sendcount = ad->nz + bd->nz;
606: jsendbuf = b->j + b->i[rstarts[rank]];
607: a_jsendbuf = ad->j;
608: b_jsendbuf = bd->j;
609: n = A->rmap.rend - A->rmap.rstart;
610: cnt = 0;
611: for (i=0; i<n; i++) {
613: /* put in lower diagonal portion */
614: m = bd->i[i+1] - bd->i[i];
615: while (m > 0) {
616: /* is it above diagonal (in bd (compressed) numbering) */
617: if (garray[*b_jsendbuf] > A->rmap.rstart + i) break;
618: jsendbuf[cnt++] = garray[*b_jsendbuf++];
619: m--;
620: }
622: /* put in diagonal portion */
623: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
624: jsendbuf[cnt++] = A->rmap.rstart + *a_jsendbuf++;
625: }
627: /* put in upper diagonal portion */
628: while (m-- > 0) {
629: jsendbuf[cnt++] = garray[*b_jsendbuf++];
630: }
631: }
632: if (cnt != sendcount) SETERRQ2(PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
634: /*--------------------------------------------------------------------
635: Gather all column indices to all processors
636: */
637: for (i=0; i<size; i++) {
638: recvcounts[i] = 0;
639: for (j=A->rmap.range[i]; j<A->rmap.range[i+1]; j++) {
640: recvcounts[i] += lens[j];
641: }
642: }
643: displs[0] = 0;
644: for (i=1; i<size; i++) {
645: displs[i] = displs[i-1] + recvcounts[i-1];
646: }
647: MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,A->comm);
649: /*--------------------------------------------------------------------
650: Assemble the matrix into usable form (note: numerical values not yet set)
651: */
652: /* set the b->ilen (length of each row) values */
653: PetscMemcpy(b->ilen,lens,A->rmap.N*sizeof(PetscInt));
654: /* set the b->i indices */
655: b->i[0] = 0;
656: for (i=1; i<=A->rmap.N; i++) {
657: b->i[i] = b->i[i-1] + lens[i-1];
658: }
659: PetscFree(lens);
660: MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
661: MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);
663: } else {
664: B = **Bin;
665: b = (Mat_SeqAIJ *)B->data;
666: }
668: /*--------------------------------------------------------------------
669: Copy my part of matrix numerical values into the values location
670: */
671: sendcount = ad->nz + bd->nz;
672: sendbuf = b->a + b->i[rstarts[rank]];
673: a_sendbuf = ad->a;
674: b_sendbuf = bd->a;
675: b_sendj = bd->j;
676: n = A->rmap.rend - A->rmap.rstart;
677: cnt = 0;
678: for (i=0; i<n; i++) {
680: /* put in lower diagonal portion */
681: m = bd->i[i+1] - bd->i[i];
682: while (m > 0) {
683: /* is it above diagonal (in bd (compressed) numbering) */
684: if (garray[*b_sendj] > A->rmap.rstart + i) break;
685: sendbuf[cnt++] = *b_sendbuf++;
686: m--;
687: b_sendj++;
688: }
690: /* put in diagonal portion */
691: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
692: sendbuf[cnt++] = *a_sendbuf++;
693: }
695: /* put in upper diagonal portion */
696: while (m-- > 0) {
697: sendbuf[cnt++] = *b_sendbuf++;
698: b_sendj++;
699: }
700: }
701: if (cnt != sendcount) SETERRQ2(PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
702:
703: /* -----------------------------------------------------------------
704: Gather all numerical values to all processors
705: */
706: if (!recvcounts) {
707: PetscMalloc(2*size*sizeof(PetscInt),&recvcounts);
708: displs = recvcounts + size;
709: }
710: for (i=0; i<size; i++) {
711: recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
712: }
713: displs[0] = 0;
714: for (i=1; i<size; i++) {
715: displs[i] = displs[i-1] + recvcounts[i-1];
716: }
717: recvbuf = b->a;
718: MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,A->comm);
719: PetscFree(recvcounts);
720: if (A->symmetric){
721: MatSetOption(B,MAT_SYMMETRIC);
722: } else if (A->hermitian) {
723: MatSetOption(B,MAT_HERMITIAN);
724: } else if (A->structurally_symmetric) {
725: MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC);
726: }
728: return(0);
729: }
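Both all-gathers above follow the usual MPI_Allgatherv recipe: every rank needs the per-rank receive counts plus displacements that are the exclusive prefix sum of those counts. A minimal sketch of that recipe with hypothetical names (counts and displs are plain int arrays of length size):

#include <mpi.h>

/* displs[] becomes the exclusive prefix sum of counts[] */
static void BuildDispls(int size,const int *counts,int *displs)
{
  int i;
  displs[0] = 0;
  for (i=1; i<size; i++) displs[i] = displs[i-1] + counts[i-1];
}

/* usage: MPI_Allgatherv(sendbuf,counts[rank],MPI_DOUBLE,recvbuf,(int*)counts,displs,MPI_DOUBLE,comm); */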
733: PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
734: {
736: PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
737: PetscTruth rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;
740: /*
741: Check for the special case where each processor gets the entire matrix
742: */
743: if (ismax == 1 && C->rmap.N == C->cmap.N) {
744: ISIdentity(*isrow,&rowflag);
745: ISIdentity(*iscol,&colflag);
746: ISGetLocalSize(*isrow,&nrow);
747: ISGetLocalSize(*iscol,&ncol);
748: if (rowflag && colflag && nrow == C->rmap.N && ncol == C->cmap.N) {
749: wantallmatrix = PETSC_TRUE;
750: PetscOptionsGetTruth(C->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);
751: }
752: }
753: MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,C->comm);
754: if (twantallmatrix) {
755: MatGetSubMatrix_MPIAIJ_All(C,scall,submat);
756: return(0);
757: }
759: /* Allocate memory to hold all the submatrices */
760: if (scall != MAT_REUSE_MATRIX) {
761: PetscMalloc((ismax+1)*sizeof(Mat),submat);
762: }
763: /* Determine the number of stages through which submatrices are done */
764: nmax = 20*1000000 / (C->cmap.N * sizeof(PetscInt));
765: if (!nmax) nmax = 1;
766: nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
768: /* Make sure every processor loops through the nstages */
769: MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,C->comm);
771: for (i=0,pos=0; i<nstages; i++) {
772: if (pos+nmax <= ismax) max_no = nmax;
773: else if (pos == ismax) max_no = 0;
774: else max_no = ismax-pos;
775: MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);
776: pos += max_no;
777: }
778: return(0);
779: }
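A minimal usage sketch of the public interface implemented above (ExampleGetSubMatrix() is hypothetical and omits everything except the calls themselves): each process extracts one sequential submatrix defined by its own row and column index sets.

#include "petscmat.h"

PetscErrorCode ExampleGetSubMatrix(Mat C,IS isrow,IS iscol)
{
  Mat            *submat;
  PetscErrorCode ierr;

  ierr = MatGetSubMatrices(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
  /* submat[0] is a sequential (PETSC_COMM_SELF) matrix holding the requested rows/columns */
  ierr = MatDestroyMatrices(1,&submat);CHKERRQ(ierr);
  return(0);
}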
780: /* -------------------------------------------------------------------------*/
783: PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
784: {
785: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
786: Mat A = c->A;
787: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
788: PetscInt **irow,**icol,*nrow,*ncol,start;
790: PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
791: PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
792: PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
793: PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
794: PetscInt **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
795: PetscInt len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
796: PetscInt *rmap_i;
797: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
798: MPI_Request *r_waits4,*s_waits3,*s_waits4;
799: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
800: MPI_Status *r_status3,*r_status4,*s_status4;
801: MPI_Comm comm;
802: PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
803: PetscTruth sorted;
804: PetscMPIInt *onodes1,*olengths1;
805: PetscMPIInt idex,idex2,end;
808: comm = C->comm;
809: tag0 = C->tag;
810: size = c->size;
811: rank = c->rank;
812:
813: /* Get some new tags to keep the communication clean */
814: PetscObjectGetNewTag((PetscObject)C,&tag1);
815: PetscObjectGetNewTag((PetscObject)C,&tag2);
816: PetscObjectGetNewTag((PetscObject)C,&tag3);
818: /* Check if the row/col indices are sorted */
819: for (i=0; i<ismax; i++) {
820: ISSorted(isrow[i],&sorted);
821: /*if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"isrow is not sorted");*/
822: ISSorted(iscol[i],&sorted);
823: /* if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"iscol is not sorted"); */
824: }
826: len = (2*ismax+1)*(sizeof(PetscInt*)+ sizeof(PetscInt));
827: PetscMalloc(len,&irow);
828: icol = irow + ismax;
829: nrow = (PetscInt*)(icol + ismax);
830: ncol = nrow + ismax;
832: for (i=0; i<ismax; i++) {
833: ISGetIndices(isrow[i],&irow[i]);
834: ISGetIndices(iscol[i],&icol[i]);
835: ISGetLocalSize(isrow[i],&nrow[i]);
836: ISGetLocalSize(iscol[i],&ncol[i]);
837: }
839: /* evaluate communication - mesg to who, length of mesg, and buffer space
840: required. Based on this, buffers are allocated, and data copied into them*/
841: PetscMalloc(size*4*sizeof(PetscMPIInt),&w1); /* mesg size */
842: w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/
843: w3 = w2 + size; /* no of IS that needs to be sent to proc i */
844: w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */
845: PetscMemzero(w1,size*3*sizeof(PetscMPIInt)); /* initialize work vector*/
846: for (i=0; i<ismax; i++) {
847: PetscMemzero(w4,size*sizeof(PetscMPIInt)); /* initialize work vector*/
848: jmax = nrow[i];
849: irow_i = irow[i];
850: for (j=0; j<jmax; j++) {
851: l = 0;
852: row = irow_i[j];
853: while (row >= C->rmap.range[l+1]) l++;
854: proc = l;
855: w4[proc]++;
856: }
857: for (j=0; j<size; j++) {
858: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
859: }
860: }
861:
862: nrqs = 0; /* no of outgoing messages */
863: msz = 0; /* total mesg length (for all procs) */
864: w1[rank] = 0; /* no mesg sent to self */
865: w3[rank] = 0;
866: for (i=0; i<size; i++) {
867: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
868: }
869: PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa); /*(proc -array)*/
870: for (i=0,j=0; i<size; i++) {
871: if (w1[i]) { pa[j] = i; j++; }
872: }
874: /* Each message would have a header = 1 + 2*(no of IS) + data */
875: for (i=0; i<nrqs; i++) {
876: j = pa[i];
877: w1[j] += w2[j] + 2* w3[j];
878: msz += w1[j];
879: }
881: /* Determine the number of messages to expect, their lengths, and their from-ids */
882: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
883: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
885: /* Now post the Irecvs corresponding to these messages */
886: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
887:
888: PetscFree(onodes1);
889: PetscFree(olengths1);
890:
891: /* Allocate Memory for outgoing messages */
892: len = 2*size*sizeof(PetscInt*) + 2*msz*sizeof(PetscInt) + size*sizeof(PetscInt);
893: PetscMalloc(len,&sbuf1);
894: ptr = sbuf1 + size; /* Pointers to the data in outgoing buffers */
895: PetscMemzero(sbuf1,2*size*sizeof(PetscInt*));
896: /* allocate memory for outgoing data + buf to receive the first reply */
897: tmp = (PetscInt*)(ptr + size);
898: ctr = tmp + 2*msz;
900: {
901: PetscInt *iptr = tmp,ict = 0;
902: for (i=0; i<nrqs; i++) {
903: j = pa[i];
904: iptr += ict;
905: sbuf1[j] = iptr;
906: ict = w1[j];
907: }
908: }
910: /* Form the outgoing messages */
911: /* Initialize the header space */
912: for (i=0; i<nrqs; i++) {
913: j = pa[i];
914: sbuf1[j][0] = 0;
915: PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));
916: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
917: }
918:
919: /* Parse the isrow and copy data into outbuf */
920: for (i=0; i<ismax; i++) {
921: PetscMemzero(ctr,size*sizeof(PetscInt));
922: irow_i = irow[i];
923: jmax = nrow[i];
924: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
925: l = 0;
926: row = irow_i[j];
927: while (row >= C->rmap.range[l+1]) l++;
928: proc = l;
929: if (proc != rank) { /* copy to the outgoing buf*/
930: ctr[proc]++;
931: *ptr[proc] = row;
932: ptr[proc]++;
933: }
934: }
935: /* Update the headers for the current IS */
936: for (j=0; j<size; j++) { /* Can Optimise this loop too */
937: if ((ctr_j = ctr[j])) {
938: sbuf1_j = sbuf1[j];
939: k = ++sbuf1_j[0];
940: sbuf1_j[2*k] = ctr_j;
941: sbuf1_j[2*k-1] = i;
942: }
943: }
944: }
946: /* Now post the sends */
947: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);
948: for (i=0; i<nrqs; ++i) {
949: j = pa[i];
950: MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
951: }
953: /* Post Receives to capture the buffer size */
954: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);
955: PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);
956: rbuf2[0] = tmp + msz;
957: for (i=1; i<nrqs; ++i) {
958: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
959: }
960: for (i=0; i<nrqs; ++i) {
961: j = pa[i];
962: MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);
963: }
965: /* Send to other procs the buf size they should allocate */
966:
968: /* Receive messages*/
969: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);
970: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);
971: len = 2*nrqr*sizeof(PetscInt) + (nrqr+1)*sizeof(PetscInt*);
972: PetscMalloc(len,&sbuf2);
973: req_size = (PetscInt*)(sbuf2 + nrqr);
974: req_source = req_size + nrqr;
975:
976: {
977: Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
978: PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap.rstart;
979: PetscInt *sbuf2_i;
981: for (i=0; i<nrqr; ++i) {
982: MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);
983: req_size[idex] = 0;
984: rbuf1_i = rbuf1[idex];
985: start = 2*rbuf1_i[0] + 1;
986: MPI_Get_count(r_status1+i,MPIU_INT,&end);
987: PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);
988: sbuf2_i = sbuf2[idex];
989: for (j=start; j<end; j++) {
990: id = rbuf1_i[j] - rstart;
991: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
992: sbuf2_i[j] = ncols;
993: req_size[idex] += ncols;
994: }
995: req_source[idex] = r_status1[i].MPI_SOURCE;
996: /* form the header */
997: sbuf2_i[0] = req_size[idex];
998: for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
999: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);
1000: }
1001: }
1002: PetscFree(r_status1);
1003: PetscFree(r_waits1);
1005: /* recv buffer sizes */
1006: /* Receive messages*/
1007:
1008: PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);
1009: PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);
1010: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);
1011: PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);
1012: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);
1014: for (i=0; i<nrqs; ++i) {
1015: MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);
1016: PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);
1017: PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);
1018: MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);
1019: MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);
1020: }
1021: PetscFree(r_status2);
1022: PetscFree(r_waits2);
1023:
1024: /* Wait on sends1 and sends2 */
1025: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);
1026: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);
1028: if (nrqs) {MPI_Waitall(nrqs,s_waits1,s_status1);}
1029: if (nrqr) {MPI_Waitall(nrqr,s_waits2,s_status2);}
1030: PetscFree(s_status1);
1031: PetscFree(s_status2);
1032: PetscFree(s_waits1);
1033: PetscFree(s_waits2);
1035: /* Now allocate buffers for a->j, and send them off */
1036: PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);
1037: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1038: PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);
1039: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1040:
1041: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);
1042: {
1043: PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
1044: PetscInt *cworkA,*cworkB,cstart = C->cmap.rstart,rstart = C->rmap.rstart,*bmap = c->garray;
1045: PetscInt cend = C->cmap.rend;
1046: PetscInt *a_j = a->j,*b_j = b->j,ctmp;
1048: for (i=0; i<nrqr; i++) {
1049: rbuf1_i = rbuf1[i];
1050: sbuf_aj_i = sbuf_aj[i];
1051: ct1 = 2*rbuf1_i[0] + 1;
1052: ct2 = 0;
1053: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1054: kmax = rbuf1[i][2*j];
1055: for (k=0; k<kmax; k++,ct1++) {
1056: row = rbuf1_i[ct1] - rstart;
1057: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1058: ncols = nzA + nzB;
1059: cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1061: /* load the column indices for this row into cols*/
1062: cols = sbuf_aj_i + ct2;
1063:
1064: lwrite = 0;
1065: for (l=0; l<nzB; l++) {
1066: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1067: }
1068: for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1069: for (l=0; l<nzB; l++) {
1070: if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1071: }
1073: ct2 += ncols;
1074: }
1075: }
1076: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);
1077: }
1078: }
1079: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);
1080: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);
1082: /* Allocate buffers for a->a, and send them off */
1083: PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);
1084: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1085: PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);
1086: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1087:
1088: PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);
1089: {
1090: PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
1091: PetscInt cstart = C->cmap.rstart,rstart = C->rmap.rstart,*bmap = c->garray;
1092: PetscInt cend = C->cmap.rend;
1093: PetscInt *b_j = b->j;
1094: PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
1095:
1096: for (i=0; i<nrqr; i++) {
1097: rbuf1_i = rbuf1[i];
1098: sbuf_aa_i = sbuf_aa[i];
1099: ct1 = 2*rbuf1_i[0]+1;
1100: ct2 = 0;
1101: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1102: kmax = rbuf1_i[2*j];
1103: for (k=0; k<kmax; k++,ct1++) {
1104: row = rbuf1_i[ct1] - rstart;
1105: nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
1106: ncols = nzA + nzB;
1107: cworkB = b_j + b_i[row];
1108: vworkA = a_a + a_i[row];
1109: vworkB = b_a + b_i[row];
1111: /* load the column values for this row into vals*/
1112: vals = sbuf_aa_i+ct2;
1113:
1114: lwrite = 0;
1115: for (l=0; l<nzB; l++) {
1116: if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1117: }
1118: for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1119: for (l=0; l<nzB; l++) {
1120: if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1121: }
1122:
1123: ct2 += ncols;
1124: }
1125: }
1126: MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);
1127: }
1128: }
1129: PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);
1130: PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);
1131: PetscFree(rbuf1);
1133: /* Form the matrix */
1134: /* create col map */
1135: {
1136: PetscInt *icol_i;
1137:
1138: len = (1+ismax)*sizeof(PetscInt*)+ (1+ismax*C->cmap.N)*sizeof(PetscInt);
1139: PetscMalloc(len,&cmap);
1140: cmap[0] = (PetscInt*)(cmap + ismax);
1141: PetscMemzero(cmap[0],(1+ismax*C->cmap.N)*sizeof(PetscInt));
1142: for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap.N; }
1143: for (i=0; i<ismax; i++) {
1144: jmax = ncol[i];
1145: icol_i = icol[i];
1146: cmap_i = cmap[i];
1147: for (j=0; j<jmax; j++) {
1148: cmap_i[icol_i[j]] = j+1;
1149: }
1150: }
1151: }
1153: /* Create lens which is required for MatCreate... */
1154: for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1155: len = (1+ismax)*sizeof(PetscInt*)+ j*sizeof(PetscInt);
1156: PetscMalloc(len,&lens);
1157: lens[0] = (PetscInt*)(lens + ismax);
1158: PetscMemzero(lens[0],j*sizeof(PetscInt));
1159: for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1160:
1161: /* Update lens from local data */
1162: for (i=0; i<ismax; i++) {
1163: jmax = nrow[i];
1164: cmap_i = cmap[i];
1165: irow_i = irow[i];
1166: lens_i = lens[i];
1167: for (j=0; j<jmax; j++) {
1168: l = 0;
1169: row = irow_i[j];
1170: while (row >= C->rmap.range[l+1]) l++;
1171: proc = l;
1172: if (proc == rank) {
1173: MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);
1174: for (k=0; k<ncols; k++) {
1175: if (cmap_i[cols[k]]) { lens_i[j]++;}
1176: }
1177: MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);
1178: }
1179: }
1180: }
1181:
1182: /* Create row map*/
1183: len = (1+ismax)*sizeof(PetscInt*)+ ismax*C->rmap.N*sizeof(PetscInt);
1184: PetscMalloc(len,&rmap);
1185: rmap[0] = (PetscInt*)(rmap + ismax);
1186: PetscMemzero(rmap[0],ismax*C->rmap.N*sizeof(PetscInt));
1187: for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap.N;}
1188: for (i=0; i<ismax; i++) {
1189: rmap_i = rmap[i];
1190: irow_i = irow[i];
1191: jmax = nrow[i];
1192: for (j=0; j<jmax; j++) {
1193: rmap_i[irow_i[j]] = j;
1194: }
1195: }
1196:
1197: /* Update lens from offproc data */
1198: {
1199: PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1201: for (tmp2=0; tmp2<nrqs; tmp2++) {
1202: MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);
1203: idex = pa[idex2];
1204: sbuf1_i = sbuf1[idex];
1205: jmax = sbuf1_i[0];
1206: ct1 = 2*jmax+1;
1207: ct2 = 0;
1208: rbuf2_i = rbuf2[idex2];
1209: rbuf3_i = rbuf3[idex2];
1210: for (j=1; j<=jmax; j++) {
1211: is_no = sbuf1_i[2*j-1];
1212: max1 = sbuf1_i[2*j];
1213: lens_i = lens[is_no];
1214: cmap_i = cmap[is_no];
1215: rmap_i = rmap[is_no];
1216: for (k=0; k<max1; k++,ct1++) {
1217: row = rmap_i[sbuf1_i[ct1]]; /* the row in the new (sub)matrix to be filled */
1218: max2 = rbuf2_i[ct1];
1219: for (l=0; l<max2; l++,ct2++) {
1220: if (cmap_i[rbuf3_i[ct2]]) {
1221: lens_i[row]++;
1222: }
1223: }
1224: }
1225: }
1226: }
1227: }
1228: PetscFree(r_status3);
1229: PetscFree(r_waits3);
1230: if (nrqr) {MPI_Waitall(nrqr,s_waits3,s_status3);}
1231: PetscFree(s_status3);
1232: PetscFree(s_waits3);
1234: /* Create the submatrices */
1235: if (scall == MAT_REUSE_MATRIX) {
1236: PetscTruth flag;
1238: /*
1239: Assumes the new rows are the same length as the old rows, hence bug!
1240: */
1241: for (i=0; i<ismax; i++) {
1242: mat = (Mat_SeqAIJ *)(submats[i]->data);
1243: if ((submats[i]->rmap.n != nrow[i]) || (submats[i]->cmap.n != ncol[i])) {
1244: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1245: }
1246: PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap.n*sizeof(PetscInt),&flag);
1247: if (!flag) {
1248: SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1249: }
1250: /* Initialize the matrix as if it were empty */
1251: PetscMemzero(mat->ilen,submats[i]->rmap.n*sizeof(PetscInt));
1252: submats[i]->factor = C->factor;
1253: }
1254: } else {
1255: for (i=0; i<ismax; i++) {
1256: MatCreate(PETSC_COMM_SELF,submats+i);
1257: MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);
1258: MatSetType(submats[i],A->type_name);
1259: MatSeqAIJSetPreallocation(submats[i],0,lens[i]);
1260: }
1261: }
1263: /* Assemble the matrices */
1264: /* First assemble the local rows */
1265: {
1266: PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1267: PetscScalar *imat_a;
1268:
1269: for (i=0; i<ismax; i++) {
1270: mat = (Mat_SeqAIJ*)submats[i]->data;
1271: imat_ilen = mat->ilen;
1272: imat_j = mat->j;
1273: imat_i = mat->i;
1274: imat_a = mat->a;
1275: cmap_i = cmap[i];
1276: rmap_i = rmap[i];
1277: irow_i = irow[i];
1278: jmax = nrow[i];
1279: for (j=0; j<jmax; j++) {
1280: l = 0;
1281: row = irow_i[j];
1282: while (row >= C->rmap.range[l+1]) l++;
1283: proc = l;
1284: if (proc == rank) {
1285: old_row = row;
1286: row = rmap_i[row];
1287: ilen_row = imat_ilen[row];
1288: MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
1289: mat_i = imat_i[row] ;
1290: mat_a = imat_a + mat_i;
1291: mat_j = imat_j + mat_i;
1292: for (k=0; k<ncols; k++) {
1293: if ((tcol = cmap_i[cols[k]])) {
1294: *mat_j++ = tcol - 1;
1295: *mat_a++ = vals[k];
1296: ilen_row++;
1297: }
1298: }
1299: MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
1300: imat_ilen[row] = ilen_row;
1301: }
1302: }
1303: }
1304: }
1306: /* Now assemble the off proc rows*/
1307: {
1308: PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1309: PetscInt *imat_j,*imat_i;
1310: PetscScalar *imat_a,*rbuf4_i;
1312: for (tmp2=0; tmp2<nrqs; tmp2++) {
1313: MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);
1314: idex = pa[idex2];
1315: sbuf1_i = sbuf1[idex];
1316: jmax = sbuf1_i[0];
1317: ct1 = 2*jmax + 1;
1318: ct2 = 0;
1319: rbuf2_i = rbuf2[idex2];
1320: rbuf3_i = rbuf3[idex2];
1321: rbuf4_i = rbuf4[idex2];
1322: for (j=1; j<=jmax; j++) {
1323: is_no = sbuf1_i[2*j-1];
1324: rmap_i = rmap[is_no];
1325: cmap_i = cmap[is_no];
1326: mat = (Mat_SeqAIJ*)submats[is_no]->data;
1327: imat_ilen = mat->ilen;
1328: imat_j = mat->j;
1329: imat_i = mat->i;
1330: imat_a = mat->a;
1331: max1 = sbuf1_i[2*j];
1332: for (k=0; k<max1; k++,ct1++) {
1333: row = sbuf1_i[ct1];
1334: row = rmap_i[row];
1335: ilen = imat_ilen[row];
1336: mat_i = imat_i[row] ;
1337: mat_a = imat_a + mat_i;
1338: mat_j = imat_j + mat_i;
1339: max2 = rbuf2_i[ct1];
1340: for (l=0; l<max2; l++,ct2++) {
1341: if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1342: *mat_j++ = tcol - 1;
1343: *mat_a++ = rbuf4_i[ct2];
1344: ilen++;
1345: }
1346: }
1347: imat_ilen[row] = ilen;
1348: }
1349: }
1350: }
1351: }
1352: PetscFree(r_status4);
1353: PetscFree(r_waits4);
1354: if (nrqr) {MPI_Waitall(nrqr,s_waits4,s_status4);}
1355: PetscFree(s_waits4);
1356: PetscFree(s_status4);
1358: /* Restore the indices */
1359: for (i=0; i<ismax; i++) {
1360: ISRestoreIndices(isrow[i],irow+i);
1361: ISRestoreIndices(iscol[i],icol+i);
1362: }
1364: /* Destroy allocated memory */
1365: PetscFree(irow);
1366: PetscFree(w1);
1367: PetscFree(pa);
1369: PetscFree(sbuf1);
1370: PetscFree(rbuf2);
1371: for (i=0; i<nrqr; ++i) {
1372: PetscFree(sbuf2[i]);
1373: }
1374: for (i=0; i<nrqs; ++i) {
1375: PetscFree(rbuf3[i]);
1376: PetscFree(rbuf4[i]);
1377: }
1379: PetscFree(sbuf2);
1380: PetscFree(rbuf3);
1381: PetscFree(rbuf4);
1382: PetscFree(sbuf_aj[0]);
1383: PetscFree(sbuf_aj);
1384: PetscFree(sbuf_aa[0]);
1385: PetscFree(sbuf_aa);
1386:
1387: PetscFree(cmap);
1388: PetscFree(rmap);
1389: PetscFree(lens);
1391: for (i=0; i<ismax; i++) {
1392: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
1393: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
1394: }
1395: return(0);
1396: }
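Throughout the assembly above, column filtering relies on a dense per-IS lookup table: cmap_i[global_col] holds the 1-based local column when that column belongs to the submatrix and 0 otherwise, which is why accepted entries are stored as tcol - 1. A hypothetical illustration of that filtering for a single global row (FilterRow() is not part of PETSc):

#include "petsc.h"

static PetscInt FilterRow(PetscInt ncols,const PetscInt *cols,const PetscScalar *vals,
                          const PetscInt *cmap_i,PetscInt *subcols,PetscScalar *subvals)
{
  PetscInt k,tcol,n = 0;

  for (k=0; k<ncols; k++) {
    if ((tcol = cmap_i[cols[k]])) {  /* nonzero means the column was selected  */
      subcols[n] = tcol - 1;         /* recover the 0-based local column index */
      subvals[n] = vals[k];
      n++;
    }
  }
  return n;                          /* number of entries kept for this row    */
}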