Actual source code: inode.c
1: #define PETSCMAT_DLL
3: /*
4: This file provides high performance routines for the Inode format (compressed sparse row)
5: by taking advantage of rows with identical nonzero structure (I-nodes).
6: */
7: #include src/mat/impls/csr/inode/inode.h
11: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12: {
13: Mat_inode *a = (Mat_inode*)A->data;
15: PetscInt i,count,m,n,min_mn,*ns_row,*ns_col;
18: n = A->cmap.n;
19: m = A->rmap.n;
20: ns_row = a->inode.size;
21:
22: min_mn = (m < n) ? m : n;
23: if (!ns) {
24: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25: for(; count+1 < n; count++,i++);
26: if (count < n) {
27: i++;
28: }
29: *size = i;
30: return(0);
31: }
32: PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);
33:
34: /* Use the same row structure wherever feasible. */
35: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36: ns_col[i] = ns_row[i];
37: }
39: /* if m < n; pad up the remainder with inode_limit */
40: for(; count+1 < n; count++,i++) {
41: ns_col[i] = 1;
42: }
43: /* The last node is the odd ball. padd it up with the remaining rows; */
44: if (count < n) {
45: ns_col[i] = n - count;
46: i++;
47: } else if (count > n) {
48: /* Adjust for the over estimation */
49: ns_col[i-1] += n - count;
50: }
51: *size = i;
52: *ns = ns_col;
53: return(0);
54: }
57: /*
58: This builds symmetric version of nonzero structure,
59: */
62: static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63: {
64: Mat_inode *a = (Mat_inode*)A->data;
66: PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67: PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
70: nslim_row = a->inode.node_count;
71: m = A->rmap.n;
72: n = A->cmap.n;
73: if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square");
74:
75: /* Use the row_inode as column_inode */
76: nslim_col = nslim_row;
77: ns_col = ns_row;
79: /* allocate space for reformated inode structure */
80: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);
81: PetscMalloc((n+1)*sizeof(PetscInt),&tvc);
82: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
84: for (i1=0,col=0; i1<nslim_col; ++i1){
85: nsz = ns_col[i1];
86: for (i2=0; i2<nsz; ++i2,++col)
87: tvc[col] = i1;
88: }
89: /* allocate space for row pointers */
90: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
91: *iia = ia;
92: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
93: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);
95: /* determine the number of columns in each row */
96: ia[0] = oshift;
97: for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
99: j = aj + ai[row] + ishift;
100: jmax = aj + ai[row+1] + ishift;
101: i2 = 0;
102: col = *j++ + ishift;
103: i2 = tvc[col];
104: while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105: ia[i1+1]++;
106: ia[i2+1]++;
107: i2++; /* Start col of next node */
108: while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109: i2 = tvc[col];
110: }
111: if(i2 == i1) ia[i2+1]++; /* now the diagonal element */
112: }
114: /* shift ia[i] to point to next row */
115: for (i1=1; i1<nslim_row+1; i1++) {
116: row = ia[i1-1];
117: ia[i1] += row;
118: work[i1-1] = row - oshift;
119: }
121: /* allocate space for column pointers */
122: nz = ia[nslim_row] + (!ishift);
123: PetscMalloc(nz*sizeof(PetscInt),&ja);
124: *jja = ja;
126: /* loop over lower triangular part putting into ja */
127: for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128: j = aj + ai[row] + ishift;
129: jmax = aj + ai[row+1] + ishift;
130: i2 = 0; /* Col inode index */
131: col = *j++ + ishift;
132: i2 = tvc[col];
133: while (i2<i1 && j<jmax) {
134: ja[work[i2]++] = i1 + oshift;
135: ja[work[i1]++] = i2 + oshift;
136: ++i2;
137: while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138: i2 = tvc[col];
139: }
140: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
142: }
143: PetscFree(work);
144: PetscFree(tns);
145: PetscFree(tvc);
146: return(0);
147: }
149: /*
150: This builds nonsymmetric version of nonzero structure,
151: */
154: static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155: {
156: Mat_inode *a = (Mat_inode*)A->data;
158: PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
162: nslim_row = a->inode.node_count;
163: n = A->cmap.n;
165: /* Create The column_inode for this matrix */
166: Mat_CreateColInode(A,&nslim_col,&ns_col);
167:
168: /* allocate space for reformated column_inode structure */
169: PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);
170: PetscMalloc((n +1)*sizeof(PetscInt),&tvc);
171: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
173: for (i1=0,col=0; i1<nslim_col; ++i1){
174: nsz = ns_col[i1];
175: for (i2=0; i2<nsz; ++i2,++col)
176: tvc[col] = i1;
177: }
178: /* allocate space for row pointers */
179: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
180: *iia = ia;
181: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
182: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);
184: /* determine the number of columns in each row */
185: ia[0] = oshift;
186: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187: j = aj + ai[row] + ishift;
188: col = *j++ + ishift;
189: i2 = tvc[col];
190: nz = ai[row+1] - ai[row];
191: while (nz-- > 0) { /* off-diagonal elemets */
192: ia[i1+1]++;
193: i2++; /* Start col of next node */
194: while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195: if (nz > 0) i2 = tvc[col];
196: }
197: }
199: /* shift ia[i] to point to next row */
200: for (i1=1; i1<nslim_row+1; i1++) {
201: row = ia[i1-1];
202: ia[i1] += row;
203: work[i1-1] = row - oshift;
204: }
206: /* allocate space for column pointers */
207: nz = ia[nslim_row] + (!ishift);
208: PetscMalloc(nz*sizeof(PetscInt),&ja);
209: *jja = ja;
211: /* loop over matrix putting into ja */
212: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213: j = aj + ai[row] + ishift;
214: i2 = 0; /* Col inode index */
215: col = *j++ + ishift;
216: i2 = tvc[col];
217: nz = ai[row+1] - ai[row];
218: while (nz-- > 0) {
219: ja[work[i1]++] = i2 + oshift;
220: ++i2;
221: while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222: if (nz > 0) i2 = tvc[col];
223: }
224: }
225: PetscFree(ns_col);
226: PetscFree(work);
227: PetscFree(tns);
228: PetscFree(tvc);
229: return(0);
230: }
234: static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235: {
236: Mat_inode *a = (Mat_inode*)A->data;
240: *n = a->inode.node_count;
241: if (!ia) return(0);
243: if (symmetric) {
244: MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);
245: } else {
246: MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
247: }
248: return(0);
249: }
253: static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
254: {
258: if (!ia) return(0);
259: PetscFree(*ia);
260: PetscFree(*ja);
261: return(0);
262: }
264: /* ----------------------------------------------------------- */
268: static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
269: {
270: Mat_inode *a = (Mat_inode*)A->data;
272: PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
273: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
276: nslim_row = a->inode.node_count;
277: n = A->cmap.n;
279: /* Create The column_inode for this matrix */
280: Mat_CreateColInode(A,&nslim_col,&ns_col);
281:
282: /* allocate space for reformated column_inode structure */
283: PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);
284: PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);
285: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
287: for (i1=0,col=0; i1<nslim_col; ++i1){
288: nsz = ns_col[i1];
289: for (i2=0; i2<nsz; ++i2,++col)
290: tvc[col] = i1;
291: }
292: /* allocate space for column pointers */
293: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);
294: *iia = ia;
295: PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
296: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);
298: /* determine the number of columns in each row */
299: ia[0] = oshift;
300: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
301: j = aj + ai[row] + ishift;
302: col = *j++ + ishift;
303: i2 = tvc[col];
304: nz = ai[row+1] - ai[row];
305: while (nz-- > 0) { /* off-diagonal elemets */
306: /* ia[i1+1]++; */
307: ia[i2+1]++;
308: i2++;
309: while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
310: if (nz > 0) i2 = tvc[col];
311: }
312: }
314: /* shift ia[i] to point to next col */
315: for (i1=1; i1<nslim_col+1; i1++) {
316: col = ia[i1-1];
317: ia[i1] += col;
318: work[i1-1] = col - oshift;
319: }
321: /* allocate space for column pointers */
322: nz = ia[nslim_col] + (!ishift);
323: PetscMalloc(nz*sizeof(PetscInt),&ja);
324: *jja = ja;
326: /* loop over matrix putting into ja */
327: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
328: j = aj + ai[row] + ishift;
329: i2 = 0; /* Col inode index */
330: col = *j++ + ishift;
331: i2 = tvc[col];
332: nz = ai[row+1] - ai[row];
333: while (nz-- > 0) {
334: /* ja[work[i1]++] = i2 + oshift; */
335: ja[work[i2]++] = i1 + oshift;
336: i2++;
337: while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
338: if (nz > 0) i2 = tvc[col];
339: }
340: }
341: PetscFree(ns_col);
342: PetscFree(work);
343: PetscFree(tns);
344: PetscFree(tvc);
345: return(0);
346: }
350: static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
351: {
355: Mat_CreateColInode(A,n,PETSC_NULL);
356: if (!ia) return(0);
358: if (symmetric) {
359: /* Since the indices are symmetric it does'nt matter */
360: MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);
361: } else {
362: MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
363: }
364: return(0);
365: }
369: static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
370: {
374: if (!ia) return(0);
375: PetscFree(*ia);
376: PetscFree(*ja);
377: return(0);
378: }
380: /* ----------------------------------------------------------- */
/*
   MatMult_Inode - Computes yy = A*xx one inode at a time.

   An inode is a run of consecutive rows handled together: the column indices of the
   node's first row are walked once and each loaded x value is applied to every row
   of the node. Node sizes 1 through 5 have a hand-unrolled case each.

   Input:  A  - matrix whose data is a Mat_inode with inode.size set
           xx - vector to multiply
   Output: yy - result vector
   Errors: PETSC_ERR_COR if inode.size is missing or a node size is outside 1..5.
*/
384: static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy)
385: {
386: Mat_inode *a = (Mat_inode*)A->data;
387: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
388: PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y;
390: PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
391:
392: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
393: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
394: #endif
397: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
398: node_max = a->inode.node_count;
399: ns = a->inode.size; /* Node Size array */
400: VecGetArray(xx,&x);
401: VecGetArray(yy,&y);
/* idx and v1 are running cursors into the column-index and value arrays; ii walks the row offsets */
402: idx = a->j;
403: v1 = a->a;
404: ii = a->i;
406: for (i = 0,row = 0; i< node_max; ++i){
407: nsz = ns[i];
/* Row length is taken from the node's first row and used for every row of the node */
408: n = ii[1] - ii[0];
409: ii += nsz;
410: sz = n; /* No of non zeros in this row */
411: /* Switch on the size of Node */
412: switch (nsz){ /* Each loop in 'case' is unrolled */
413: case 1 :
414: sum1 = 0;
415:
/* Two nonzeros per iteration; an odd leftover is handled after the loop */
416: for(n = 0; n< sz-1; n+=2) {
417: i1 = idx[0]; /* The instructions are ordered to */
418: i2 = idx[1]; /* make the compiler's job easy */
419: idx += 2;
420: tmp0 = x[i1];
421: tmp1 = x[i2];
422: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
423: }
424:
425: if (n == sz-1){ /* Take care of the last nonzero */
426: tmp0 = x[*idx++];
427: sum1 += *v1++ * tmp0;
428: }
429: y[row++]=sum1;
430: break;
431: case 2:
432: sum1 = 0;
433: sum2 = 0;
/* Values of the second row start n entries after those of the first */
434: v2 = v1 + n;
435:
436: for (n = 0; n< sz-1; n+=2) {
437: i1 = idx[0];
438: i2 = idx[1];
439: idx += 2;
440: tmp0 = x[i1];
441: tmp1 = x[i2];
442: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
443: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
444: }
445: if (n == sz-1){
446: tmp0 = x[*idx++];
447: sum1 += *v1++ * tmp0;
448: sum2 += *v2++ * tmp0;
449: }
450: y[row++]=sum1;
451: y[row++]=sum2;
452: v1 =v2; /* Since the next block to be processed starts there*/
/* Only the first row's indices were walked; step over the second row's copy */
453: idx +=sz;
454: break;
455: case 3:
456: sum1 = 0;
457: sum2 = 0;
458: sum3 = 0;
459: v2 = v1 + n;
460: v3 = v2 + n;
461:
462: for (n = 0; n< sz-1; n+=2) {
463: i1 = idx[0];
464: i2 = idx[1];
465: idx += 2;
466: tmp0 = x[i1];
467: tmp1 = x[i2];
468: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
469: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
470: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
471: }
472: if (n == sz-1){
473: tmp0 = x[*idx++];
474: sum1 += *v1++ * tmp0;
475: sum2 += *v2++ * tmp0;
476: sum3 += *v3++ * tmp0;
477: }
478: y[row++]=sum1;
479: y[row++]=sum2;
480: y[row++]=sum3;
481: v1 =v3; /* Since the next block to be processed starts there*/
482: idx +=2*sz;
483: break;
484: case 4:
485: sum1 = 0;
486: sum2 = 0;
487: sum3 = 0;
488: sum4 = 0;
489: v2 = v1 + n;
490: v3 = v2 + n;
491: v4 = v3 + n;
492:
493: for (n = 0; n< sz-1; n+=2) {
494: i1 = idx[0];
495: i2 = idx[1];
496: idx += 2;
497: tmp0 = x[i1];
498: tmp1 = x[i2];
499: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
500: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
501: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
502: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
503: }
504: if (n == sz-1){
505: tmp0 = x[*idx++];
506: sum1 += *v1++ * tmp0;
507: sum2 += *v2++ * tmp0;
508: sum3 += *v3++ * tmp0;
509: sum4 += *v4++ * tmp0;
510: }
511: y[row++]=sum1;
512: y[row++]=sum2;
513: y[row++]=sum3;
514: y[row++]=sum4;
515: v1 =v4; /* Since the next block to be processed starts there*/
516: idx +=3*sz;
517: break;
518: case 5:
519: sum1 = 0;
520: sum2 = 0;
521: sum3 = 0;
522: sum4 = 0;
523: sum5 = 0;
524: v2 = v1 + n;
525: v3 = v2 + n;
526: v4 = v3 + n;
527: v5 = v4 + n;
528:
529: for (n = 0; n<sz-1; n+=2) {
530: i1 = idx[0];
531: i2 = idx[1];
532: idx += 2;
533: tmp0 = x[i1];
534: tmp1 = x[i2];
535: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
536: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
537: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
538: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
539: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
540: }
541: if (n == sz-1){
542: tmp0 = x[*idx++];
543: sum1 += *v1++ * tmp0;
544: sum2 += *v2++ * tmp0;
545: sum3 += *v3++ * tmp0;
546: sum4 += *v4++ * tmp0;
547: sum5 += *v5++ * tmp0;
548: }
549: y[row++]=sum1;
550: y[row++]=sum2;
551: y[row++]=sum3;
552: y[row++]=sum4;
553: y[row++]=sum5;
554: v1 =v5; /* Since the next block to be processed starts there */
555: idx +=4*sz;
556: break;
557: default :
/* Returns here without restoring the arrays obtained from xx and yy */
558: SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
559: }
560: }
561: VecRestoreArray(xx,&x);
562: VecRestoreArray(yy,&y);
563: PetscLogFlops(2*a->nz - A->rmap.n);
564: return(0);
565: }
566: /* ----------------------------------------------------------- */
567: /* Almost same code as the MatMult_Inode() */
/*
   MatMultAdd_Inode - Computes yy = zz + A*xx one inode at a time.

   Same traversal as MatMult_Inode(); the only difference is that each row sum starts
   from the matching entry of zz (read through zt) instead of zero. When zz and yy are
   the same vector a single array is used for both.

   Input:  A  - matrix whose data is a Mat_inode with inode.size set
           xx - vector to multiply
           zz - vector to add
   Output: yy - result vector
   Errors: PETSC_ERR_COR if inode.size is missing or a node size is outside 1..5.
*/
570: static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy)
571: {
572: Mat_inode *a = (Mat_inode*)A->data;
573: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
574: PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt;
576: PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
577:
579: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
580: node_max = a->inode.node_count;
581: ns = a->inode.size; /* Node Size array */
582: VecGetArray(xx,&x);
583: VecGetArray(yy,&y);
/* Fetch z separately only when it is a different vector from y */
584: if (zz != yy) {
585: VecGetArray(zz,&z);
586: } else {
587: z = y;
588: }
/* zt is the read cursor over z; z itself is kept for the restore call */
589: zt = z;
591: idx = a->j;
592: v1 = a->a;
593: ii = a->i;
595: for (i = 0,row = 0; i< node_max; ++i){
596: nsz = ns[i];
/* Row length is taken from the node's first row and used for every row of the node */
597: n = ii[1] - ii[0];
598: ii += nsz;
599: sz = n; /* No of non zeros in this row */
600: /* Switch on the size of Node */
601: switch (nsz){ /* Each loop in 'case' is unrolled */
602: case 1 :
603: sum1 = *zt++;
604:
605: for(n = 0; n< sz-1; n+=2) {
606: i1 = idx[0]; /* The instructions are ordered to */
607: i2 = idx[1]; /* make the compiler's job easy */
608: idx += 2;
609: tmp0 = x[i1];
610: tmp1 = x[i2];
611: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
612: }
613:
614: if(n == sz-1){ /* Take care of the last nonzero */
615: tmp0 = x[*idx++];
616: sum1 += *v1++ * tmp0;
617: }
618: y[row++]=sum1;
619: break;
620: case 2:
621: sum1 = *zt++;
622: sum2 = *zt++;
/* Values of the second row start n entries after those of the first */
623: v2 = v1 + n;
624:
625: for(n = 0; n< sz-1; n+=2) {
626: i1 = idx[0];
627: i2 = idx[1];
628: idx += 2;
629: tmp0 = x[i1];
630: tmp1 = x[i2];
631: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
632: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
633: }
634: if(n == sz-1){
635: tmp0 = x[*idx++];
636: sum1 += *v1++ * tmp0;
637: sum2 += *v2++ * tmp0;
638: }
639: y[row++]=sum1;
640: y[row++]=sum2;
641: v1 =v2; /* Since the next block to be processed starts there*/
/* Only the first row's indices were walked; step over the second row's copy */
642: idx +=sz;
643: break;
644: case 3:
645: sum1 = *zt++;
646: sum2 = *zt++;
647: sum3 = *zt++;
648: v2 = v1 + n;
649: v3 = v2 + n;
650:
651: for (n = 0; n< sz-1; n+=2) {
652: i1 = idx[0];
653: i2 = idx[1];
654: idx += 2;
655: tmp0 = x[i1];
656: tmp1 = x[i2];
657: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
658: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
659: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
660: }
661: if (n == sz-1){
662: tmp0 = x[*idx++];
663: sum1 += *v1++ * tmp0;
664: sum2 += *v2++ * tmp0;
665: sum3 += *v3++ * tmp0;
666: }
667: y[row++]=sum1;
668: y[row++]=sum2;
669: y[row++]=sum3;
670: v1 =v3; /* Since the next block to be processed starts there*/
671: idx +=2*sz;
672: break;
673: case 4:
674: sum1 = *zt++;
675: sum2 = *zt++;
676: sum3 = *zt++;
677: sum4 = *zt++;
678: v2 = v1 + n;
679: v3 = v2 + n;
680: v4 = v3 + n;
681:
682: for (n = 0; n< sz-1; n+=2) {
683: i1 = idx[0];
684: i2 = idx[1];
685: idx += 2;
686: tmp0 = x[i1];
687: tmp1 = x[i2];
688: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
689: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
690: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
691: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
692: }
693: if (n == sz-1){
694: tmp0 = x[*idx++];
695: sum1 += *v1++ * tmp0;
696: sum2 += *v2++ * tmp0;
697: sum3 += *v3++ * tmp0;
698: sum4 += *v4++ * tmp0;
699: }
700: y[row++]=sum1;
701: y[row++]=sum2;
702: y[row++]=sum3;
703: y[row++]=sum4;
704: v1 =v4; /* Since the next block to be processed starts there*/
705: idx +=3*sz;
706: break;
707: case 5:
708: sum1 = *zt++;
709: sum2 = *zt++;
710: sum3 = *zt++;
711: sum4 = *zt++;
712: sum5 = *zt++;
713: v2 = v1 + n;
714: v3 = v2 + n;
715: v4 = v3 + n;
716: v5 = v4 + n;
717:
718: for (n = 0; n<sz-1; n+=2) {
719: i1 = idx[0];
720: i2 = idx[1];
721: idx += 2;
722: tmp0 = x[i1];
723: tmp1 = x[i2];
724: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
725: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
726: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
727: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
728: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
729: }
730: if(n == sz-1){
731: tmp0 = x[*idx++];
732: sum1 += *v1++ * tmp0;
733: sum2 += *v2++ * tmp0;
734: sum3 += *v3++ * tmp0;
735: sum4 += *v4++ * tmp0;
736: sum5 += *v5++ * tmp0;
737: }
738: y[row++]=sum1;
739: y[row++]=sum2;
740: y[row++]=sum3;
741: y[row++]=sum4;
742: y[row++]=sum5;
743: v1 =v5; /* Since the next block to be processed starts there */
744: idx +=4*sz;
745: break;
746: default :
/* Returns here without restoring the arrays obtained from xx, yy and zz */
747: SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
748: }
749: }
750: VecRestoreArray(xx,&x);
751: VecRestoreArray(yy,&y);
752: if (zz != yy) {
753: VecRestoreArray(zz,&z);
754: }
755: PetscLogFlops(2*a->nz);
756: return(0);
757: }
759: /* ----------------------------------------------------------- */
/*
   MatSolve_Inode - Solves with an LU-factored matrix, one inode at a time.

   Forward sweep: for each inode, in increasing row order, the right-hand side is read
   through the row index set (r), the entries stored before the diagonal position
   (ad[row]) of the node's first row are eliminated against already computed values,
   the rows of the node are then eliminated against each other, and the results are
   stored in the work array a->solve_work.

   Backward sweep: inodes are visited from last to first; the entries stored after the
   diagonal position are eliminated, each result is multiplied by a_a[ad[row]] and is
   written both back into the work array and into xx through the column index set (c),
   which is walked from its last entry downwards.

   Input:  A  - matrix with factor == FACTOR_LU and inode.size set
           bb - right-hand side
   Output: xx - solution
   Errors: PETSC_ERR_ARG_WRONGSTATE if A is not LU factored; PETSC_ERR_COR if
           inode.size is missing or a node size is outside 1..5.

   NOTE(review): multiplying by a_a[ad[row]] rather than dividing suggests the factor
   stores the inverse of each diagonal entry - confirm against the factorization code.
*/
762: PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx)
763: {
764: Mat_inode *a = (Mat_inode*)A->data;
765: IS iscol = a->col,isrow = a->row;
767: PetscInt *r,*c,i,j,n = A->rmap.n,*ai = a->i,nz,*a_j = a->j;
768: PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
769: PetscScalar *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1;
770: PetscScalar sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5;
773: if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
774: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
775: node_max = a->inode.node_count;
776: ns = a->inode.size; /* Node Size array */
778: VecGetArray(bb,&b);
779: VecGetArray(xx,&x);
780: tmp = a->solve_work;
781:
/* r walks the row indices forward; c starts at the last column index and walks backward */
782: ISGetIndices(isrow,&rout); r = rout;
783: ISGetIndices(iscol,&cout); c = cout + (n-1);
784:
785: /* forward solve the lower triangular */
786: tmps = tmp ;
787: aa = a_a ;
788: aj = a_j ;
789: ad = a->diag;
791: for (i = 0,row = 0; i< node_max; ++i){
792: nsz = ns[i];
793: aii = ai[row];
794: v1 = aa + aii;
795: vi = aj + aii;
/* Number of entries stored before the diagonal position of the node's first row */
796: nz = ad[row]- aii;
797:
798: switch (nsz){ /* Each loop in 'case' is unrolled */
799: case 1 :
800: sum1 = b[*r++];
801: /* while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
/* Two entries per iteration; an odd leftover is handled after the loop */
802: for(j=0; j<nz-1; j+=2){
803: i0 = vi[0];
804: i1 = vi[1];
805: vi +=2;
806: tmp0 = tmps[i0];
807: tmp1 = tmps[i1];
808: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
809: }
810: if(j == nz-1){
811: tmp0 = tmps[*vi++];
812: sum1 -= *v1++ *tmp0;
813: }
814: tmp[row ++]=sum1;
815: break;
816: case 2:
817: sum1 = b[*r++];
818: sum2 = b[*r++];
819: v2 = aa + ai[row+1];
821: for(j=0; j<nz-1; j+=2){
822: i0 = vi[0];
823: i1 = vi[1];
824: vi +=2;
825: tmp0 = tmps[i0];
826: tmp1 = tmps[i1];
827: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
828: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
829: }
830: if(j == nz-1){
831: tmp0 = tmps[*vi++];
832: sum1 -= *v1++ *tmp0;
833: sum2 -= *v2++ *tmp0;
834: }
/* The second row holds one further entry, applied to the first row's result */
835: sum2 -= *v2++ * sum1;
836: tmp[row ++]=sum1;
837: tmp[row ++]=sum2;
838: break;
839: case 3:
840: sum1 = b[*r++];
841: sum2 = b[*r++];
842: sum3 = b[*r++];
843: v2 = aa + ai[row+1];
844: v3 = aa + ai[row+2];
845:
846: for (j=0; j<nz-1; j+=2){
847: i0 = vi[0];
848: i1 = vi[1];
849: vi +=2;
850: tmp0 = tmps[i0];
851: tmp1 = tmps[i1];
852: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
853: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
854: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
855: }
856: if (j == nz-1){
857: tmp0 = tmps[*vi++];
858: sum1 -= *v1++ *tmp0;
859: sum2 -= *v2++ *tmp0;
860: sum3 -= *v3++ *tmp0;
861: }
/* Eliminate within the node: each later row against the rows before it */
862: sum2 -= *v2++ * sum1;
863: sum3 -= *v3++ * sum1;
864: sum3 -= *v3++ * sum2;
865: tmp[row ++]=sum1;
866: tmp[row ++]=sum2;
867: tmp[row ++]=sum3;
868: break;
869:
870: case 4:
871: sum1 = b[*r++];
872: sum2 = b[*r++];
873: sum3 = b[*r++];
874: sum4 = b[*r++];
875: v2 = aa + ai[row+1];
876: v3 = aa + ai[row+2];
877: v4 = aa + ai[row+3];
878:
879: for (j=0; j<nz-1; j+=2){
880: i0 = vi[0];
881: i1 = vi[1];
882: vi +=2;
883: tmp0 = tmps[i0];
884: tmp1 = tmps[i1];
885: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
886: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
887: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
888: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
889: }
890: if (j == nz-1){
891: tmp0 = tmps[*vi++];
892: sum1 -= *v1++ *tmp0;
893: sum2 -= *v2++ *tmp0;
894: sum3 -= *v3++ *tmp0;
895: sum4 -= *v4++ *tmp0;
896: }
897: sum2 -= *v2++ * sum1;
898: sum3 -= *v3++ * sum1;
899: sum4 -= *v4++ * sum1;
900: sum3 -= *v3++ * sum2;
901: sum4 -= *v4++ * sum2;
902: sum4 -= *v4++ * sum3;
903:
904: tmp[row ++]=sum1;
905: tmp[row ++]=sum2;
906: tmp[row ++]=sum3;
907: tmp[row ++]=sum4;
908: break;
909: case 5:
910: sum1 = b[*r++];
911: sum2 = b[*r++];
912: sum3 = b[*r++];
913: sum4 = b[*r++];
914: sum5 = b[*r++];
915: v2 = aa + ai[row+1];
916: v3 = aa + ai[row+2];
917: v4 = aa + ai[row+3];
918: v5 = aa + ai[row+4];
919:
920: for (j=0; j<nz-1; j+=2){
921: i0 = vi[0];
922: i1 = vi[1];
923: vi +=2;
924: tmp0 = tmps[i0];
925: tmp1 = tmps[i1];
926: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
927: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
928: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
929: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
930: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
931: }
932: if (j == nz-1){
933: tmp0 = tmps[*vi++];
934: sum1 -= *v1++ *tmp0;
935: sum2 -= *v2++ *tmp0;
936: sum3 -= *v3++ *tmp0;
937: sum4 -= *v4++ *tmp0;
938: sum5 -= *v5++ *tmp0;
939: }
941: sum2 -= *v2++ * sum1;
942: sum3 -= *v3++ * sum1;
943: sum4 -= *v4++ * sum1;
944: sum5 -= *v5++ * sum1;
945: sum3 -= *v3++ * sum2;
946: sum4 -= *v4++ * sum2;
947: sum5 -= *v5++ * sum2;
948: sum4 -= *v4++ * sum3;
949: sum5 -= *v5++ * sum3;
950: sum5 -= *v5++ * sum4;
951:
952: tmp[row ++]=sum1;
953: tmp[row ++]=sum2;
954: tmp[row ++]=sum3;
955: tmp[row ++]=sum4;
956: tmp[row ++]=sum5;
957: break;
958: default:
/* Returns here without restoring the index sets and vector arrays obtained above */
959: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
960: }
961: }
962: /* backward solve the upper triangular */
963: for (i=node_max -1 ,row = n-1 ; i>=0; i--){
964: nsz = ns[i];
/* aii is the position of the last stored entry of the node's last row */
965: aii = ai[row+1] -1;
966: v1 = aa + aii;
967: vi = aj + aii;
/* Number of entries stored after the diagonal position of that row */
968: nz = aii- ad[row];
969: switch (nsz){ /* Each loop in 'case' is unrolled */
970: case 1 :
971: sum1 = tmp[row];
/* Walk the row from its end toward the diagonal, two entries per iteration */
973: for(j=nz ; j>1; j-=2){
974: vi -=2;
975: i0 = vi[2];
976: i1 = vi[1];
977: tmp0 = tmps[i0];
978: tmp1 = tmps[i1];
979: v1 -= 2;
980: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
981: }
982: if (j==1){
983: tmp0 = tmps[*vi--];
984: sum1 -= *v1-- * tmp0;
985: }
986: x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
987: break;
988: case 2 :
989: sum1 = tmp[row];
990: sum2 = tmp[row -1];
/* v2 points at the last stored entry of the row just above */
991: v2 = aa + ai[row]-1;
992: for (j=nz ; j>1; j-=2){
993: vi -=2;
994: i0 = vi[2];
995: i1 = vi[1];
996: tmp0 = tmps[i0];
997: tmp1 = tmps[i1];
998: v1 -= 2;
999: v2 -= 2;
1000: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1001: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1002: }
1003: if (j==1){
1004: tmp0 = tmps[*vi--];
1005: sum1 -= *v1-- * tmp0;
1006: sum2 -= *v2-- * tmp0;
1007: }
1008:
/* Finish the lower row first, then eliminate its value from the row above */
1009: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1010: sum2 -= *v2-- * tmp0;
1011: x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1012: break;
1013: case 3 :
1014: sum1 = tmp[row];
1015: sum2 = tmp[row -1];
1016: sum3 = tmp[row -2];
1017: v2 = aa + ai[row]-1;
1018: v3 = aa + ai[row -1]-1;
1019: for (j=nz ; j>1; j-=2){
1020: vi -=2;
1021: i0 = vi[2];
1022: i1 = vi[1];
1023: tmp0 = tmps[i0];
1024: tmp1 = tmps[i1];
1025: v1 -= 2;
1026: v2 -= 2;
1027: v3 -= 2;
1028: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1029: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1030: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1031: }
1032: if (j==1){
1033: tmp0 = tmps[*vi--];
1034: sum1 -= *v1-- * tmp0;
1035: sum2 -= *v2-- * tmp0;
1036: sum3 -= *v3-- * tmp0;
1037: }
1038: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1039: sum2 -= *v2-- * tmp0;
1040: sum3 -= *v3-- * tmp0;
1041: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1042: sum3 -= *v3-- * tmp0;
1043: x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1044:
1045: break;
1046: case 4 :
1047: sum1 = tmp[row];
1048: sum2 = tmp[row -1];
1049: sum3 = tmp[row -2];
1050: sum4 = tmp[row -3];
1051: v2 = aa + ai[row]-1;
1052: v3 = aa + ai[row -1]-1;
1053: v4 = aa + ai[row -2]-1;
1055: for (j=nz ; j>1; j-=2){
1056: vi -=2;
1057: i0 = vi[2];
1058: i1 = vi[1];
1059: tmp0 = tmps[i0];
1060: tmp1 = tmps[i1];
1061: v1 -= 2;
1062: v2 -= 2;
1063: v3 -= 2;
1064: v4 -= 2;
1065: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1066: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1067: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1068: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1069: }
1070: if (j==1){
1071: tmp0 = tmps[*vi--];
1072: sum1 -= *v1-- * tmp0;
1073: sum2 -= *v2-- * tmp0;
1074: sum3 -= *v3-- * tmp0;
1075: sum4 -= *v4-- * tmp0;
1076: }
1078: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1079: sum2 -= *v2-- * tmp0;
1080: sum3 -= *v3-- * tmp0;
1081: sum4 -= *v4-- * tmp0;
1082: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1083: sum3 -= *v3-- * tmp0;
1084: sum4 -= *v4-- * tmp0;
1085: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1086: sum4 -= *v4-- * tmp0;
1087: x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1088: break;
1089: case 5 :
1090: sum1 = tmp[row];
1091: sum2 = tmp[row -1];
1092: sum3 = tmp[row -2];
1093: sum4 = tmp[row -3];
1094: sum5 = tmp[row -4];
1095: v2 = aa + ai[row]-1;
1096: v3 = aa + ai[row -1]-1;
1097: v4 = aa + ai[row -2]-1;
1098: v5 = aa + ai[row -3]-1;
1099: for (j=nz ; j>1; j-=2){
1100: vi -= 2;
1101: i0 = vi[2];
1102: i1 = vi[1];
1103: tmp0 = tmps[i0];
1104: tmp1 = tmps[i1];
1105: v1 -= 2;
1106: v2 -= 2;
1107: v3 -= 2;
1108: v4 -= 2;
1109: v5 -= 2;
1110: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1111: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1112: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1113: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1114: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1115: }
1116: if (j==1){
1117: tmp0 = tmps[*vi--];
1118: sum1 -= *v1-- * tmp0;
1119: sum2 -= *v2-- * tmp0;
1120: sum3 -= *v3-- * tmp0;
1121: sum4 -= *v4-- * tmp0;
1122: sum5 -= *v5-- * tmp0;
1123: }
1125: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1126: sum2 -= *v2-- * tmp0;
1127: sum3 -= *v3-- * tmp0;
1128: sum4 -= *v4-- * tmp0;
1129: sum5 -= *v5-- * tmp0;
1130: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1131: sum3 -= *v3-- * tmp0;
1132: sum4 -= *v4-- * tmp0;
1133: sum5 -= *v5-- * tmp0;
1134: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1135: sum4 -= *v4-- * tmp0;
1136: sum5 -= *v5-- * tmp0;
1137: tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1138: sum5 -= *v5-- * tmp0;
1139: x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1140: break;
1141: default:
/* Returns here without restoring the index sets and vector arrays obtained above */
1142: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1143: }
1144: }
1145: ISRestoreIndices(isrow,&rout);
1146: ISRestoreIndices(iscol,&cout);
1147: VecRestoreArray(bb,&b);
1148: VecRestoreArray(xx,&x);
1149: PetscLogFlops(2*a->nz - A->cmap.n);
1150: return(0);
1151: }
1155: PetscErrorCode MatLUFactorNumeric_Inode(Mat A,MatFactorInfo *info,Mat *B)
1156: {
1157: Mat C = *B;
1158: Mat_inode *a = (Mat_inode*)A->data,*b = (Mat_inode*)C->data;
1159: IS iscol = b->col,isrow = b->row,isicol = b->icol;
1161: PetscInt *r,*ic,*c,n = A->rmap.n,*bi = b->i;
1162: PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,row,prow;
1163: PetscInt *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1164: PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1165: PetscScalar *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3;
1166: PetscScalar tmp,*ba = b->a,*aa = a->a,*pv,*rtmps1,*rtmps2,*rtmps3;
1167: PetscReal rs=0.0,rsum[3];
1168: LUShift_Ctx sctx;
1169: PetscInt newshift;
1173: sctx.shift_top = 0;
1174: sctx.nshift_max = 0;
1175: sctx.shift_lo = 0;
1176: sctx.shift_hi = 0;
1178: /* if both shift schemes are chosen by user, only use info->shiftpd */
1179: if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0;
1180: if (info->shiftpd) { /* set sctx.shift_top=max{rs} */
1181: sctx.shift_top = 0;
1182: for (i=0; i<n; i++) {
1183: /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1184: rs = 0.0;
1185: ajtmp = aj + ai[i];
1186: rtmp1 = aa + ai[i];
1187: nz = ai[i+1] - ai[i];
1188: for (j=0; j<nz; j++){
1189: if (*ajtmp != i){
1190: rs += PetscAbsScalar(*rtmp1++);
1191: } else {
1192: rs -= PetscRealPart(*rtmp1++);
1193: }
1194: ajtmp++;
1195: }
1196: if (rs>sctx.shift_top) sctx.shift_top = rs;
1197: }
1198: if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1199: sctx.shift_top *= 1.1;
1200: sctx.nshift_max = 5;
1201: sctx.shift_lo = 0.;
1202: sctx.shift_hi = 1.;
1203: }
1204: sctx.shift_amount = 0;
1205: sctx.nshift = 0;
1207: ISGetIndices(isrow,&r);
1208: ISGetIndices(iscol,&c);
1209: ISGetIndices(isicol,&ic);
1210: PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);
1211: PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));
1212: ics = ic ; rtmps1 = rtmp1 ;
1213: rtmp2 = rtmp1 + n; rtmps2 = rtmp2 ;
1214: rtmp3 = rtmp2 + n; rtmps3 = rtmp3 ;
1215:
1216: node_max = a->inode.node_count;
1217: ns = a->inode.size ;
1218: if (!ns){
1219: SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1220: }
1222: /* If max inode size > 3, split it into two inodes.*/
1223: /* also map the inode sizes according to the ordering */
1224: PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);
1225: for (i=0,j=0; i<node_max; ++i,++j){
1226: if (ns[i]>3) {
1227: tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */
1228: ++j;
1229: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1230: } else {
1231: tmp_vec1[j] = ns[i];
1232: }
1233: }
1234: /* Use the correct node_max */
1235: node_max = j;
1237: /* Now reorder the inode info based on mat re-ordering info */
1238: /* First create a row -> inode_size_array_index map */
1239: PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);
1240: PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);
1241: for (i=0,row=0; i<node_max; i++) {
1242: nodesz = tmp_vec1[i];
1243: for (j=0; j<nodesz; j++,row++) {
1244: nsmap[row] = i;
1245: }
1246: }
1247: /* Using nsmap, create a reordered ns structure */
1248: for (i=0,j=0; i< node_max; i++) {
1249: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1250: tmp_vec2[i] = nodesz;
1251: j += nodesz;
1252: }
1253: PetscFree(nsmap);
1254: PetscFree(tmp_vec1);
1255: /* Now use the correct ns */
1256: ns = tmp_vec2;
1258: do {
1259: sctx.lushift = PETSC_FALSE;
1260: /* Now loop over each block-row, and do the factorization */
1261: for (i=0,row=0; i<node_max; i++) {
1262: nodesz = ns[i];
1263: nz = bi[row+1] - bi[row];
1264: bjtmp = bj + bi[row];
1266: switch (nodesz){
1267: case 1:
1268: for (j=0; j<nz; j++){
1269: idx = bjtmp[j];
1270: rtmps1[idx] = 0.0;
1271: }
1272:
1273: /* load in initial (unfactored row) */
1274: idx = r[row];
1275: nz = ai[idx+1] - ai[idx];
1276: ajtmp = aj + ai[idx];
1277: v1 = aa + ai[idx];
1279: for (j=0; j<nz; j++) {
1280: idx = ics[ajtmp[j]];
1281: rtmp1[idx] = v1[j];
1282: if (sctx.nshift && ajtmp[j] == r[row]) {
1283: rtmp1[idx] += sctx.shift_amount;
1284: }
1285: }
1286: prow = *bjtmp++ ;
1287: while (prow < row) {
1288: pc1 = rtmp1 + prow;
1289: if (*pc1 != 0.0){
1290: pv = ba + bd[prow];
1291: pj = nbj + bd[prow];
1292: mul1 = *pc1 * *pv++;
1293: *pc1 = mul1;
1294: nz = bi[prow+1] - bd[prow] - 1;
1295: PetscLogFlops(2*nz);
1296: for (j=0; j<nz; j++) {
1297: tmp = pv[j];
1298: idx = pj[j];
1299: rtmps1[idx] -= mul1 * tmp;
1300: }
1301: }
1302: prow = *bjtmp++ ;
1303: }
1304: nz = bi[row+1] - bi[row];
1305: pj = bj + bi[row];
1306: pc1 = ba + bi[row];
1308: sctx.pv = rtmp1[row];
1309: rs = 0.0;
1310: rtmp1[row] = 1.0/rtmp1[row];
1311: for (j=0; j<nz; j++) {
1312: idx = pj[j];
1313: pc1[j] = rtmps1[idx];
1314: if (idx != row) rs += PetscAbsScalar(pc1[j]);
1315: }
1317: sctx.rs = rs;
1318: MatLUCheckShift_inline(info,sctx,newshift);
1319: if (newshift == 1){
1320: goto endofwhile;
1321: } else if (newshift == -1){
1322: SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %G tolerance %G * rs %G, inode.size %D",row,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1323: }
1324: break;
1325:
1326: case 2:
1327: for (j=0; j<nz; j++) {
1328: idx = bjtmp[j];
1329: rtmps1[idx] = 0.0;
1330: rtmps2[idx] = 0.0;
1331: }
1332:
1333: /* load in initial (unfactored row) */
1334: idx = r[row];
1335: nz = ai[idx+1] - ai[idx];
1336: ajtmp = aj + ai[idx];
1337: v1 = aa + ai[idx];
1338: v2 = aa + ai[idx+1];
1339:
1340: for (j=0; j<nz; j++) {
1341: idx = ics[ajtmp[j]];
1342: rtmp1[idx] = v1[j];
1343: rtmp2[idx] = v2[j];
1344: if (sctx.nshift && ajtmp[j] == r[row]) {
1345: rtmp1[idx] += sctx.shift_amount;
1346: }
1347: if (sctx.nshift && ajtmp[j] == r[row+1]) {
1348: rtmp2[idx] += sctx.shift_amount;
1349: }
1350: }
1351: prow = *bjtmp++ ;
1352: while (prow < row) {
1353: pc1 = rtmp1 + prow;
1354: pc2 = rtmp2 + prow;
1355: if (*pc1 != 0.0 || *pc2 != 0.0){
1356: pv = ba + bd[prow];
1357: pj = nbj + bd[prow];
1358: mul1 = *pc1 * *pv;
1359: mul2 = *pc2 * *pv;
1360: ++pv;
1361: *pc1 = mul1;
1362: *pc2 = mul2;
1363:
1364: nz = bi[prow+1] - bd[prow] - 1;
1365: PetscLogFlops(2*2*nz);
1366: for (j=0; j<nz; j++) {
1367: tmp = pv[j];
1368: idx = pj[j];
1369: rtmps1[idx] -= mul1 * tmp;
1370: rtmps2[idx] -= mul2 * tmp;
1371: }
1372: }
1373: prow = *bjtmp++ ;
1374: }
1375: /* Now take care of the odd element*/
1376: pc1 = rtmp1 + prow;
1377: pc2 = rtmp2 + prow;
1378: if (*pc2 != 0.0){
1379: pj = nbj + bd[prow];
1381: rs = 0.0;
1382: mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1383: *pc2 = mul2;
1384: nz = bi[prow+1] - bd[prow] - 1;
1385: PetscLogFlops(2*nz);
1386: for (j=0; j<nz; j++) {
1387: idx = pj[j] ;
1388: tmp = rtmp1[idx];
1389: rtmp2[idx] -= mul2 * tmp;
1390: if (idx != prow) rs += PetscAbsScalar(rtmp2[idx]);
1391: }
1392:
1393: sctx.rs = rs;
1394: sctx.pv = *pc1;
1395: MatLUCheckShift_inline(info,sctx,newshift);
1396: if (newshift == 1){
1397: goto endofwhile; /* sctx.shift_amount is updated */
1398: } else if (newshift == -1){
1399: SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %G tolerance %G * rs %G inode.size %D",prow,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1400: }
1401: }
1402:
1403: nz = bi[row+1] - bi[row];
1404: pj = bj + bi[row];
1405: pc1 = ba + bi[row];
1406: pc2 = ba + bi[row+1];
1408: rsum[0] = rsum[1] = 0.0;
1409: rtmp1[row] = 1.0/rtmp1[row];
1410: rtmp2[row+1] = 1.0/rtmp2[row+1];
1411: for (j=0; j<nz; j++) {
1412: idx = pj[j];
1413: pc1[j] = rtmps1[idx];
1414: pc2[j] = rtmps2[idx];
1415: if (idx != row) rsum[0] += PetscAbsScalar(pc1[j]);
1416: if (idx != row+1) rsum[1] += PetscAbsScalar(pc2[j]);
1417: }
1419: sctx.pv = 1.0/rtmp1[row]; /* rtmp1[row] = 1.0/diag[row] */
1420: sctx.rs = rsum[0];
1421: MatLUCheckShift_inline(info,sctx,newshift);
1422: if (newshift == 1){
1423: goto endofwhile;
1424: } else if (newshift == -1){
1425: SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %G tolerance %G * rs %G inode.size %D",row,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1426: }
1427: sctx.pv = 1.0/rtmp2[row+1];
1428: sctx.rs = rsum[1];
1429: MatLUCheckShift_inline(info,sctx,newshift);
1430: if (newshift == 1){
1431: goto endofwhile;
1432: } else if (newshift == -1){
1433: SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %G tolerance %G * rs %G inode.size %D",row+1,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1434: }
1435: break;
1437: case 3:
1438: for (j=0; j<nz; j++) {
1439: idx = bjtmp[j];
1440: rtmps1[idx] = 0.0;
1441: rtmps2[idx] = 0.0;
1442: rtmps3[idx] = 0.0;
1443: }
1444: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1445: idx = r[row];
1446: nz = ai[idx+1] - ai[idx];
1447: ajtmp = aj + ai[idx];
1448: v1 = aa + ai[idx];
1449: v2 = aa + ai[idx+1];
1450: v3 = aa + ai[idx+2];
1451: for (j=0; j<nz; j++) {
1452: idx = ics[ajtmp[j]];
1453: rtmp1[idx] = v1[j];
1454: rtmp2[idx] = v2[j];
1455: rtmp3[idx] = v3[j];
1456: if (sctx.nshift && ajtmp[j] == r[row]) {
1457: rtmp1[idx] += sctx.shift_amount;
1458: }
1459: if (sctx.nshift && ajtmp[j] == r[row+1]) {
1460: rtmp2[idx] += sctx.shift_amount;
1461: }
1462: if (sctx.nshift && ajtmp[j] == r[row+2]) {
1463: rtmp3[idx] += sctx.shift_amount;
1464: }
1465: }
1466: /* loop over all pivot row blocks above this row block */
1467: prow = *bjtmp++ ;
1468: while (prow < row) {
1469: pc1 = rtmp1 + prow;
1470: pc2 = rtmp2 + prow;
1471: pc3 = rtmp3 + prow;
1472: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1473: pv = ba + bd[prow];
1474: pj = nbj + bd[prow];
1475: mul1 = *pc1 * *pv;
1476: mul2 = *pc2 * *pv;
1477: mul3 = *pc3 * *pv;
1478: ++pv;
1479: *pc1 = mul1;
1480: *pc2 = mul2;
1481: *pc3 = mul3;
1482:
1483: nz = bi[prow+1] - bd[prow] - 1;
1484: PetscLogFlops(3*2*nz);
1485: /* update this row based on pivot row */
1486: for (j=0; j<nz; j++) {
1487: tmp = pv[j];
1488: idx = pj[j];
1489: rtmps1[idx] -= mul1 * tmp;
1490: rtmps2[idx] -= mul2 * tmp;
1491: rtmps3[idx] -= mul3 * tmp;
1492: }
1493: }
1494: prow = *bjtmp++ ;
1495: }
1496: /* Now take care of diagonal block in this set of rows */
1497: pc1 = rtmp1 + prow;
1498: pc2 = rtmp2 + prow;
1499: pc3 = rtmp3 + prow;
1500: if (*pc2 != 0.0 || *pc3 != 0.0){
1501: pj = nbj + bd[prow];
1503: sctx.rs = 1.0; /* for simplicity, set rs=1.0 */
1504: sctx.pv = *pc1;
1505: MatLUCheckShift_inline(info,sctx,newshift);
1506: if (newshift == 1){
1507: goto endofwhile;
1508: } else if (newshift == -1){
1509: SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %G tolerance %G * rs %G inode.size %D",prow,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1510: }
1512: mul2 = (*pc2)/(*pc1);
1513: mul3 = (*pc3)/(*pc1);
1514: *pc2 = mul2;
1515: *pc3 = mul3;
1516: nz = bi[prow+1] - bd[prow] - 1;
1517: PetscLogFlops(2*2*nz);
1518: for (j=0; j<nz; j++) {
1519: idx = pj[j] ;
1520: tmp = rtmp1[idx];
1521: rtmp2[idx] -= mul2 * tmp;
1522: rtmp3[idx] -= mul3 * tmp;
1523: }
1524: }
1525: ++prow;
1526: pc2 = rtmp2 + prow;
1527: pc3 = rtmp3 + prow;
1528: if (*pc3 != 0.0){
1529: pj = nbj + bd[prow];
1530: pj = nbj + bd[prow];
1532: sctx.rs = 1.0; /* for simplicity, set rs=1.0 */
1533: sctx.pv = *pc2;
1534: MatLUCheckShift_inline(info,sctx,newshift);
1535: if (newshift == 1){
1536: goto endofwhile;
1537: } else if (newshift == -1){
1538: SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %G tolerance %G * rs %G inode.size %D",prow,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1539: }
1540: mul3 = (*pc3)/(*pc2);
1541: *pc3 = mul3;
1542: nz = bi[prow+1] - bd[prow] - 1;
1543: PetscLogFlops(2*2*nz);
1544: for (j=0; j<nz; j++) {
1545: idx = pj[j] ;
1546: tmp = rtmp2[idx];
1547: rtmp3[idx] -= mul3 * tmp;
1548: }
1549: }
1550: nz = bi[row+1] - bi[row];
1551: pj = bj + bi[row];
1552: pc1 = ba + bi[row];
1553: pc2 = ba + bi[row+1];
1554: pc3 = ba + bi[row+2];
1556: rsum[0] = rsum[1] = rsum[2] = 0.0;
1557: rtmp1[row] = 1.0/rtmp1[row];
1558: rtmp2[row+1] = 1.0/rtmp2[row+1];
1559: rtmp3[row+2] = 1.0/rtmp3[row+2];
1560: /* copy row entries from dense representation to sparse */
1561: for (j=0; j<nz; j++) {
1562: idx = pj[j];
1563: pc1[j] = rtmps1[idx];
1564: pc2[j] = rtmps2[idx];
1565: pc3[j] = rtmps3[idx];
1566: if (idx != row) rsum[0] += PetscAbsScalar(pc1[j]);
1567: if (idx != row+1) rsum[1] += PetscAbsScalar(pc2[j]);
1568: if (idx != row+2) rsum[2] += PetscAbsScalar(pc3[j]);
1569: }
1571: /* sctx.rs = rs/3.0; */
1572: sctx.pv = 1.0/rtmp1[row];
1573: sctx.rs = rsum[0];
1574: MatLUCheckShift_inline(info,sctx,newshift);
1575: if (newshift == 1){
1576: goto endofwhile;
1577: } else if (newshift == -1){
1578: SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %G tolerance %G * rs %G inode.size %D",row,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1579: }
1580: sctx.pv = 1.0/rtmp2[row+1];
1581: sctx.rs = rsum[1];
1582: MatLUCheckShift_inline(info,sctx,newshift);
1583: if (newshift == 1){
1584: goto endofwhile;
1585: } else if (newshift == -1){
1586: SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %G tolerance %G * rs %G inode.size %D, 2nd row",row+1,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1587: }
1588: sctx.pv = 1.0/rtmp3[row+2];
1589: sctx.rs = rsum[2];
1590: MatLUCheckShift_inline(info,sctx,newshift);
1591: if (newshift == 1){
1592: goto endofwhile;
1593: } else if (newshift == -1){
1594: SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %G tolerance %G * rs %G inode.size %D, 3rd row",row+2,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1595: }
1596: break;
1597: default:
1598: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1599: }
1600: row += nodesz; /* Update the row */
1601: }
1602: endofwhile:;
1603: } while (sctx.lushift);
1604: PetscFree(rtmp1);
1605: PetscFree(tmp_vec2);
1606: ISRestoreIndices(isicol,&ic);
1607: ISRestoreIndices(isrow,&r);
1608: ISRestoreIndices(iscol,&c);
1609: C->factor = FACTOR_LU;
1610: C->assembled = PETSC_TRUE;
1611: if (sctx.nshift) {
1612: if (info->shiftnz) {
1613: PetscInfo2(0,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
1614: } else if (info->shiftpd) {
1615: PetscInfo4(0,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top);
1616: }
1617: }
1618: PetscLogFlops(C->cmap.n);
1619: return(0);
1620: }
1622: /*
1623: Makes a longer coloring[] array and calls the usual code with that
1624: */
1627: PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1628: {
1629: Mat_inode *a = (Mat_inode*)mat->data;
1630: PetscErrorCode ierr;
1631: PetscInt n = mat->cmap.n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1632: PetscInt *colorused,i;
1633: ISColoringValue *newcolor;
1636: PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);
1637: /* loop over inodes, marking a color for each column*/
1638: row = 0;
1639: for (i=0; i<m; i++){
1640: for (j=0; j<ns[i]; j++) {
1641: newcolor[row++] = coloring[i] + j*ncolors;
1642: }
1643: }
1645: /* eliminate unneeded colors */
1646: PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);
1647: PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));
1648: for (i=0; i<n; i++) {
1649: colorused[newcolor[i]] = 1;
1650: }
1652: for (i=1; i<5*ncolors; i++) {
1653: colorused[i] += colorused[i-1];
1654: }
1655: ncolors = colorused[5*ncolors-1];
1656: for (i=0; i<n; i++) {
1657: newcolor[i] = colorused[newcolor[i]];
1658: }
1659: PetscFree(colorused);
1660: ISColoringCreate(mat->comm,ncolors,n,newcolor,iscoloring);
1661: PetscFree(coloring);
1662: return(0);
1663: }
1665: /*
1666: samestructure indicates that the matrix has not changed its nonzero structure so we
1667: do not need to recompute the inodes
1668: */
1671: PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
1672: {
1673: Mat_inode *a = (Mat_inode*)A->data;
1675: PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
1676: PetscTruth flag,flg;
1679: if (a->inode.checked && samestructure) return(0);
1681: a->inode.checked = PETSC_TRUE;
1683: /* Notes: We set a->inode.limit=5 in MatCreate_Inode(). */
1684: if (!a->inode.use) {PetscInfo(A,"Not using Inode routines due to MatSetOption(MAT_DO_NOT_USE_INODES\n"); return(0);}
1685: PetscOptionsHasName(A->prefix,"-mat_no_inode",&flg);
1686: if (flg) {PetscInfo(A,"Not using Inode routines due to -mat_no_inode\n");return(0);}
1687: PetscOptionsHasName(A->prefix,"-mat_no_unroll",&flg);
1688: if (flg) {PetscInfo(A,"Not using Inode routines due to -mat_no_unroll\n");return(0);}
1689: PetscOptionsGetInt(A->prefix,"-mat_inode_limit",&a->inode.limit,PETSC_NULL);
1690: if (a->inode.limit > a->inode.max_limit) a->inode.limit = a->inode.max_limit;
1691: m = A->rmap.n;
1692: if (a->inode.size) {ns = a->inode.size;}
1693: else {PetscMalloc((m+1)*sizeof(PetscInt),&ns);}
1695: i = 0;
1696: node_count = 0;
1697: idx = a->j;
1698: ii = a->i;
1699: while (i < m){ /* For each row */
1700: nzx = ii[i+1] - ii[i]; /* Number of nonzeros */
1701: /* Limits the number of elements in a node to 'a->inode.limit' */
1702: for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
1703: nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */
1704: if (nzy != nzx) break;
1705: idy += nzx; /* Same nonzero pattern */
1706: PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
1707: if (!flag) break;
1708: }
1709: ns[node_count++] = blk_size;
1710: idx += blk_size*nzx;
1711: i = j;
1712: }
1713: /* If not enough inodes found,, do not use inode version of the routines */
1714: if (!a->inode.size && m && node_count > 0.9*m) {
1715: PetscFree(ns);
1716: a->inode.node_count = 0;
1717: a->inode.size = PETSC_NULL;
1718: a->inode.use = PETSC_FALSE;
1719: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
1720: } else {
1721: A->ops->mult = MatMult_Inode;
1722: A->ops->multadd = MatMultAdd_Inode;
1723: A->ops->solve = MatSolve_Inode;
1724: A->ops->lufactornumeric = MatLUFactorNumeric_Inode;
1725: A->ops->getrowij = MatGetRowIJ_Inode;
1726: A->ops->restorerowij = MatRestoreRowIJ_Inode;
1727: A->ops->getcolumnij = MatGetColumnIJ_Inode;
1728: A->ops->restorecolumnij = MatRestoreColumnIJ_Inode;
1729: A->ops->coloringpatch = MatColoringPatch_Inode;
1730: a->inode.node_count = node_count;
1731: a->inode.size = ns;
1732: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
1733: }
1734: return(0);
1735: }
1737: /*
1738: This is really ugly. if inodes are used this replaces the
1739: permutations with ones that correspond to rows/cols of the matrix
1740: rather then inode blocks
1741: */
1744: PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
1745: {
1746: PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
1749: PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);
1750: if (f) {
1751: (*f)(A,rperm,cperm);
1752: }
1753: return(0);
1754: }
1759: PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm)
1760: {
1761: Mat_inode *a=(Mat_inode*)A->data;
1763: PetscInt m = A->rmap.n,n = A->cmap.n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
1764: PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx;
1765: PetscInt nslim_col,*ns_col;
1766: IS ris = *rperm,cis = *cperm;
1769: if (!a->inode.size) return(0); /* no inodes so return */
1770: if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */
1772: Mat_CreateColInode(A,&nslim_col,&ns_col);
1773: PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);
1774: PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);
1775: permc = permr + m;
1777: ISGetIndices(ris,&ridx);
1778: ISGetIndices(cis,&cidx);
1780: /* Form the inode structure for the rows of permuted matric using inv perm*/
1781: for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
1783: /* Construct the permutations for rows*/
1784: for (i=0,row = 0; i<nslim_row; ++i){
1785: indx = ridx[i];
1786: start_val = tns[indx];
1787: end_val = tns[indx + 1];
1788: for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
1789: }
1791: /* Form the inode structure for the columns of permuted matrix using inv perm*/
1792: for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
1794: /* Construct permutations for columns */
1795: for (i=0,col=0; i<nslim_col; ++i){
1796: indx = cidx[i];
1797: start_val = tns[indx];
1798: end_val = tns[indx + 1];
1799: for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
1800: }
1802: ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);
1803: ISSetPermutation(*rperm);
1804: ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);
1805: ISSetPermutation(*cperm);
1806:
1807: ISRestoreIndices(ris,&ridx);
1808: ISRestoreIndices(cis,&cidx);
1810: PetscFree(ns_col);
1811: PetscFree(permr);
1812: ISDestroy(cis);
1813: ISDestroy(ris);
1814: PetscFree(tns);
1815: return(0);
1816: }
1821: /*@C
1822: MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
1824: Collective on Mat
1826: Input Parameter:
1827: . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
1829: Output Parameter:
1830: + node_count - no of inodes present in the matrix.
1831: . sizes - an array of size node_count,with sizes of each inode.
1832: - limit - the max size used to generate the inodes.
1834: Level: advanced
1836: Notes: This routine returns some internal storage information
1837: of the matrix, it is intended to be used by advanced users.
1838: It should be called after the matrix is assembled.
1839: The contents of the sizes[] array should not be changed.
1840: PETSC_NULL may be passed for information not requested.
1842: .keywords: matrix, seqaij, get, inode
1844: .seealso: MatGetInfo()
1845: @*/
1846: PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
1847: {
1848: PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
1851: if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1852: PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);
1853: if (f) {
1854: (*f)(A,node_count,sizes,limit);
1855: }
1856: return(0);
1857: }
1862: PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
1863: {
1864: Mat_inode *a = (Mat_inode*)A->data;
1867: if (node_count) *node_count = a->inode.node_count;
1868: if (sizes) *sizes = a->inode.size;
1869: if (limit) *limit = a->inode.limit;
1870: return(0);
1871: }