Actual source code: inode.c

  1: #define PETSCMAT_DLL

  3: /*
  4:   This file provides high performance routines for the Inode format (compressed sparse row)
  5:   by taking advantage of rows with identical nonzero structure (I-nodes).
  6: */
 7:  #include src/mat/impls/csr/inode/inode.h

 11: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
 12: {
 13:   Mat_inode      *a = (Mat_inode*)A->data;
 15:   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;

 18:   n      = A->cmap.n;
 19:   m      = A->rmap.n;
 20:   ns_row = a->inode.size;
 21: 
 22:   min_mn = (m < n) ? m : n;
 23:   if (!ns) {
 24:     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
 25:     for(; count+1 < n; count++,i++);
 26:     if (count < n)  {
 27:       i++;
 28:     }
 29:     *size = i;
 30:     return(0);
 31:   }
 32:   PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);
 33: 
 34:   /* Use the same row structure wherever feasible. */
 35:   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
 36:     ns_col[i] = ns_row[i];
 37:   }

 39:   /* if m < n; pad up the remainder with inode_limit */
 40:   for(; count+1 < n; count++,i++) {
 41:     ns_col[i] = 1;
 42:   }
 43:   /* The last node is the odd ball. padd it up with the remaining rows; */
 44:   if (count < n)  {
 45:     ns_col[i] = n - count;
 46:     i++;
 47:   } else if (count > n) {
 48:     /* Adjust for the over estimation */
 49:     ns_col[i-1] += n - count;
 50:   }
 51:   *size = i;
 52:   *ns   = ns_col;
 53:   return(0);
 54: }


 57: /*
 58:       This builds the symmetric version of the nonzero structure.
 59: */
 62: static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
 63: {
       /*
          Builds the node-level (inode-compressed) adjacency structure of a square
          matrix in symmetric form: every node pair found below the diagonal is
          recorded for both nodes, and a node's own (diagonal) entry is recorded once.

          A      - matrix; A->data is read as a Mat_inode (i, j, inode.size, inode.node_count)
          iia    - output: newly allocated array of nslim_row+1 offsets, starting at oshift
          jja    - output: newly allocated array of node indices, each offset by oshift
          ishift - added to the a->i offsets and to the stored column values when reading
          oshift - index base used for the returned arrays

          Raises PETSC_ERR_SUP through SETERRQ() when the matrix is not square.
          Only the first row of each node is scanned (presumably because all rows of
          a node share one column pattern). The returned arrays are not freed here.
       */
 64:   Mat_inode      *a = (Mat_inode*)A->data;
 66:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
 67:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;

 70:   nslim_row = a->inode.node_count;
 71:   m         = A->rmap.n;
 72:   n         = A->cmap.n;
 73:   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square");
 74: 
 75:   /* Use the row_inode as column_inode */
 76:   nslim_col = nslim_row;
 77:   ns_col    = ns_row;

 79:   /* allocate space for the reformatted inode structure:
          tns[k] = index of the first column of node k (prefix sum of node sizes),
          tvc[c] = node that contains column c */
 80:   PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);
 81:   PetscMalloc((n+1)*sizeof(PetscInt),&tvc);
 82:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];

 84:   for (i1=0,col=0; i1<nslim_col; ++i1){
 85:     nsz = ns_col[i1];
 86:     for (i2=0; i2<nsz; ++i2,++col)
 87:       tvc[col] = i1;
 88:   }
 89:   /* allocate space for row pointers (zeroed so they can be used as counters) */
 90:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
 91:   *iia = ia;
 92:   PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
 93:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);

 95:   /* determine the number of column nodes in each row node; ia[k+1] counts node k */
 96:   ia[0] = oshift;
 97:   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {

 99:     j    = aj + ai[row] + ishift;
100:     jmax = aj + ai[row+1] + ishift;
101:     i2   = 0;
         /* NOTE(review): the first column is read without checking that the row is non-empty */
102:     col  = *j++ + ishift;
103:     i2   = tvc[col];
104:     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
105:       ia[i1+1]++;
106:       ia[i2+1]++;
107:       i2++;                     /* Start col of next node */
           /* skip the remaining columns of the node just counted;
              NOTE(review): *j is read before the j<jmax test, so the entry at jmax is read too */
108:       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109:       i2 = tvc[col];
110:     }
111:     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112:   }

114:   /* shift ia[i] to point to next row; work[k] becomes the 0-based insert position for node k */
115:   for (i1=1; i1<nslim_row+1; i1++) {
116:     row        = ia[i1-1];
117:     ia[i1]    += row;
118:     work[i1-1] = row - oshift;
119:   }

121:   /* allocate space for the column (node) indices */
122:   nz   = ia[nslim_row] + (!ishift);
123:   PetscMalloc(nz*sizeof(PetscInt),&ja);
124:   *jja = ja;

126:  /* loop over lower triangular part putting into ja (same walk as the counting pass) */
127:   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128:     j    = aj + ai[row] + ishift;
129:     jmax = aj + ai[row+1] + ishift;
130:     i2   = 0;                     /* Col inode index */
131:     col  = *j++ + ishift;
132:     i2   = tvc[col];
133:     while (i2<i1 && j<jmax) {
           /* record the pair in both directions */
134:       ja[work[i2]++] = i1 + oshift;
135:       ja[work[i1]++] = i2 + oshift;
136:       ++i2;
137:       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138:       i2 = tvc[col];
139:     }
140:     if (i2 == i1) ja[work[i1]++] = i2 + oshift;

142:   }
143:   PetscFree(work);
144:   PetscFree(tns);
145:   PetscFree(tvc);
146:   return(0);
147: }

149: /*
150:       This builds the nonsymmetric version of the nonzero structure.
151: */
154: static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155: {
       /*
          Builds the node-level (inode-compressed) row structure of a matrix without
          symmetrizing it: for every row node, the list of column nodes in which the
          first row of that node has entries.

          A      - matrix; A->data is read as a Mat_inode (i, j, inode.size, inode.node_count)
          iia    - output: newly allocated array of nslim_row+1 offsets, starting at oshift
          jja    - output: newly allocated array of column-node indices, each offset by oshift
          ishift - added to the a->i offsets and to the stored column values when reading
          oshift - index base used for the returned arrays

          The column nodes come from Mat_CreateColInode(). The returned arrays are
          not freed here.
       */
156:   Mat_inode      *a = (Mat_inode*)A->data;
158:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

162:   nslim_row = a->inode.node_count;
163:   n         = A->cmap.n;

165:   /* Create the column_inode for this matrix */
166:   Mat_CreateColInode(A,&nslim_col,&ns_col);
167: 
168:   /* allocate space for the reformatted column_inode structure:
          tns[k] = index of the first column of column node k,
          tvc[c] = column node that contains column c */
169:   PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);
170:   PetscMalloc((n +1)*sizeof(PetscInt),&tvc);
171:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

173:   for (i1=0,col=0; i1<nslim_col; ++i1){
174:     nsz = ns_col[i1];
175:     for (i2=0; i2<nsz; ++i2,++col)
176:       tvc[col] = i1;
177:   }
178:   /* allocate space for row pointers (zeroed so they can be used as counters) */
179:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
180:   *iia = ia;
181:   PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
182:   PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);

184:   /* determine the number of column nodes in each row node */
185:   ia[0] = oshift;
186:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187:     j   = aj + ai[row] + ishift;
         /* NOTE(review): the first column is read before nz is known to be positive */
188:     col = *j++ + ishift;
189:     i2  = tvc[col];
190:     nz  = ai[row+1] - ai[row];
191:     while (nz-- > 0) {           /* one pass per column node touched by this row */
192:       ia[i1+1]++;
193:       i2++;                     /* Start col of next node */
           /* consume the remaining entries that still fall in the node just counted */
194:       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195:       if (nz > 0) i2 = tvc[col];
196:     }
197:   }

199:   /* shift ia[i] to point to next row; work[k] becomes the 0-based insert position for row node k */
200:   for (i1=1; i1<nslim_row+1; i1++) {
201:     row        = ia[i1-1];
202:     ia[i1]    += row;
203:     work[i1-1] = row - oshift;
204:   }

206:   /* allocate space for the column-node indices */
207:   nz   = ia[nslim_row] + (!ishift);
208:   PetscMalloc(nz*sizeof(PetscInt),&ja);
209:   *jja = ja;

211:  /* loop over matrix putting into ja (same walk as the counting pass) */
212:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213:     j   = aj + ai[row] + ishift;
214:     i2  = 0;                     /* Col inode index */
215:     col = *j++ + ishift;
216:     i2  = tvc[col];
217:     nz  = ai[row+1] - ai[row];
218:     while (nz-- > 0) {
219:       ja[work[i1]++] = i2 + oshift;
220:       ++i2;
221:       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222:       if (nz > 0) i2 = tvc[col];
223:     }
224:   }
225:   PetscFree(ns_col);
226:   PetscFree(work);
227:   PetscFree(tns);
228:   PetscFree(tvc);
229:   return(0);
230: }

234: static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235: {
236:   Mat_inode      *a = (Mat_inode*)A->data;

240:   *n     = a->inode.node_count;
241:   if (!ia) return(0);

243:   if (symmetric) {
244:     MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);
245:   } else {
246:     MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
247:   }
248:   return(0);
249: }

253: static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
254: {

258:   if (!ia) return(0);
259:   PetscFree(*ia);
260:   PetscFree(*ja);
261:   return(0);
262: }

264: /* ----------------------------------------------------------- */

268: static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
269: {
       /*
          Builds the node-level (inode-compressed) column structure of a matrix:
          for every column node, the list of row nodes whose first row has an
          entry in that column node.

          A      - matrix; A->data is read as a Mat_inode (i, j, inode.size, inode.node_count)
          iia    - output: newly allocated array of nslim_col+1 offsets, starting at oshift
          jja    - output: newly allocated array of row-node indices, each offset by oshift
          ishift - added to the a->i offsets and to the stored column values when reading
          oshift - index base used for the returned arrays

          The column nodes come from Mat_CreateColInode(). The returned arrays are
          not freed here.
       */
270:   Mat_inode      *a = (Mat_inode*)A->data;
272:   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
273:   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;

276:   nslim_row = a->inode.node_count;
277:   n         = A->cmap.n;

279:   /* Create the column_inode for this matrix */
280:   Mat_CreateColInode(A,&nslim_col,&ns_col);
281: 
282:   /* allocate space for the reformatted column_inode structure:
          tns[k] = index of the first column of column node k,
          tvc[c] = column node that contains column c */
283:   PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);
284:   PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);
285:   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];

287:   for (i1=0,col=0; i1<nslim_col; ++i1){
288:     nsz = ns_col[i1];
289:     for (i2=0; i2<nsz; ++i2,++col)
290:       tvc[col] = i1;
291:   }
292:   /* allocate space for column pointers (zeroed so they can be used as counters) */
293:   PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);
294:   *iia = ia;
295:   PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
296:   PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);

298:   /* determine the number of row nodes in each column node */
299:   ia[0] = oshift;
300:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
301:     j   = aj + ai[row] + ishift;
         /* NOTE(review): the first column is read before nz is known to be positive */
302:     col = *j++ + ishift;
303:     i2  = tvc[col];
304:     nz  = ai[row+1] - ai[row];
305:     while (nz-- > 0) {           /* one pass per column node touched by this row */
306:       /* ia[i1+1]++; */
307:       ia[i2+1]++;
308:       i2++;
           /* consume the remaining entries that still fall in the node just counted */
309:       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
310:       if (nz > 0) i2 = tvc[col];
311:     }
312:   }

314:   /* shift ia[i] to point to next col; work[k] becomes the 0-based insert position for column node k */
315:   for (i1=1; i1<nslim_col+1; i1++) {
316:     col        = ia[i1-1];
317:     ia[i1]    += col;
318:     work[i1-1] = col - oshift;
319:   }

321:   /* allocate space for the row-node indices */
322:   nz   = ia[nslim_col] + (!ishift);
323:   PetscMalloc(nz*sizeof(PetscInt),&ja);
324:   *jja = ja;

326:  /* loop over matrix putting into ja (same walk as the counting pass) */
327:   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
328:     j   = aj + ai[row] + ishift;
329:     i2  = 0;                     /* Col inode index */
330:     col = *j++ + ishift;
331:     i2  = tvc[col];
332:     nz  = ai[row+1] - ai[row];
333:     while (nz-- > 0) {
334:       /* ja[work[i1]++] = i2 + oshift; */
335:       ja[work[i2]++] = i1 + oshift;
336:       i2++;
337:       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
338:       if (nz > 0) i2 = tvc[col];
339:     }
340:   }
341:   PetscFree(ns_col);
342:   PetscFree(work);
343:   PetscFree(tns);
344:   PetscFree(tvc);
345:   return(0);
346: }

350: static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
351: {

355:   Mat_CreateColInode(A,n,PETSC_NULL);
356:   if (!ia) return(0);

358:   if (symmetric) {
359:     /* Since the indices are symmetric it does'nt matter */
360:     MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);
361:   } else {
362:     MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
363:   }
364:   return(0);
365: }

369: static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
370: {

374:   if (!ia) return(0);
375:   PetscFree(*ia);
376:   PetscFree(*ja);
377:   return(0);
378: }

380: /* ----------------------------------------------------------- */

384: static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy)
385: {
       /*
          Computes y = A*x using the inode structure: consecutive rows are grouped
          into nodes of ns[i] rows, and every row of a node is multiplied using the
          column indices stored for the node's first row.

          A  - matrix; A->data is read as a Mat_inode (i, j, a, nz, inode.size, inode.node_count)
          xx - input vector, accessed through VecGetArray()
          yy - output vector; one entry is written per matrix row

          Node sizes 1 to 5 are handled; any other size, or a missing inode.size,
          raises PETSC_ERR_COR through SETERRQ().
          NOTE(review): the default branch presumably returns from SETERRQ() without
          the matching VecRestoreArray() calls - confirm.
       */
386:   Mat_inode      *a = (Mat_inode*)A->data;
387:   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
388:   PetscScalar    *v1,*v2,*v3,*v4,*v5,*x,*y;
390:   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
391: 
392: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
393: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
394: #endif

397:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
398:   node_max = a->inode.node_count;
399:   ns       = a->inode.size;     /* Node Size array */
400:   VecGetArray(xx,&x);
401:   VecGetArray(yy,&y);
402:   idx  = a->j;
403:   v1   = a->a;
404:   ii   = a->i;

       /* idx and v1 walk a->j and a->a front to back; ii advances one node at a time */
406:   for (i = 0,row = 0; i< node_max; ++i){
407:     nsz  = ns[i];
408:     n    = ii[1] - ii[0];
409:     ii  += nsz;
410:     sz   = n;                   /* No of non zeros in this row */
411:                                 /* Switch on the size of Node */
412:     switch (nsz){               /* Each loop in 'case' is unrolled */
413:     case 1 :
414:       sum1  = 0;
415: 
           /* two nonzeros per iteration */
416:       for(n = 0; n< sz-1; n+=2) {
417:         i1   = idx[0];          /* The instructions are ordered to */
418:         i2   = idx[1];          /* make the compiler's job easy */
419:         idx += 2;
420:         tmp0 = x[i1];
421:         tmp1 = x[i2];
422:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
423:        }
424: 
425:       if (n == sz-1){          /* Take care of the last nonzero  */
426:         tmp0  = x[*idx++];
427:         sum1 += *v1++ * tmp0;
428:       }
429:       y[row++]=sum1;
430:       break;
431:     case 2:
432:       sum1  = 0;
433:       sum2  = 0;
434:       v2    = v1 + n;           /* values of the second row start sz entries after the first */
435: 
436:       for (n = 0; n< sz-1; n+=2) {
437:         i1   = idx[0];
438:         i2   = idx[1];
439:         idx += 2;
440:         tmp0 = x[i1];
441:         tmp1 = x[i2];
442:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
443:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
444:       }
445:       if (n == sz-1){
446:         tmp0  = x[*idx++];
447:         sum1 += *v1++ * tmp0;
448:         sum2 += *v2++ * tmp0;
449:       }
450:       y[row++]=sum1;
451:       y[row++]=sum2;
452:       v1      =v2;              /* Since the next block to be processed starts there*/
453:       idx    +=sz;              /* skip the second row's column indices, which were not read */
454:       break;
455:     case 3:
456:       sum1  = 0;
457:       sum2  = 0;
458:       sum3  = 0;
459:       v2    = v1 + n;
460:       v3    = v2 + n;
461: 
462:       for (n = 0; n< sz-1; n+=2) {
463:         i1   = idx[0];
464:         i2   = idx[1];
465:         idx += 2;
466:         tmp0 = x[i1];
467:         tmp1 = x[i2];
468:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
469:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
470:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
471:       }
472:       if (n == sz-1){
473:         tmp0  = x[*idx++];
474:         sum1 += *v1++ * tmp0;
475:         sum2 += *v2++ * tmp0;
476:         sum3 += *v3++ * tmp0;
477:       }
478:       y[row++]=sum1;
479:       y[row++]=sum2;
480:       y[row++]=sum3;
481:       v1       =v3;             /* Since the next block to be processed starts there*/
482:       idx     +=2*sz;
483:       break;
484:     case 4:
485:       sum1  = 0;
486:       sum2  = 0;
487:       sum3  = 0;
488:       sum4  = 0;
489:       v2    = v1 + n;
490:       v3    = v2 + n;
491:       v4    = v3 + n;
492: 
493:       for (n = 0; n< sz-1; n+=2) {
494:         i1   = idx[0];
495:         i2   = idx[1];
496:         idx += 2;
497:         tmp0 = x[i1];
498:         tmp1 = x[i2];
499:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
500:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
501:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
502:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
503:       }
504:       if (n == sz-1){
505:         tmp0  = x[*idx++];
506:         sum1 += *v1++ * tmp0;
507:         sum2 += *v2++ * tmp0;
508:         sum3 += *v3++ * tmp0;
509:         sum4 += *v4++ * tmp0;
510:       }
511:       y[row++]=sum1;
512:       y[row++]=sum2;
513:       y[row++]=sum3;
514:       y[row++]=sum4;
515:       v1      =v4;              /* Since the next block to be processed starts there*/
516:       idx    +=3*sz;
517:       break;
518:     case 5:
519:       sum1  = 0;
520:       sum2  = 0;
521:       sum3  = 0;
522:       sum4  = 0;
523:       sum5  = 0;
524:       v2    = v1 + n;
525:       v3    = v2 + n;
526:       v4    = v3 + n;
527:       v5    = v4 + n;
528: 
529:       for (n = 0; n<sz-1; n+=2) {
530:         i1   = idx[0];
531:         i2   = idx[1];
532:         idx += 2;
533:         tmp0 = x[i1];
534:         tmp1 = x[i2];
535:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
536:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
537:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
538:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
539:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
540:       }
541:       if (n == sz-1){
542:         tmp0  = x[*idx++];
543:         sum1 += *v1++ * tmp0;
544:         sum2 += *v2++ * tmp0;
545:         sum3 += *v3++ * tmp0;
546:         sum4 += *v4++ * tmp0;
547:         sum5 += *v5++ * tmp0;
548:       }
549:       y[row++]=sum1;
550:       y[row++]=sum2;
551:       y[row++]=sum3;
552:       y[row++]=sum4;
553:       y[row++]=sum5;
554:       v1      =v5;       /* Since the next block to be processed starts there */
555:       idx    +=4*sz;
556:       break;
557:     default :
558:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
559:     }
560:   }
561:   VecRestoreArray(xx,&x);
562:   VecRestoreArray(yy,&y);
563:   PetscLogFlops(2*a->nz - A->rmap.n);
564:   return(0);
565: }
566: /* ----------------------------------------------------------- */
567: /* Almost the same code as MatMult_Inode() */
570: static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy)
571: {
       /*
          Computes y = z + A*x using the inode structure: consecutive rows are
          grouped into nodes of ns[i] rows, and every row of a node is multiplied
          using the column indices stored for the node's first row. Each row sum
          starts from the matching entry of z.

          A  - matrix; A->data is read as a Mat_inode (i, j, a, nz, inode.size, inode.node_count)
          xx - input vector x
          zz - vector z that is added; may be the same Vec as yy
          yy - output vector; one entry is written per matrix row

          Node sizes 1 to 5 are handled; any other size, or a missing inode.size,
          raises PETSC_ERR_COR through SETERRQ().
          NOTE(review): the default branch presumably returns from SETERRQ() without
          the matching VecRestoreArray() calls - confirm.
       */
572:   Mat_inode      *a = (Mat_inode*)A->data;
573:   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
574:   PetscScalar    *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt;
576:   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
577: 
579:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
580:   node_max = a->inode.node_count;
581:   ns       = a->inode.size;     /* Node Size array */
582:   VecGetArray(xx,&x);
583:   VecGetArray(yy,&y);
       /* when zz and yy are the same Vec its array is fetched only once */
584:   if (zz != yy) {
585:     VecGetArray(zz,&z);
586:   } else {
587:     z = y;
588:   }
589:   zt = z;                      /* read cursor over z; z itself is kept for VecRestoreArray() */

591:   idx  = a->j;
592:   v1   = a->a;
593:   ii   = a->i;

       /* idx and v1 walk a->j and a->a front to back; ii advances one node at a time */
595:   for (i = 0,row = 0; i< node_max; ++i){
596:     nsz  = ns[i];
597:     n    = ii[1] - ii[0];
598:     ii  += nsz;
599:     sz   = n;                   /* No of non zeros in this row */
600:                                 /* Switch on the size of Node */
601:     switch (nsz){               /* Each loop in 'case' is unrolled */
602:     case 1 :
603:       sum1  = *zt++;
604: 
           /* two nonzeros per iteration */
605:       for(n = 0; n< sz-1; n+=2) {
606:         i1   = idx[0];          /* The instructions are ordered to */
607:         i2   = idx[1];          /* make the compiler's job easy */
608:         idx += 2;
609:         tmp0 = x[i1];
610:         tmp1 = x[i2];
611:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
612:        }
613: 
614:       if(n   == sz-1){          /* Take care of the last nonzero  */
615:         tmp0  = x[*idx++];
616:         sum1 += *v1++ * tmp0;
617:       }
618:       y[row++]=sum1;
619:       break;
620:     case 2:
621:       sum1  = *zt++;
622:       sum2  = *zt++;
623:       v2    = v1 + n;           /* values of the second row start sz entries after the first */
624: 
625:       for(n = 0; n< sz-1; n+=2) {
626:         i1   = idx[0];
627:         i2   = idx[1];
628:         idx += 2;
629:         tmp0 = x[i1];
630:         tmp1 = x[i2];
631:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
632:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
633:       }
634:       if(n   == sz-1){
635:         tmp0  = x[*idx++];
636:         sum1 += *v1++ * tmp0;
637:         sum2 += *v2++ * tmp0;
638:       }
639:       y[row++]=sum1;
640:       y[row++]=sum2;
641:       v1      =v2;              /* Since the next block to be processed starts there*/
642:       idx    +=sz;              /* skip the second row's column indices, which were not read */
643:       break;
644:     case 3:
645:       sum1  = *zt++;
646:       sum2  = *zt++;
647:       sum3  = *zt++;
648:       v2    = v1 + n;
649:       v3    = v2 + n;
650: 
651:       for (n = 0; n< sz-1; n+=2) {
652:         i1   = idx[0];
653:         i2   = idx[1];
654:         idx += 2;
655:         tmp0 = x[i1];
656:         tmp1 = x[i2];
657:         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
658:         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
659:         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
660:       }
661:       if (n == sz-1){
662:         tmp0  = x[*idx++];
663:         sum1 += *v1++ * tmp0;
664:         sum2 += *v2++ * tmp0;
665:         sum3 += *v3++ * tmp0;
666:       }
667:       y[row++]=sum1;
668:       y[row++]=sum2;
669:       y[row++]=sum3;
670:       v1       =v3;             /* Since the next block to be processed starts there*/
671:       idx     +=2*sz;
672:       break;
673:     case 4:
674:       sum1  = *zt++;
675:       sum2  = *zt++;
676:       sum3  = *zt++;
677:       sum4  = *zt++;
678:       v2    = v1 + n;
679:       v3    = v2 + n;
680:       v4    = v3 + n;
681: 
682:       for (n = 0; n< sz-1; n+=2) {
683:         i1   = idx[0];
684:         i2   = idx[1];
685:         idx += 2;
686:         tmp0 = x[i1];
687:         tmp1 = x[i2];
688:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
689:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
690:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
691:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
692:       }
693:       if (n == sz-1){
694:         tmp0  = x[*idx++];
695:         sum1 += *v1++ * tmp0;
696:         sum2 += *v2++ * tmp0;
697:         sum3 += *v3++ * tmp0;
698:         sum4 += *v4++ * tmp0;
699:       }
700:       y[row++]=sum1;
701:       y[row++]=sum2;
702:       y[row++]=sum3;
703:       y[row++]=sum4;
704:       v1      =v4;              /* Since the next block to be processed starts there*/
705:       idx    +=3*sz;
706:       break;
707:     case 5:
708:       sum1  = *zt++;
709:       sum2  = *zt++;
710:       sum3  = *zt++;
711:       sum4  = *zt++;
712:       sum5  = *zt++;
713:       v2    = v1 + n;
714:       v3    = v2 + n;
715:       v4    = v3 + n;
716:       v5    = v4 + n;
717: 
718:       for (n = 0; n<sz-1; n+=2) {
719:         i1   = idx[0];
720:         i2   = idx[1];
721:         idx += 2;
722:         tmp0 = x[i1];
723:         tmp1 = x[i2];
724:         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
725:         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
726:         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
727:         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
728:         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
729:       }
730:       if(n   == sz-1){
731:         tmp0  = x[*idx++];
732:         sum1 += *v1++ * tmp0;
733:         sum2 += *v2++ * tmp0;
734:         sum3 += *v3++ * tmp0;
735:         sum4 += *v4++ * tmp0;
736:         sum5 += *v5++ * tmp0;
737:       }
738:       y[row++]=sum1;
739:       y[row++]=sum2;
740:       y[row++]=sum3;
741:       y[row++]=sum4;
742:       y[row++]=sum5;
743:       v1      =v5;       /* Since the next block to be processed starts there */
744:       idx    +=4*sz;
745:       break;
746:     default :
747:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
748:     }
749:   }
750:   VecRestoreArray(xx,&x);
751:   VecRestoreArray(yy,&y);
752:   if (zz != yy) {
753:     VecRestoreArray(zz,&z);
754:   }
755:   PetscLogFlops(2*a->nz);
756:   return(0);
757: }

759: /* ----------------------------------------------------------- */
762: PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx)
763: {
764:   Mat_inode      *a = (Mat_inode*)A->data;
765:   IS             iscol = a->col,isrow = a->row;
767:   PetscInt       *r,*c,i,j,n = A->rmap.n,*ai = a->i,nz,*a_j = a->j;
768:   PetscInt       node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
769:   PetscScalar    *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1;
770:   PetscScalar    sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5;

773:   if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
774:   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
775:   node_max = a->inode.node_count;
776:   ns       = a->inode.size;     /* Node Size array */

778:   VecGetArray(bb,&b);
779:   VecGetArray(xx,&x);
780:   tmp  = a->solve_work;
781: 
782:   ISGetIndices(isrow,&rout); r = rout;
783:   ISGetIndices(iscol,&cout); c = cout + (n-1);
784: 
785:   /* forward solve the lower triangular */
786:   tmps = tmp ;
787:   aa   = a_a ;
788:   aj   = a_j ;
789:   ad   = a->diag;

791:   for (i = 0,row = 0; i< node_max; ++i){
792:     nsz = ns[i];
793:     aii = ai[row];
794:     v1  = aa + aii;
795:     vi  = aj + aii;
796:     nz  = ad[row]- aii;
797: 
798:     switch (nsz){               /* Each loop in 'case' is unrolled */
799:     case 1 :
800:       sum1 = b[*r++];
801:       /*      while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
802:       for(j=0; j<nz-1; j+=2){
803:         i0   = vi[0];
804:         i1   = vi[1];
805:         vi  +=2;
806:         tmp0 = tmps[i0];
807:         tmp1 = tmps[i1];
808:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
809:       }
810:       if(j == nz-1){
811:         tmp0 = tmps[*vi++];
812:         sum1 -= *v1++ *tmp0;
813:       }
814:       tmp[row ++]=sum1;
815:       break;
816:     case 2:
817:       sum1 = b[*r++];
818:       sum2 = b[*r++];
819:       v2   = aa + ai[row+1];

821:       for(j=0; j<nz-1; j+=2){
822:         i0   = vi[0];
823:         i1   = vi[1];
824:         vi  +=2;
825:         tmp0 = tmps[i0];
826:         tmp1 = tmps[i1];
827:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
828:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
829:       }
830:       if(j == nz-1){
831:         tmp0 = tmps[*vi++];
832:         sum1 -= *v1++ *tmp0;
833:         sum2 -= *v2++ *tmp0;
834:       }
835:       sum2 -= *v2++ * sum1;
836:       tmp[row ++]=sum1;
837:       tmp[row ++]=sum2;
838:       break;
839:     case 3:
840:       sum1 = b[*r++];
841:       sum2 = b[*r++];
842:       sum3 = b[*r++];
843:       v2   = aa + ai[row+1];
844:       v3   = aa + ai[row+2];
845: 
846:       for (j=0; j<nz-1; j+=2){
847:         i0   = vi[0];
848:         i1   = vi[1];
849:         vi  +=2;
850:         tmp0 = tmps[i0];
851:         tmp1 = tmps[i1];
852:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
853:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
854:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
855:       }
856:       if (j == nz-1){
857:         tmp0 = tmps[*vi++];
858:         sum1 -= *v1++ *tmp0;
859:         sum2 -= *v2++ *tmp0;
860:         sum3 -= *v3++ *tmp0;
861:       }
862:       sum2 -= *v2++ * sum1;
863:       sum3 -= *v3++ * sum1;
864:       sum3 -= *v3++ * sum2;
865:       tmp[row ++]=sum1;
866:       tmp[row ++]=sum2;
867:       tmp[row ++]=sum3;
868:       break;
869: 
870:     case 4:
871:       sum1 = b[*r++];
872:       sum2 = b[*r++];
873:       sum3 = b[*r++];
874:       sum4 = b[*r++];
875:       v2   = aa + ai[row+1];
876:       v3   = aa + ai[row+2];
877:       v4   = aa + ai[row+3];
878: 
879:       for (j=0; j<nz-1; j+=2){
880:         i0   = vi[0];
881:         i1   = vi[1];
882:         vi  +=2;
883:         tmp0 = tmps[i0];
884:         tmp1 = tmps[i1];
885:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
886:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
887:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
888:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
889:       }
890:       if (j == nz-1){
891:         tmp0 = tmps[*vi++];
892:         sum1 -= *v1++ *tmp0;
893:         sum2 -= *v2++ *tmp0;
894:         sum3 -= *v3++ *tmp0;
895:         sum4 -= *v4++ *tmp0;
896:       }
897:       sum2 -= *v2++ * sum1;
898:       sum3 -= *v3++ * sum1;
899:       sum4 -= *v4++ * sum1;
900:       sum3 -= *v3++ * sum2;
901:       sum4 -= *v4++ * sum2;
902:       sum4 -= *v4++ * sum3;
903: 
904:       tmp[row ++]=sum1;
905:       tmp[row ++]=sum2;
906:       tmp[row ++]=sum3;
907:       tmp[row ++]=sum4;
908:       break;
909:     case 5:
910:       sum1 = b[*r++];
911:       sum2 = b[*r++];
912:       sum3 = b[*r++];
913:       sum4 = b[*r++];
914:       sum5 = b[*r++];
915:       v2   = aa + ai[row+1];
916:       v3   = aa + ai[row+2];
917:       v4   = aa + ai[row+3];
918:       v5   = aa + ai[row+4];
919: 
920:       for (j=0; j<nz-1; j+=2){
921:         i0   = vi[0];
922:         i1   = vi[1];
923:         vi  +=2;
924:         tmp0 = tmps[i0];
925:         tmp1 = tmps[i1];
926:         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
927:         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
928:         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
929:         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
930:         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
931:       }
932:       if (j == nz-1){
933:         tmp0 = tmps[*vi++];
934:         sum1 -= *v1++ *tmp0;
935:         sum2 -= *v2++ *tmp0;
936:         sum3 -= *v3++ *tmp0;
937:         sum4 -= *v4++ *tmp0;
938:         sum5 -= *v5++ *tmp0;
939:       }

941:       sum2 -= *v2++ * sum1;
942:       sum3 -= *v3++ * sum1;
943:       sum4 -= *v4++ * sum1;
944:       sum5 -= *v5++ * sum1;
945:       sum3 -= *v3++ * sum2;
946:       sum4 -= *v4++ * sum2;
947:       sum5 -= *v5++ * sum2;
948:       sum4 -= *v4++ * sum3;
949:       sum5 -= *v5++ * sum3;
950:       sum5 -= *v5++ * sum4;
951: 
952:       tmp[row ++]=sum1;
953:       tmp[row ++]=sum2;
954:       tmp[row ++]=sum3;
955:       tmp[row ++]=sum4;
956:       tmp[row ++]=sum5;
957:       break;
958:     default:
959:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
960:     }
961:   }
962:   /* backward solve the upper triangular */
963:   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
964:     nsz = ns[i];
965:     aii = ai[row+1] -1;
966:     v1  = aa + aii;
967:     vi  = aj + aii;
968:     nz  = aii- ad[row];
969:     switch (nsz){               /* Each loop in 'case' is unrolled */
970:     case 1 :
971:       sum1 = tmp[row];

973:       for(j=nz ; j>1; j-=2){
974:         vi  -=2;
975:         i0   = vi[2];
976:         i1   = vi[1];
977:         tmp0 = tmps[i0];
978:         tmp1 = tmps[i1];
979:         v1   -= 2;
980:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
981:       }
982:       if (j==1){
983:         tmp0  = tmps[*vi--];
984:         sum1 -= *v1-- * tmp0;
985:       }
986:       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
987:       break;
988:     case 2 :
989:       sum1 = tmp[row];
990:       sum2 = tmp[row -1];
991:       v2   = aa + ai[row]-1;
992:       for (j=nz ; j>1; j-=2){
993:         vi  -=2;
994:         i0   = vi[2];
995:         i1   = vi[1];
996:         tmp0 = tmps[i0];
997:         tmp1 = tmps[i1];
998:         v1   -= 2;
999:         v2   -= 2;
1000:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1001:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1002:       }
1003:       if (j==1){
1004:         tmp0  = tmps[*vi--];
1005:         sum1 -= *v1-- * tmp0;
1006:         sum2 -= *v2-- * tmp0;
1007:       }
1008: 
1009:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1010:       sum2   -= *v2-- * tmp0;
1011:       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1012:       break;
1013:     case 3 :
1014:       sum1 = tmp[row];
1015:       sum2 = tmp[row -1];
1016:       sum3 = tmp[row -2];
1017:       v2   = aa + ai[row]-1;
1018:       v3   = aa + ai[row -1]-1;
1019:       for (j=nz ; j>1; j-=2){
1020:         vi  -=2;
1021:         i0   = vi[2];
1022:         i1   = vi[1];
1023:         tmp0 = tmps[i0];
1024:         tmp1 = tmps[i1];
1025:         v1   -= 2;
1026:         v2   -= 2;
1027:         v3   -= 2;
1028:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1029:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1030:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1031:       }
1032:       if (j==1){
1033:         tmp0  = tmps[*vi--];
1034:         sum1 -= *v1-- * tmp0;
1035:         sum2 -= *v2-- * tmp0;
1036:         sum3 -= *v3-- * tmp0;
1037:       }
1038:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1039:       sum2   -= *v2-- * tmp0;
1040:       sum3   -= *v3-- * tmp0;
1041:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1042:       sum3   -= *v3-- * tmp0;
1043:       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1044: 
1045:       break;
1046:     case 4 :
1047:       sum1 = tmp[row];
1048:       sum2 = tmp[row -1];
1049:       sum3 = tmp[row -2];
1050:       sum4 = tmp[row -3];
1051:       v2   = aa + ai[row]-1;
1052:       v3   = aa + ai[row -1]-1;
1053:       v4   = aa + ai[row -2]-1;

1055:       for (j=nz ; j>1; j-=2){
1056:         vi  -=2;
1057:         i0   = vi[2];
1058:         i1   = vi[1];
1059:         tmp0 = tmps[i0];
1060:         tmp1 = tmps[i1];
1061:         v1  -= 2;
1062:         v2  -= 2;
1063:         v3  -= 2;
1064:         v4  -= 2;
1065:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1066:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1067:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1068:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1069:       }
1070:       if (j==1){
1071:         tmp0  = tmps[*vi--];
1072:         sum1 -= *v1-- * tmp0;
1073:         sum2 -= *v2-- * tmp0;
1074:         sum3 -= *v3-- * tmp0;
1075:         sum4 -= *v4-- * tmp0;
1076:       }

1078:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1079:       sum2   -= *v2-- * tmp0;
1080:       sum3   -= *v3-- * tmp0;
1081:       sum4   -= *v4-- * tmp0;
1082:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1083:       sum3   -= *v3-- * tmp0;
1084:       sum4   -= *v4-- * tmp0;
1085:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1086:       sum4   -= *v4-- * tmp0;
1087:       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1088:       break;
1089:     case 5 :
1090:       sum1 = tmp[row];
1091:       sum2 = tmp[row -1];
1092:       sum3 = tmp[row -2];
1093:       sum4 = tmp[row -3];
1094:       sum5 = tmp[row -4];
1095:       v2   = aa + ai[row]-1;
1096:       v3   = aa + ai[row -1]-1;
1097:       v4   = aa + ai[row -2]-1;
1098:       v5   = aa + ai[row -3]-1;
1099:       for (j=nz ; j>1; j-=2){
1100:         vi  -= 2;
1101:         i0   = vi[2];
1102:         i1   = vi[1];
1103:         tmp0 = tmps[i0];
1104:         tmp1 = tmps[i1];
1105:         v1   -= 2;
1106:         v2   -= 2;
1107:         v3   -= 2;
1108:         v4   -= 2;
1109:         v5   -= 2;
1110:         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1111:         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1112:         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1113:         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1114:         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1115:       }
1116:       if (j==1){
1117:         tmp0  = tmps[*vi--];
1118:         sum1 -= *v1-- * tmp0;
1119:         sum2 -= *v2-- * tmp0;
1120:         sum3 -= *v3-- * tmp0;
1121:         sum4 -= *v4-- * tmp0;
1122:         sum5 -= *v5-- * tmp0;
1123:       }

1125:       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1126:       sum2   -= *v2-- * tmp0;
1127:       sum3   -= *v3-- * tmp0;
1128:       sum4   -= *v4-- * tmp0;
1129:       sum5   -= *v5-- * tmp0;
1130:       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1131:       sum3   -= *v3-- * tmp0;
1132:       sum4   -= *v4-- * tmp0;
1133:       sum5   -= *v5-- * tmp0;
1134:       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1135:       sum4   -= *v4-- * tmp0;
1136:       sum5   -= *v5-- * tmp0;
1137:       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1138:       sum5   -= *v5-- * tmp0;
1139:       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1140:       break;
1141:     default:
1142:       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1143:     }
1144:   }
1145:   ISRestoreIndices(isrow,&rout);
1146:   ISRestoreIndices(iscol,&cout);
1147:   VecRestoreArray(bb,&b);
1148:   VecRestoreArray(xx,&x);
1149:   PetscLogFlops(2*a->nz - A->cmap.n);
1150:   return(0);
1151: }

/*
   MatLUFactorNumeric_Inode - Numeric LU factorization of A into *B, handling the rows
   in groups (inodes) of one, two or three rows at a time.

   Parameters:
     A    - matrix to factor; its row pointers (i), column indices (j), values (a) and
            inode sizes are read
     info - factorization options; the fields shiftpd, shiftnz, zeropivot and
            shift_fraction are referenced here
     B    - *B (called C below) supplies the layout of the factor (i, j, diag) and the
            row, column and inverse-column index sets; its value array is overwritten

   Notes:
     - If both info->shiftpd and info->shiftnz are set, info->shiftnz is reset to 0.0.
     - Inodes of more than 3 rows are split into two pieces before factoring.
     - The complete factorization pass is repeated while sctx.lushift is set; a pass is
       abandoned (goto endofwhile) as soon as MatLUCheckShift_inline() yields newshift == 1,
       and newshift == -1 is reported as PETSC_ERR_MAT_LU_ZRPVT.
     - On normal completion C->factor is set to FACTOR_LU and C->assembled to PETSC_TRUE.

   NOTE(review): none of the PetscMalloc/PetscMemzero/ISGetIndices/ISRestoreIndices/
   PetscFree/PetscLogFlops/PetscInfo return values is examined in this routine.
   NOTE(review): the SETERRQ exits presumably return immediately, which would leave the
   work arrays and the borrowed index arrays unreleased - confirm.
*/
1155: PetscErrorCode MatLUFactorNumeric_Inode(Mat A,MatFactorInfo *info,Mat *B)
1156: {
1157:   Mat            C = *B;
1158:   Mat_inode      *a = (Mat_inode*)A->data,*b = (Mat_inode*)C->data;
1159:   IS             iscol = b->col,isrow = b->row,isicol = b->icol;
1161:   PetscInt       *r,*ic,*c,n = A->rmap.n,*bi = b->i;
1162:   PetscInt       *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,row,prow;
1163:   PetscInt       *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1164:   PetscInt       *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1165:   PetscScalar    *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3;
1166:   PetscScalar    tmp,*ba = b->a,*aa = a->a,*pv,*rtmps1,*rtmps2,*rtmps3;
1167:   PetscReal      rs=0.0,rsum[3];
1168:   LUShift_Ctx    sctx;
1169:   PetscInt       newshift;


1173:   sctx.shift_top  = 0;
1174:   sctx.nshift_max = 0;
1175:   sctx.shift_lo   = 0;
1176:   sctx.shift_hi   = 0;

1178:   /* if both shift schemes are chosen by user, only use info->shiftpd */
1179:   if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0;
1180:   if (info->shiftpd) { /* set sctx.shift_top=max{rs} */
1181:     sctx.shift_top = 0;
1182:     for (i=0; i<n; i++) {
1183:       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1184:       rs    = 0.0;
1185:       ajtmp = aj + ai[i];
1186:       rtmp1 = aa + ai[i];
1187:       nz = ai[i+1] - ai[i];
1188:       for (j=0; j<nz; j++){
1189:         if (*ajtmp != i){
1190:           rs += PetscAbsScalar(*rtmp1++);
1191:         } else {
1192:           rs -= PetscRealPart(*rtmp1++);
1193:         }
1194:         ajtmp++;
1195:       }
1196:       if (rs>sctx.shift_top) sctx.shift_top = rs;
1197:     }
        /* keep shift_top strictly positive, then enlarge it by 10% */
1198:     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1199:     sctx.shift_top   *= 1.1;
1200:     sctx.nshift_max   = 5;
1201:     sctx.shift_lo     = 0.;
1202:     sctx.shift_hi     = 1.;
1203:   }
1204:   sctx.shift_amount = 0;
1205:   sctx.nshift       = 0;

1207:   ISGetIndices(isrow,&r);
1208:   ISGetIndices(iscol,&c);
1209:   ISGetIndices(isicol,&ic);
        /* one zeroed allocation holds the three dense work rows rtmp1, rtmp2 and rtmp3 (n entries each) */
1210:   PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);
1211:   PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));
1212:   ics    = ic ; rtmps1 = rtmp1 ;
1213:   rtmp2  = rtmp1 + n;  rtmps2 = rtmp2 ;
1214:   rtmp3  = rtmp2 + n;  rtmps3 = rtmp3 ;
1215: 
1216:   node_max = a->inode.node_count;
1217:   ns       = a->inode.size ;
1218:   if (!ns){
1219:     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1220:   }

1222:   /* If max inode size > 3, split it into two inodes.*/
1223:   /* also map the inode sizes according to the ordering */
1224:   PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);
1225:   for (i=0,j=0; i<node_max; ++i,++j){
1226:     if (ns[i]>3) {
1227:       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1228:       ++j;
1229:       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1230:     } else {
1231:       tmp_vec1[j] = ns[i];
1232:     }
1233:   }
1234:   /* Use the correct node_max */
1235:   node_max = j;

1237:   /* Now reorder the inode info based on mat re-ordering info */
1238:   /* First create a row -> inode_size_array_index map */
1239:   PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);
1240:   PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);
1241:   for (i=0,row=0; i<node_max; i++) {
1242:     nodesz = tmp_vec1[i];
1243:     for (j=0; j<nodesz; j++,row++) {
1244:       nsmap[row] = i;
1245:     }
1246:   }
1247:   /* Using nsmap, create a reordered ns structure */
1248:   for (i=0,j=0; i< node_max; i++) {
1249:     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1250:     tmp_vec2[i] = nodesz;
1251:     j        += nodesz;
1252:   }
1253:   PetscFree(nsmap);
1254:   PetscFree(tmp_vec1);
1255:   /* Now use the correct ns */
1256:   ns = tmp_vec2;

        /* factorization pass; restarted from the first row whenever a new shift is requested */
1258:   do {
1259:     sctx.lushift = PETSC_FALSE;
1260:     /* Now loop over each block-row, and do the factorization */
1261:     for (i=0,row=0; i<node_max; i++) {
1262:       nodesz   = ns[i];
1263:       nz    = bi[row+1] - bi[row];
1264:       bjtmp = bj + bi[row];

1266:       switch (nodesz){
1267:       case 1:
              /* clear the work-row entries at the column positions of factor row 'row' */
1268:         for  (j=0; j<nz; j++){
1269:           idx         = bjtmp[j];
1270:           rtmps1[idx] = 0.0;
1271:         }
1272: 
1273:         /* load in initial (unfactored row) */
1274:         idx   = r[row];
1275:         nz    = ai[idx+1] - ai[idx];
1276:         ajtmp = aj + ai[idx];
1277:         v1    = aa + ai[idx];

1279:         for (j=0; j<nz; j++) {
1280:           idx        = ics[ajtmp[j]];
1281:           rtmp1[idx] = v1[j];
                /* after at least one shift attempt, add the current shift to the diagonal entry */
1282:           if (sctx.nshift && ajtmp[j] == r[row]) {
1283:             rtmp1[idx] += sctx.shift_amount;
1284:           }
1285:         }
              /* eliminate with every earlier pivot row that appears in this row's pattern */
1286:         prow = *bjtmp++ ;
1287:         while (prow < row) {
1288:           pc1 = rtmp1 + prow;
1289:           if (*pc1 != 0.0){
1290:             pv   = ba + bd[prow];
1291:             pj   = nbj + bd[prow];
1292:             mul1 = *pc1 * *pv++;
1293:             *pc1 = mul1;
1294:             nz   = bi[prow+1] - bd[prow] - 1;
1295:             PetscLogFlops(2*nz);
1296:             for (j=0; j<nz; j++) {
1297:               tmp = pv[j];
1298:               idx = pj[j];
1299:               rtmps1[idx] -= mul1 * tmp;
1300:             }
1301:           }
1302:           prow = *bjtmp++ ;
1303:         }
1304:         nz  = bi[row+1] - bi[row];
1305:         pj  = bj + bi[row];
1306:         pc1 = ba + bi[row];

              /* remember the pivot, store its reciprocal, then copy the work row into the factor */
1308:         sctx.pv    = rtmp1[row];
1309:         rs         = 0.0;
1310:         rtmp1[row] = 1.0/rtmp1[row];
1311:         for (j=0; j<nz; j++) {
1312:           idx    = pj[j];
1313:           pc1[j] = rtmps1[idx];
1314:           if (idx != row) rs += PetscAbsScalar(pc1[j]);
1315:         }

1317:         sctx.rs  = rs;
1318:         MatLUCheckShift_inline(info,sctx,newshift);
1319:         if (newshift == 1){
1320:           goto endofwhile;
1321:         } else if (newshift == -1){
1322:           SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %G tolerance %G * rs %G, inode.size %D",row,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1323:         }
1324:         break;
1325: 
1326:       case 2:
1327:         for  (j=0; j<nz; j++) {
1328:           idx         = bjtmp[j];
1329:           rtmps1[idx] = 0.0;
1330:           rtmps2[idx] = 0.0;
1331:         }
1332: 
1333:         /* load in initial (unfactored row) */
1334:         idx   = r[row];
1335:         nz    = ai[idx+1] - ai[idx];
1336:         ajtmp = aj + ai[idx];
1337:         v1    = aa + ai[idx];
1338:         v2    = aa + ai[idx+1];
1339: 
              /* both rows are loaded through the column indices of the first row (ajtmp) */
1340:         for (j=0; j<nz; j++) {
1341:           idx        = ics[ajtmp[j]];
1342:           rtmp1[idx] = v1[j];
1343:           rtmp2[idx] = v2[j];
1344:           if (sctx.nshift && ajtmp[j] == r[row]) {
1345:             rtmp1[idx] += sctx.shift_amount;
1346:           }
1347:           if (sctx.nshift && ajtmp[j] == r[row+1]) {
1348:             rtmp2[idx] += sctx.shift_amount;
1349:           }
1350:         }
1351:         prow = *bjtmp++ ;
1352:         while (prow < row) {
1353:           pc1 = rtmp1 + prow;
1354:           pc2 = rtmp2 + prow;
1355:           if (*pc1 != 0.0 || *pc2 != 0.0){
1356:             pv   = ba + bd[prow];
1357:             pj   = nbj + bd[prow];
1358:             mul1 = *pc1 * *pv;
1359:             mul2 = *pc2 * *pv;
1360:             ++pv;
1361:             *pc1 = mul1;
1362:             *pc2 = mul2;
1363: 
1364:             nz   = bi[prow+1] - bd[prow] - 1;
1365:             PetscLogFlops(2*2*nz);
1366:             for (j=0; j<nz; j++) {
1367:               tmp = pv[j];
1368:               idx = pj[j];
1369:               rtmps1[idx] -= mul1 * tmp;
1370:               rtmps2[idx] -= mul2 * tmp;
1371:             }
1372:           }
1373:           prow = *bjtmp++ ;
1374:         }
1375:         /* Now take care of the odd element*/
1376:         pc1 = rtmp1 + prow;
1377:         pc2 = rtmp2 + prow;
1378:         if (*pc2 != 0.0){
1379:           pj   = nbj + bd[prow];

1381:           rs   = 0.0;
1382:           mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1383:           *pc2 = mul2;
1384:           nz   = bi[prow+1] - bd[prow] - 1;
1385:           PetscLogFlops(2*nz);
1386:           for (j=0; j<nz; j++) {
1387:             idx = pj[j] ;
1388:             tmp = rtmp1[idx];
1389:             rtmp2[idx] -= mul2 * tmp;
1390:             if (idx != prow) rs += PetscAbsScalar(rtmp2[idx]);
1391:           }
1392: 
1393:           sctx.rs  = rs;
1394:           sctx.pv  = *pc1;
1395:           MatLUCheckShift_inline(info,sctx,newshift);
1396:           if (newshift == 1){
1397:             goto endofwhile; /* sctx.shift_amount is updated */
1398:           } else if (newshift == -1){
1399:             SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %G tolerance %G * rs %G inode.size %D",prow,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1400:           }
1401:         }
1402: 
1403:         nz  = bi[row+1] - bi[row];
1404:         pj  = bj + bi[row];
1405:         pc1 = ba + bi[row];
1406:         pc2 = ba + bi[row+1];

              /* invert both diagonal entries and copy the two work rows into the factor */
1408:         rsum[0] = rsum[1] = 0.0;
1409:         rtmp1[row]   = 1.0/rtmp1[row];
1410:         rtmp2[row+1] = 1.0/rtmp2[row+1];
1411:         for (j=0; j<nz; j++) {
1412:           idx    = pj[j];
1413:           pc1[j] = rtmps1[idx];
1414:           pc2[j] = rtmps2[idx];
1415:           if (idx != row)   rsum[0] += PetscAbsScalar(pc1[j]);
1416:           if (idx != row+1) rsum[1] += PetscAbsScalar(pc2[j]);
1417:         }

              /* NOTE(review): the two messages below print rs, while the value checked is sctx.rs (rsum[0] / rsum[1]) */
1419:         sctx.pv = 1.0/rtmp1[row]; /* rtmp1[row] = 1.0/diag[row] */
1420:         sctx.rs = rsum[0];
1421:         MatLUCheckShift_inline(info,sctx,newshift);
1422:         if (newshift == 1){
1423:           goto endofwhile;
1424:         } else if (newshift == -1){
1425:           SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %G tolerance %G * rs %G inode.size %D",row,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1426:         }
1427:         sctx.pv = 1.0/rtmp2[row+1];
1428:         sctx.rs = rsum[1];
1429:         MatLUCheckShift_inline(info,sctx,newshift);
1430:         if (newshift == 1){
1431:           goto endofwhile;
1432:         } else if (newshift == -1){
1433:           SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %G tolerance %G * rs %G inode.size %D",row+1,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1434:         }
1435:         break;

1437:       case 3:
1438:         for  (j=0; j<nz; j++) {
1439:           idx         = bjtmp[j];
1440:           rtmps1[idx] = 0.0;
1441:           rtmps2[idx] = 0.0;
1442:           rtmps3[idx] = 0.0;
1443:         }
1444:         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1445:         idx   = r[row];
1446:         nz    = ai[idx+1] - ai[idx];
1447:         ajtmp = aj + ai[idx];
1448:         v1    = aa + ai[idx];
1449:         v2    = aa + ai[idx+1];
1450:         v3    = aa + ai[idx+2];
1451:         for (j=0; j<nz; j++) {
1452:           idx        = ics[ajtmp[j]];
1453:           rtmp1[idx] = v1[j];
1454:           rtmp2[idx] = v2[j];
1455:           rtmp3[idx] = v3[j];
1456:           if (sctx.nshift && ajtmp[j] == r[row]) {
1457:             rtmp1[idx] += sctx.shift_amount;
1458:           }
1459:           if (sctx.nshift && ajtmp[j] == r[row+1]) {
1460:             rtmp2[idx] += sctx.shift_amount;
1461:           }
1462:           if (sctx.nshift && ajtmp[j] == r[row+2]) {
1463:             rtmp3[idx] += sctx.shift_amount;
1464:           }
1465:         }
1466:         /* loop over all pivot row blocks above this row block */
1467:         prow = *bjtmp++ ;
1468:         while (prow < row) {
1469:           pc1 = rtmp1 + prow;
1470:           pc2 = rtmp2 + prow;
1471:           pc3 = rtmp3 + prow;
1472:           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1473:             pv   = ba  + bd[prow];
1474:             pj   = nbj + bd[prow];
1475:             mul1 = *pc1 * *pv;
1476:             mul2 = *pc2 * *pv;
1477:             mul3 = *pc3 * *pv;
1478:             ++pv;
1479:             *pc1 = mul1;
1480:             *pc2 = mul2;
1481:             *pc3 = mul3;
1482: 
1483:             nz   = bi[prow+1] - bd[prow] - 1;
1484:             PetscLogFlops(3*2*nz);
1485:             /* update this row based on pivot row */
1486:             for (j=0; j<nz; j++) {
1487:               tmp = pv[j];
1488:               idx = pj[j];
1489:               rtmps1[idx] -= mul1 * tmp;
1490:               rtmps2[idx] -= mul2 * tmp;
1491:               rtmps3[idx] -= mul3 * tmp;
1492:             }
1493:           }
1494:           prow = *bjtmp++ ;
1495:         }
1496:         /* Now take care of diagonal block in this set of rows */
1497:         pc1 = rtmp1 + prow;
1498:         pc2 = rtmp2 + prow;
1499:         pc3 = rtmp3 + prow;
1500:         if (*pc2 != 0.0 || *pc3 != 0.0){
1501:           pj   = nbj + bd[prow];

                /* NOTE(review): the message below prints rs, while sctx.rs was just set to 1.0 */
1503:           sctx.rs = 1.0; /* for simplicity, set rs=1.0 */
1504:           sctx.pv = *pc1;
1505:           MatLUCheckShift_inline(info,sctx,newshift);
1506:           if (newshift == 1){
1507:             goto endofwhile;
1508:           } else if (newshift == -1){
1509:             SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %G tolerance %G * rs %G inode.size %D",prow,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1510:           }

                /* eliminate rows 2 and 3 of the block with row 1 (diagonal not yet inverted, hence the divisions) */
1512:           mul2 = (*pc2)/(*pc1);
1513:           mul3 = (*pc3)/(*pc1);
1514:           *pc2 = mul2;
1515:           *pc3 = mul3;
1516:           nz   = bi[prow+1] - bd[prow] - 1;
1517:           PetscLogFlops(2*2*nz);
1518:           for (j=0; j<nz; j++) {
1519:             idx = pj[j] ;
1520:             tmp = rtmp1[idx];
1521:             rtmp2[idx] -= mul2 * tmp;
1522:             rtmp3[idx] -= mul3 * tmp;
1523:           }
1524:         }
1525:         ++prow;
1526:         pc2 = rtmp2 + prow;
1527:         pc3 = rtmp3 + prow;
1528:         if (*pc3 != 0.0){
                /* NOTE(review): the next two statements are identical */
1529:           pj   = nbj + bd[prow];
1530:           pj   = nbj + bd[prow];

1532:           sctx.rs  = 1.0; /* for simplicity, set rs=1.0 */
1533:           sctx.pv  = *pc2;
1534:           MatLUCheckShift_inline(info,sctx,newshift);
1535:           if (newshift == 1){
1536:             goto endofwhile;
1537:           } else if (newshift == -1){
1538:             SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %G tolerance %G * rs %G inode.size %D",prow,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1539:           }
                /* eliminate row 3 of the block with row 2 */
1540:           mul3 = (*pc3)/(*pc2);
1541:           *pc3 = mul3;
1542:           nz   = bi[prow+1] - bd[prow] - 1;
                /* NOTE(review): only rtmp3 is updated in the loop below, yet 2*2*nz flops are logged */
1543:           PetscLogFlops(2*2*nz);
1544:           for (j=0; j<nz; j++) {
1545:             idx = pj[j] ;
1546:             tmp = rtmp2[idx];
1547:             rtmp3[idx] -= mul3 * tmp;
1548:           }
1549:         }
1550:         nz  = bi[row+1] - bi[row];
1551:         pj  = bj + bi[row];
1552:         pc1 = ba + bi[row];
1553:         pc2 = ba + bi[row+1];
1554:         pc3 = ba + bi[row+2];

1556:         rsum[0] = rsum[1] = rsum[2] = 0.0;
1557:         rtmp1[row]   = 1.0/rtmp1[row];
1558:         rtmp2[row+1] = 1.0/rtmp2[row+1];
1559:         rtmp3[row+2] = 1.0/rtmp3[row+2];
1560:         /* copy row entries from dense representation to sparse */
1561:         for (j=0; j<nz; j++) {
1562:           idx    = pj[j];
1563:           pc1[j] = rtmps1[idx];
1564:           pc2[j] = rtmps2[idx];
1565:           pc3[j] = rtmps3[idx];
1566:           if (idx != row) rsum[0] += PetscAbsScalar(pc1[j]);
1567:           if (idx != row+1) rsum[1] += PetscAbsScalar(pc2[j]);
1568:           if (idx != row+2) rsum[2] += PetscAbsScalar(pc3[j]);
1569:         }

              /* check each of the three pivots in turn; 1.0/rtmpK[...] recovers the uninverted diagonal */
1571:         /* sctx.rs = rs/3.0; */
1572:         sctx.pv = 1.0/rtmp1[row];
1573:         sctx.rs = rsum[0];
1574:         MatLUCheckShift_inline(info,sctx,newshift);
1575:         if (newshift == 1){
1576:           goto endofwhile;
1577:         } else if (newshift == -1){
1578:           SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %G tolerance %G * rs %G inode.size %D",row,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1579:         }
1580:         sctx.pv = 1.0/rtmp2[row+1];
1581:         sctx.rs = rsum[1];
1582:         MatLUCheckShift_inline(info,sctx,newshift);
1583:         if (newshift == 1){
1584:           goto endofwhile;
1585:         } else if (newshift == -1){
1586:           SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %G tolerance %G * rs %G inode.size %D, 2nd row",row+1,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1587:         }
1588:         sctx.pv = 1.0/rtmp3[row+2];
1589:         sctx.rs = rsum[2];
1590:         MatLUCheckShift_inline(info,sctx,newshift);
1591:         if (newshift == 1){
1592:           goto endofwhile;
1593:         } else if (newshift == -1){
1594:           SETERRQ5(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %G tolerance %G * rs %G inode.size %D, 3rd row",row+2,PetscAbsScalar(sctx.pv),info->zeropivot,rs,nodesz);
1595:         }
1596:         break;
1597:       default:
1598:         SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1599:       }
1600:       row += nodesz;                 /* Update the row */
1601:     }
          /* target of the early exits above; the loop condition decides whether to run another pass */
1602:     endofwhile:;
1603:   } while (sctx.lushift);
1604:   PetscFree(rtmp1);
1605:   PetscFree(tmp_vec2);
1606:   ISRestoreIndices(isicol,&ic);
1607:   ISRestoreIndices(isrow,&r);
1608:   ISRestoreIndices(iscol,&c);
1609:   C->factor      = FACTOR_LU;
1610:   C->assembled   = PETSC_TRUE;
1611:   if (sctx.nshift) {
1612:     if (info->shiftnz) {
1613:       PetscInfo2(0,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
1614:     } else if (info->shiftpd) {
1615:       PetscInfo4(0,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top);
1616:     }
1617:   }
1618:   PetscLogFlops(C->cmap.n);
1619:   return(0);
1620: }

1622: /*
1623:      Makes a longer coloring[] array and calls the usual code with that
1624: */
1627: PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1628: {
1629:   Mat_inode       *a = (Mat_inode*)mat->data;
1630:   PetscErrorCode  ierr;
1631:   PetscInt        n = mat->cmap.n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1632:   PetscInt        *colorused,i;
1633:   ISColoringValue *newcolor;

1636:   PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);
1637:   /* loop over inodes, marking a color for each column*/
1638:   row = 0;
1639:   for (i=0; i<m; i++){
1640:     for (j=0; j<ns[i]; j++) {
1641:       newcolor[row++] = coloring[i] + j*ncolors;
1642:     }
1643:   }

1645:   /* eliminate unneeded colors */
1646:   PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);
1647:   PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));
1648:   for (i=0; i<n; i++) {
1649:     colorused[newcolor[i]] = 1;
1650:   }

1652:   for (i=1; i<5*ncolors; i++) {
1653:     colorused[i] += colorused[i-1];
1654:   }
1655:   ncolors = colorused[5*ncolors-1];
1656:   for (i=0; i<n; i++) {
1657:     newcolor[i] = colorused[newcolor[i]];
1658:   }
1659:   PetscFree(colorused);
1660:   ISColoringCreate(mat->comm,ncolors,n,newcolor,iscoloring);
1661:   PetscFree(coloring);
1662:   return(0);
1663: }

1665: /*
1666:     samestructure indicates that the matrix has not changed its nonzero structure so we 
1667:     do not need to recompute the inodes 
1668: */
1671: PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
1672: {
1673:   Mat_inode      *a = (Mat_inode*)A->data;
1675:   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
1676:   PetscTruth     flag,flg;

1679:   if (a->inode.checked && samestructure) return(0);

1681:   a->inode.checked = PETSC_TRUE;

1683:   /* Notes: We set a->inode.limit=5 in MatCreate_Inode(). */
1684:   if (!a->inode.use) {PetscInfo(A,"Not using Inode routines due to MatSetOption(MAT_DO_NOT_USE_INODES\n"); return(0);}
1685:   PetscOptionsHasName(A->prefix,"-mat_no_inode",&flg);
1686:   if (flg) {PetscInfo(A,"Not using Inode routines due to -mat_no_inode\n");return(0);}
1687:   PetscOptionsHasName(A->prefix,"-mat_no_unroll",&flg);
1688:   if (flg) {PetscInfo(A,"Not using Inode routines due to -mat_no_unroll\n");return(0);}
1689:   PetscOptionsGetInt(A->prefix,"-mat_inode_limit",&a->inode.limit,PETSC_NULL);
1690:   if (a->inode.limit > a->inode.max_limit) a->inode.limit = a->inode.max_limit;
1691:   m = A->rmap.n;
1692:   if (a->inode.size) {ns = a->inode.size;}
1693:   else {PetscMalloc((m+1)*sizeof(PetscInt),&ns);}

1695:   i          = 0;
1696:   node_count = 0;
1697:   idx        = a->j;
1698:   ii         = a->i;
1699:   while (i < m){                /* For each row */
1700:     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
1701:     /* Limits the number of elements in a node to 'a->inode.limit' */
1702:     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
1703:       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
1704:       if (nzy != nzx) break;
1705:       idy  += nzx;             /* Same nonzero pattern */
1706:       PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
1707:       if (!flag) break;
1708:     }
1709:     ns[node_count++] = blk_size;
1710:     idx += blk_size*nzx;
1711:     i    = j;
1712:   }
1713:   /* If not enough inodes found,, do not use inode version of the routines */
1714:   if (!a->inode.size && m && node_count > 0.9*m) {
1715:     PetscFree(ns);
1716:     a->inode.node_count     = 0;
1717:     a->inode.size           = PETSC_NULL;
1718:     a->inode.use            = PETSC_FALSE;
1719:     PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
1720:   } else {
1721:     A->ops->mult            = MatMult_Inode;
1722:     A->ops->multadd         = MatMultAdd_Inode;
1723:     A->ops->solve           = MatSolve_Inode;
1724:     A->ops->lufactornumeric = MatLUFactorNumeric_Inode;
1725:     A->ops->getrowij        = MatGetRowIJ_Inode;
1726:     A->ops->restorerowij    = MatRestoreRowIJ_Inode;
1727:     A->ops->getcolumnij     = MatGetColumnIJ_Inode;
1728:     A->ops->restorecolumnij = MatRestoreColumnIJ_Inode;
1729:     A->ops->coloringpatch   = MatColoringPatch_Inode;
1730:     a->inode.node_count     = node_count;
1731:     a->inode.size           = ns;
1732:     PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
1733:   }
1734:   return(0);
1735: }

1737: /*
1738:      This is really ugly. if inodes are used this replaces the 
1739:   permutations with ones that correspond to rows/cols of the matrix
1740:   rather then inode blocks
1741: */
1744: PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
1745: {
1746:   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);

1749:   PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);
1750:   if (f) {
1751:     (*f)(A,rperm,cperm);
1752:   }
1753:   return(0);
1754: }

1759: PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm)
1760: {
1761:   Mat_inode      *a=(Mat_inode*)A->data;
1763:   PetscInt       m = A->rmap.n,n = A->cmap.n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
1764:   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
1765:   PetscInt       nslim_col,*ns_col;
1766:   IS             ris = *rperm,cis = *cperm;

1769:   if (!a->inode.size) return(0); /* no inodes so return */
1770:   if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */

1772:   Mat_CreateColInode(A,&nslim_col,&ns_col);
1773:   PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);
1774:   PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);
1775:   permc = permr + m;

1777:   ISGetIndices(ris,&ridx);
1778:   ISGetIndices(cis,&cidx);

1780:   /* Form the inode structure for the rows of permuted matric using inv perm*/
1781:   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];

1783:   /* Construct the permutations for rows*/
1784:   for (i=0,row = 0; i<nslim_row; ++i){
1785:     indx      = ridx[i];
1786:     start_val = tns[indx];
1787:     end_val   = tns[indx + 1];
1788:     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
1789:   }

1791:   /* Form the inode structure for the columns of permuted matrix using inv perm*/
1792:   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];

1794:  /* Construct permutations for columns */
1795:   for (i=0,col=0; i<nslim_col; ++i){
1796:     indx      = cidx[i];
1797:     start_val = tns[indx];
1798:     end_val   = tns[indx + 1];
1799:     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
1800:   }

1802:   ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);
1803:   ISSetPermutation(*rperm);
1804:   ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);
1805:   ISSetPermutation(*cperm);
1806: 
1807:   ISRestoreIndices(ris,&ridx);
1808:   ISRestoreIndices(cis,&cidx);

1810:   PetscFree(ns_col);
1811:   PetscFree(permr);
1812:   ISDestroy(cis);
1813:   ISDestroy(ris);
1814:   PetscFree(tns);
1815:   return(0);
1816: }

1821: /*@C
1822:    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.

1824:    Collective on Mat

1826:    Input Parameter:
1827: .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ

1829:    Output Parameter:
1830: +  node_count - no of inodes present in the matrix.
1831: .  sizes      - an array of size node_count,with sizes of each inode.
1832: -  limit      - the max size used to generate the inodes.

1834:    Level: advanced

1836:    Notes: This routine returns some internal storage information
1837:    of the matrix, it is intended to be used by advanced users.
1838:    It should be called after the matrix is assembled.
1839:    The contents of the sizes[] array should not be changed.
1840:    PETSC_NULL may be passed for information not requested.

1842: .keywords: matrix, seqaij, get, inode

1844: .seealso: MatGetInfo()
1845: @*/
1846: PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
1847: {
1848:   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);

1851:   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1852:   PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);
1853:   if (f) {
1854:     (*f)(A,node_count,sizes,limit);
1855:   }
1856:   return(0);
1857: }

1862: PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
1863: {
1864:   Mat_inode *a = (Mat_inode*)A->data;

1867:   if (node_count) *node_count = a->inode.node_count;
1868:   if (sizes)      *sizes      = a->inode.size;
1869:   if (limit)      *limit      = a->inode.limit;
1870:   return(0);
1871: }