Actual source code: sbaijfact.c

  1: #define PETSCMAT_DLL

 3:  #include src/mat/impls/baij/seq/baij.h
 4:  #include src/mat/impls/sbaij/seq/sbaij.h
 5:  #include src/inline/ilu.h
 6:  #include include/petscis.h

  8: #if !defined(PETSC_USE_COMPLEX)
  9: /* 
 10:   input:
 11:    F -- numeric factor 
 12:   output:
 13:    nneg, nzero, npos: matrix inertia 
 14: */

 18: PetscErrorCode MatGetInertia_SeqSBAIJ(Mat F,PetscInt *nneig,PetscInt *nzero,PetscInt *npos)
 19: {
 20:   Mat_SeqSBAIJ *fact_ptr = (Mat_SeqSBAIJ*)F->data;
 21:   PetscScalar  *dd = fact_ptr->a;
 22:   PetscInt     mbs=fact_ptr->mbs,bs=F->rmap.bs,i,nneig_tmp,npos_tmp,*fi = fact_ptr->i;

 25:   if (bs != 1) SETERRQ1(PETSC_ERR_SUP,"No support for bs: %D >1 yet",bs);
 26:   nneig_tmp = 0; npos_tmp = 0;
 27:   for (i=0; i<mbs; i++){
 28:     if (PetscRealPart(dd[*fi]) > 0.0){
 29:       npos_tmp++;
 30:     } else if (PetscRealPart(dd[*fi]) < 0.0){
 31:       nneig_tmp++;
 32:     }
 33:     fi++;
 34:   }
 35:   if (nneig) *nneig = nneig_tmp;
 36:   if (npos)  *npos  = npos_tmp;
 37:   if (nzero) *nzero = mbs - nneig_tmp - npos_tmp;

 39:   return(0);
 40: }
 41: #endif /* !defined(PETSC_USE_COMPLEX) */

 43: /* 
 44:   Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP.
 45:   Use Modified Sparse Row (MSR) storage for u and ju. See page 85, "Iterative Methods ..." by Saad. 
 46: */
 49: PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(Mat A,IS perm,MatFactorInfo *info,Mat *B)
 50: {
 51:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
 53:   PetscInt       *rip,i,mbs = a->mbs,*ai,*aj;
 54:   PetscInt       *jutmp,bs = A->rmap.bs,bs2=a->bs2;
 55:   PetscInt       m,reallocs = 0,prow;
 56:   PetscInt       *jl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
 57:   PetscReal      f = info->fill;
 58:   PetscTruth     perm_identity;

 61:   /* check whether perm is the identity mapping */
 62:   ISIdentity(perm,&perm_identity);
 63:   ISGetIndices(perm,&rip);
 64: 
 65:   if (perm_identity){ /* without permutation */
 66:     a->permute = PETSC_FALSE;
 67:     ai = a->i; aj = a->j;
 68:   } else {            /* non-trivial permutation */
 69:     a->permute = PETSC_TRUE;
 70:     MatReorderingSeqSBAIJ(A,perm);
 71:     ai = a->inew; aj = a->jnew;
 72:   }
 73: 
 74:   /* initialization */
 75:   PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);
 76:   umax  = (PetscInt)(f*ai[mbs] + 1); umax += mbs + 1;
 77:   PetscMalloc(umax*sizeof(PetscInt),&ju);
 78:   iu[0] = mbs+1;
 79:   juidx = mbs + 1; /* index for ju */
 80:   PetscMalloc(2*mbs*sizeof(PetscInt),&jl); /* linked list for pivot row */
 81:   q     = jl + mbs;   /* linked list for col index */
 82:   for (i=0; i<mbs; i++){
 83:     jl[i] = mbs;
 84:     q[i] = 0;
 85:   }

 87:   /* for each row k */
 88:   for (k=0; k<mbs; k++){
 89:     for (i=0; i<mbs; i++) q[i] = 0;  /* to be removed! */
 90:     nzk  = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
 91:     q[k] = mbs;
 92:     /* initialize nonzero structure of k-th row to row rip[k] of A */
 93:     jmin = ai[rip[k]] +1; /* exclude diag[k] */
 94:     jmax = ai[rip[k]+1];
 95:     for (j=jmin; j<jmax; j++){
 96:       vj = rip[aj[j]]; /* col. value */
 97:       if(vj > k){
 98:         qm = k;
 99:         do {
100:           m  = qm; qm = q[m];
101:         } while(qm < vj);
102:         if (qm == vj) {
103:           SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
104:         }
105:         nzk++;
106:         q[m]  = vj;
107:         q[vj] = qm;
108:       } /* if(vj > k) */
109:     } /* for (j=jmin; j<jmax; j++) */

111:     /* modify nonzero structure of k-th row by computing fill-in
112:        for each row i to be merged in */
113:     prow = k;
114:     prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */
115: 
116:     while (prow < k){
117:       /* merge row prow into k-th row */
118:       jmin = iu[prow] + 1; jmax = iu[prow+1];
119:       qm = k;
120:       for (j=jmin; j<jmax; j++){
121:         vj = ju[j];
122:         do {
123:           m = qm; qm = q[m];
124:         } while (qm < vj);
125:         if (qm != vj){
126:          nzk++; q[m] = vj; q[vj] = qm; qm = vj;
127:         }
128:       }
129:       prow = jl[prow]; /* next pivot row */
130:     }
131: 
132:     /* add k to row list for first nonzero element in k-th row */
133:     if (nzk > 0){
134:       i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
135:       jl[k] = jl[i]; jl[i] = k;
136:     }
137:     iu[k+1] = iu[k] + nzk;

139:     /* allocate more space to ju if needed */
140:     if (iu[k+1] > umax) {
141:       /* estimate how much additional space we will need */
142:       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
143:       /* just double the memory each time */
144:       maxadd = umax;
145:       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
146:       umax += maxadd;

148:       /* allocate a longer ju */
149:       PetscMalloc(umax*sizeof(PetscInt),&jutmp);
150:       PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));
151:       PetscFree(ju);
152:       ju   = jutmp;
153:       reallocs++; /* count how many times we realloc */
154:     }

156:     /* save nonzero structure of k-th row in ju */
157:     i=k;
158:     while (nzk --) {
159:       i           = q[i];
160:       ju[juidx++] = i;
161:     }
162:   }

164: #if defined(PETSC_USE_INFO)
165:   if (ai[mbs] != 0) {
166:     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
167:     PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);
168:     PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
169:     PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);
170:     PetscInfo(A,"for best performance.\n");
171:   } else {
172:     PetscInfo(A,"Empty matrix.\n");
173:   }
174: #endif

176:   ISRestoreIndices(perm,&rip);
177:   PetscFree(jl);

179:   /* put together the new matrix */
180:   MatCreate(A->comm,B);
181:   MatSetSizes(*B,bs*mbs,bs*mbs,bs*mbs,bs*mbs);
182:   MatSetType(*B,A->type_name);
183:   MatSeqSBAIJSetPreallocation_SeqSBAIJ(*B,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);

185:   /* PetscLogObjectParent(*B,iperm); */
186:   b = (Mat_SeqSBAIJ*)(*B)->data;
187:   b->singlemalloc = PETSC_FALSE;
188:   PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
189:   b->j    = ju;
190:   b->i    = iu;
191:   b->diag = 0;
192:   b->ilen = 0;
193:   b->imax = 0;
194:   b->row  = perm;
195:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
196:   PetscObjectReference((PetscObject)perm);
197:   b->icol = perm;
198:   PetscObjectReference((PetscObject)perm);
199:   PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
200:   /* In b structure:  Free imax, ilen, old a, old j.  
201:      Allocate idnew, solve_work, new a, new j */
202:   PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
203:   b->maxnz = b->nz = iu[mbs];
204: 
205:   (*B)->factor                 = FACTOR_CHOLESKY;
206:   (*B)->info.factor_mallocs    = reallocs;
207:   (*B)->info.fill_ratio_given  = f;
208:   if (ai[mbs] != 0) {
209:     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
210:   } else {
211:     (*B)->info.fill_ratio_needed = 0.0;
212:   }

214:   if (perm_identity){
215:     switch (bs) {
216:       case 1:
217:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
218:         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering;
219:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=1\n");
220:         break;
221:       case 2:
222:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
223:         (*B)->ops->solve           = MatSolve_SeqSBAIJ_2_NaturalOrdering;
224:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=2\n");
225:         break;
226:       case 3:
227:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
228:         (*B)->ops->solve           = MatSolve_SeqSBAIJ_3_NaturalOrdering;
229:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=3\n");
230:         break;
231:       case 4:
232:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
233:         (*B)->ops->solve           = MatSolve_SeqSBAIJ_4_NaturalOrdering;
234:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=4\n");
235:         break;
236:       case 5:
237:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
238:         (*B)->ops->solve           = MatSolve_SeqSBAIJ_5_NaturalOrdering;
239:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=5\n");
240:         break;
241:       case 6:
242:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
243:         (*B)->ops->solve           = MatSolve_SeqSBAIJ_6_NaturalOrdering;
244:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=6\n");
245:         break;
246:       case 7:
247:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
248:         (*B)->ops->solve           = MatSolve_SeqSBAIJ_7_NaturalOrdering;
249:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=7\n");
250:       break;
251:       default:
252:         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
253:         (*B)->ops->solve           = MatSolve_SeqSBAIJ_N_NaturalOrdering;
254:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS>7\n");
255:       break;
256:     }
257:   }
258:   return(0);
259: }
260: /*
261:     Symbolic U^T*D*U factorization for SBAIJ format. 
262: */
263:  #include petscbt.h
264:  #include src/mat/utils/freespace.h
267: PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info,Mat *fact)
268: {
269:   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
270:   Mat_SeqSBAIJ       *b;
271:   Mat                B;
272:   PetscErrorCode     ierr;
273:   PetscTruth         perm_identity;
274:   PetscReal          fill = info->fill;
275:   PetscInt           *rip,i,mbs=a->mbs,bs=A->rmap.bs,*ai,*aj,reallocs=0,prow;
276:   PetscInt           *jl,jmin,jmax,nzk,*ui,k,j,*il,nextprow;
277:   PetscInt           nlnk,*lnk,ncols,*cols,*uj,**ui_ptr,*uj_ptr;
278:   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
279:   PetscBT            lnkbt;

282:   /*  
283:    This code originally uses Modified Sparse Row (MSR) storage
284:    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choise!
285:    Then it is rewritten so the factor B takes seqsbaij format. However the associated 
286:    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1 or !perm_identity, 
287:    thus the original code in MSR format is still used for these cases. 
288:    The code below should replace MatCholeskyFactorSymbolic_SeqSBAIJ_MSR() whenever 
289:    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
290:   */
291:   if (bs > 1){
292:     MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(A,perm,info,fact);
293:     return(0);
294:   }

296:   /* check whether perm is the identity mapping */
297:   ISIdentity(perm,&perm_identity);

299:   if (perm_identity){
300:     a->permute = PETSC_FALSE;
301:     ai = a->i; aj = a->j;
302:   } else {
303:     a->permute = PETSC_TRUE;
304:     MatReorderingSeqSBAIJ(A,perm);
305:     ai = a->inew; aj = a->jnew;
306:   }
307:   ISGetIndices(perm,&rip);

309:   /* initialization */
310:   PetscMalloc((mbs+1)*sizeof(PetscInt),&ui);
311:   ui[0] = 0;

313:   /* jl: linked list for storing indices of the pivot rows 
314:      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
315:   PetscMalloc((3*mbs+1)*sizeof(PetscInt)+mbs*sizeof(PetscInt*),&jl);
316:   il     = jl + mbs;
317:   cols   = il + mbs;
318:   ui_ptr = (PetscInt**)(cols + mbs);
319: 
320:   for (i=0; i<mbs; i++){
321:     jl[i] = mbs; il[i] = 0;
322:   }

324:   /* create and initialize a linked list for storing column indices of the active row k */
325:   nlnk = mbs + 1;
326:   PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);

328:   /* initial FreeSpace size is fill*(ai[mbs]+1) */
329:   PetscFreeSpaceGet((PetscInt)(fill*(ai[mbs]+1)),&free_space);
330:   current_space = free_space;

332:   for (k=0; k<mbs; k++){  /* for each active row k */
333:     /* initialize lnk by the column indices of row rip[k] of A */
334:     nzk   = 0;
335:     ncols = ai[rip[k]+1] - ai[rip[k]];
336:     for (j=0; j<ncols; j++){
337:       i = *(aj + ai[rip[k]] + j);
338:       cols[j] = rip[i];
339:     }
340:     PetscLLAdd(ncols,cols,mbs,nlnk,lnk,lnkbt);
341:     nzk += nlnk;

343:     /* update lnk by computing fill-in for each pivot row to be merged in */
344:     prow = jl[k]; /* 1st pivot row */
345: 
346:     while (prow < k){
347:       nextprow = jl[prow];
348:       /* merge prow into k-th row */
349:       jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
350:       jmax = ui[prow+1];
351:       ncols = jmax-jmin;
352:       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
353:       PetscLLAddSorted(ncols,uj_ptr,mbs,nlnk,lnk,lnkbt);
354:       nzk += nlnk;

356:       /* update il and jl for prow */
357:       if (jmin < jmax){
358:         il[prow] = jmin;
359:         j = *uj_ptr; jl[prow] = jl[j]; jl[j] = prow;
360:       }
361:       prow = nextprow;
362:     }

364:     /* if free space is not available, make more free space */
365:     if (current_space->local_remaining<nzk) {
366:       i = mbs - k + 1; /* num of unfactored rows */
367:       i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
368:       PetscFreeSpaceGet(i,&current_space);
369:       reallocs++;
370:     }

372:     /* copy data into free space, then initialize lnk */
373:     PetscLLClean(mbs,mbs,nzk,lnk,current_space->array,lnkbt);

375:     /* add the k-th row into il and jl */
376:     if (nzk-1 > 0){
377:       i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
378:       jl[k] = jl[i]; jl[i] = k;
379:       il[k] = ui[k] + 1;
380:     }
381:     ui_ptr[k] = current_space->array;
382:     current_space->array           += nzk;
383:     current_space->local_used      += nzk;
384:     current_space->local_remaining -= nzk;

386:     ui[k+1] = ui[k] + nzk;
387:   }

389: #if defined(PETSC_USE_INFO)
390:   if (ai[mbs] != 0) {
391:     PetscReal af = ((PetscReal)ui[mbs])/((PetscReal)ai[mbs]);
392:     PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);
393:     PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
394:     PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);
395:   } else {
396:     PetscInfo(A,"Empty matrix.\n");
397:   }
398: #endif

400:   ISRestoreIndices(perm,&rip);
401:   PetscFree(jl);

403:   /* destroy list of free space and other temporary array(s) */
404:   PetscMalloc((ui[mbs]+1)*sizeof(PetscInt),&uj);
405:   PetscFreeSpaceContiguous(&free_space,uj);
406:   PetscLLDestroy(lnk,lnkbt);

408:   /* put together the new matrix in MATSEQSBAIJ format */
409:   MatCreate(PETSC_COMM_SELF,fact);
410:   MatSetSizes(*fact,mbs,mbs,mbs,mbs);
411:   B    = *fact;
412:   MatSetType(B,MATSEQSBAIJ);
413:   MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);

415:   b = (Mat_SeqSBAIJ*)B->data;
416:   b->singlemalloc = PETSC_FALSE;
417:   PetscMalloc((ui[mbs]+1)*sizeof(MatScalar),&b->a);
418:   b->j    = uj;
419:   b->i    = ui;
420:   b->diag = 0;
421:   b->ilen = 0;
422:   b->imax = 0;
423:   b->row  = perm;
424:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
425:   PetscObjectReference((PetscObject)perm);
426:   b->icol = perm;
427:   PetscObjectReference((PetscObject)perm);
428:   PetscMalloc((mbs+1)*sizeof(PetscScalar),&b->solve_work);
429:   PetscLogObjectMemory(B,(ui[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
430:   b->maxnz = b->nz = ui[mbs];
431: 
432:   B->factor                 = FACTOR_CHOLESKY;
433:   B->info.factor_mallocs    = reallocs;
434:   B->info.fill_ratio_given  = fill;
435:   if (ai[mbs] != 0) {
436:     B->info.fill_ratio_needed = ((PetscReal)ui[mbs])/((PetscReal)ai[mbs]);
437:   } else {
438:     B->info.fill_ratio_needed = 0.0;
439:   }

441:   if (perm_identity){
442:     switch (bs) {
443:       case 1:
444:         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
445:         B->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
446:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=1\n");
447:         break;
448:       case 2:
449:         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
450:         B->ops->solve           = MatSolve_SeqSBAIJ_2_NaturalOrdering;
451:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=2\n");
452:         break;
453:       case 3:
454:         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
455:         B->ops->solve           = MatSolve_SeqSBAIJ_3_NaturalOrdering;
456:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=3\n");
457:         break;
458:       case 4:
459:         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
460:         B->ops->solve           = MatSolve_SeqSBAIJ_4_NaturalOrdering;
461:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=4\n");
462:         break;
463:       case 5:
464:         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
465:         B->ops->solve           = MatSolve_SeqSBAIJ_5_NaturalOrdering;
466:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=5\n");
467:         break;
468:       case 6:
469:         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
470:         B->ops->solve           = MatSolve_SeqSBAIJ_6_NaturalOrdering;
471:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=6\n");
472:         break;
473:       case 7:
474:         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
475:         B->ops->solve           = MatSolve_SeqSBAIJ_7_NaturalOrdering;
476:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS=7\n");
477:       break;
478:       default:
479:         B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
480:         B->ops->solve           = MatSolve_SeqSBAIJ_N_NaturalOrdering;
481:         PetscInfo(A,"Using special in-place natural ordering factor and solve BS>7\n");
482:       break;
483:     }
484:   }
485:   return(0);
486: }
489: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat A,MatFactorInfo *info,Mat *B)
490: {
491:   Mat            C = *B;
492:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
493:   IS             perm = b->row;
495:   PetscInt       *perm_ptr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
496:   PetscInt       *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
497:   PetscInt       bs=A->rmap.bs,bs2 = a->bs2;
498:   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
499:   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
500:   MatScalar      *work;
501:   PetscInt       *pivots;

504:   /* initialization */
505:   PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);
506:   PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));
507:   PetscMalloc(2*mbs*sizeof(PetscInt),&il);
508:   jl   = il + mbs;
509:   for (i=0; i<mbs; i++) {
510:     jl[i] = mbs; il[0] = 0;
511:   }
512:   PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);
513:   uik  = dk + bs2;
514:   work = uik + bs2;
515:   PetscMalloc(bs*sizeof(PetscInt),&pivots);
516: 
517:   ISGetIndices(perm,&perm_ptr);
518: 
519:   /* check permutation */
520:   if (!a->permute){
521:     ai = a->i; aj = a->j; aa = a->a;
522:   } else {
523:     ai   = a->inew; aj = a->jnew;
524:     PetscMalloc(bs2*ai[mbs]*sizeof(MatScalar),&aa);
525:     PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));
526:     PetscMalloc(ai[mbs]*sizeof(PetscInt),&a2anew);
527:     PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));

529:     for (i=0; i<mbs; i++){
530:       jmin = ai[i]; jmax = ai[i+1];
531:       for (j=jmin; j<jmax; j++){
532:         while (a2anew[j] != j){
533:           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
534:           for (k1=0; k1<bs2; k1++){
535:             dk[k1]       = aa[k*bs2+k1];
536:             aa[k*bs2+k1] = aa[j*bs2+k1];
537:             aa[j*bs2+k1] = dk[k1];
538:           }
539:         }
540:         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
541:         if (i > aj[j]){
542:           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
543:           ap = aa + j*bs2;                     /* ptr to the beginning of j-th block of aa */
544:           for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */
545:           for (k=0; k<bs; k++){               /* j-th block of aa <- dk^T */
546:             for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1];
547:           }
548:         }
549:       }
550:     }
551:     PetscFree(a2anew);
552:   }
553: 
554:   /* for each row k */
555:   for (k = 0; k<mbs; k++){

557:     /*initialize k-th row with elements nonzero in row perm(k) of A */
558:     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
559: 
560:     ap = aa + jmin*bs2;
561:     for (j = jmin; j < jmax; j++){
562:       vj = perm_ptr[aj[j]];         /* block col. index */
563:       rtmp_ptr = rtmp + vj*bs2;
564:       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
565:     }

567:     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
568:     PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));
569:     i = jl[k]; /* first row to be added to k_th row  */

571:     while (i < k){
572:       nexti = jl[i]; /* next row to be added to k_th row */

574:       /* compute multiplier */
575:       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */

577:       /* uik = -inv(Di)*U_bar(i,k) */
578:       diag = ba + i*bs2;
579:       u    = ba + ili*bs2;
580:       PetscMemzero(uik,bs2*sizeof(MatScalar));
581:       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
582: 
583:       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
584:       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
585: 
586:       /* update -U(i,k) */
587:       PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));

589:       /* add multiple of row i to k-th row ... */
590:       jmin = ili + 1; jmax = bi[i+1];
591:       if (jmin < jmax){
592:         for (j=jmin; j<jmax; j++) {
593:           /* rtmp += -U(i,k)^T * U_bar(i,j) */
594:           rtmp_ptr = rtmp + bj[j]*bs2;
595:           u = ba + j*bs2;
596:           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
597:         }
598: 
599:         /* ... add i to row list for next nonzero entry */
600:         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
601:         j     = bj[jmin];
602:         jl[i] = jl[j]; jl[j] = i; /* update jl */
603:       }
604:       i = nexti;
605:     }

607:     /* save nonzero entries in k-th row of U ... */

609:     /* invert diagonal block */
610:     diag = ba+k*bs2;
611:     PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));
612:     Kernel_A_gets_inverse_A(bs,diag,pivots,work);
613: 
614:     jmin = bi[k]; jmax = bi[k+1];
615:     if (jmin < jmax) {
616:       for (j=jmin; j<jmax; j++){
617:          vj = bj[j];           /* block col. index of U */
618:          u   = ba + j*bs2;
619:          rtmp_ptr = rtmp + vj*bs2;
620:          for (k1=0; k1<bs2; k1++){
621:            *u++        = *rtmp_ptr;
622:            *rtmp_ptr++ = 0.0;
623:          }
624:       }
625: 
626:       /* ... add k to row list for first nonzero entry in k-th row */
627:       il[k] = jmin;
628:       i     = bj[jmin];
629:       jl[k] = jl[i]; jl[i] = k;
630:     }
631:   }

633:   PetscFree(rtmp);
634:   PetscFree(il);
635:   PetscFree(dk);
636:   PetscFree(pivots);
637:   if (a->permute){
638:     PetscFree(aa);
639:   }

641:   ISRestoreIndices(perm,&perm_ptr);
642:   C->factor       = FACTOR_CHOLESKY;
643:   C->assembled    = PETSC_TRUE;
644:   C->preallocated = PETSC_TRUE;
645:   PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
646:   return(0);
647: }

651: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat A,MatFactorInfo *info,Mat *B)
652: {
653:   Mat            C = *B;
654:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
656:   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
657:   PetscInt       *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
658:   PetscInt       bs=A->rmap.bs,bs2 = a->bs2;
659:   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
660:   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;
661:   MatScalar      *work;
662:   PetscInt       *pivots;

665:   PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);
666:   PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));
667:   PetscMalloc(2*mbs*sizeof(PetscInt),&il);
668:   jl   = il + mbs;
669:   for (i=0; i<mbs; i++) {
670:     jl[i] = mbs; il[0] = 0;
671:   }
672:   PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);
673:   uik  = dk + bs2;
674:   work = uik + bs2;
675:   PetscMalloc(bs*sizeof(PetscInt),&pivots);
676: 
677:   ai = a->i; aj = a->j; aa = a->a;
678: 
679:   /* for each row k */
680:   for (k = 0; k<mbs; k++){

682:     /*initialize k-th row with elements nonzero in row k of A */
683:     jmin = ai[k]; jmax = ai[k+1];
684:     ap = aa + jmin*bs2;
685:     for (j = jmin; j < jmax; j++){
686:       vj = aj[j];         /* block col. index */
687:       rtmp_ptr = rtmp + vj*bs2;
688:       for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++;
689:     }

691:     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
692:     PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));
693:     i = jl[k]; /* first row to be added to k_th row  */

695:     while (i < k){
696:       nexti = jl[i]; /* next row to be added to k_th row */

698:       /* compute multiplier */
699:       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */

701:       /* uik = -inv(Di)*U_bar(i,k) */
702:       diag = ba + i*bs2;
703:       u    = ba + ili*bs2;
704:       PetscMemzero(uik,bs2*sizeof(MatScalar));
705:       Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u);
706: 
707:       /* update D(k) += -U(i,k)^T * U_bar(i,k) */
708:       Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u);
709: 
710:       /* update -U(i,k) */
711:       PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));

713:       /* add multiple of row i to k-th row ... */
714:       jmin = ili + 1; jmax = bi[i+1];
715:       if (jmin < jmax){
716:         for (j=jmin; j<jmax; j++) {
717:           /* rtmp += -U(i,k)^T * U_bar(i,j) */
718:           rtmp_ptr = rtmp + bj[j]*bs2;
719:           u = ba + j*bs2;
720:           Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u);
721:         }
722: 
723:         /* ... add i to row list for next nonzero entry */
724:         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
725:         j     = bj[jmin];
726:         jl[i] = jl[j]; jl[j] = i; /* update jl */
727:       }
728:       i = nexti;
729:     }

731:     /* save nonzero entries in k-th row of U ... */

733:     /* invert diagonal block */
734:     diag = ba+k*bs2;
735:     PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));
736:     Kernel_A_gets_inverse_A(bs,diag,pivots,work);
737: 
738:     jmin = bi[k]; jmax = bi[k+1];
739:     if (jmin < jmax) {
740:       for (j=jmin; j<jmax; j++){
741:          vj = bj[j];           /* block col. index of U */
742:          u   = ba + j*bs2;
743:          rtmp_ptr = rtmp + vj*bs2;
744:          for (k1=0; k1<bs2; k1++){
745:            *u++        = *rtmp_ptr;
746:            *rtmp_ptr++ = 0.0;
747:          }
748:       }
749: 
750:       /* ... add k to row list for first nonzero entry in k-th row */
751:       il[k] = jmin;
752:       i     = bj[jmin];
753:       jl[k] = jl[i]; jl[i] = k;
754:     }
755:   }

757:   PetscFree(rtmp);
758:   PetscFree(il);
759:   PetscFree(dk);
760:   PetscFree(pivots);

762:   C->factor    = FACTOR_CHOLESKY;
763:   C->assembled = PETSC_TRUE;
764:   C->preallocated = PETSC_TRUE;
765:   PetscLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
766:   return(0);
767: }

769: /*
770:     Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP.
771:     Version for blocks 2 by 2.
772: */
775: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat A,MatFactorInfo *info,Mat *B)
776: {
777:   Mat            C = *B;
778:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
779:   IS             perm = b->row;
781:   PetscInt       *perm_ptr,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
782:   PetscInt       *ai,*aj,*a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
783:   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
784:   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;

787:   /* initialization */
788:   /* il and jl record the first nonzero element in each row of the accessing 
789:      window U(0:k, k:mbs-1).
790:      jl:    list of rows to be added to uneliminated rows 
791:             i>= k: jl(i) is the first row to be added to row i
792:             i<  k: jl(i) is the row following row i in some list of rows
793:             jl(i) = mbs indicates the end of a list                        
794:      il(i): points to the first nonzero element in columns k,...,mbs-1 of 
795:             row i of U */
796:   PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);
797:   PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));
798:   PetscMalloc(2*mbs*sizeof(PetscInt),&il);
799:   jl   = il + mbs;
800:   for (i=0; i<mbs; i++) {
801:     jl[i] = mbs; il[0] = 0;
802:   }
803:   PetscMalloc(8*sizeof(MatScalar),&dk);
804:   uik  = dk + 4;
805:   ISGetIndices(perm,&perm_ptr);

807:   /* check permutation */
808:   if (!a->permute){
809:     ai = a->i; aj = a->j; aa = a->a;
810:   } else {
811:     ai   = a->inew; aj = a->jnew;
812:     PetscMalloc(4*ai[mbs]*sizeof(MatScalar),&aa);
813:     PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));
814:     PetscMalloc(ai[mbs]*sizeof(PetscInt),&a2anew);
815:     PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));

817:     for (i=0; i<mbs; i++){
818:       jmin = ai[i]; jmax = ai[i+1];
819:       for (j=jmin; j<jmax; j++){
820:         while (a2anew[j] != j){
821:           k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k;
822:           for (k1=0; k1<4; k1++){
823:             dk[k1]       = aa[k*4+k1];
824:             aa[k*4+k1] = aa[j*4+k1];
825:             aa[j*4+k1] = dk[k1];
826:           }
827:         }
828:         /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */
829:         if (i > aj[j]){
830:           /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */
831:           ap = aa + j*4;     /* ptr to the beginning of the block */
832:           dk[1] = ap[1];     /* swap ap[1] and ap[2] */
833:           ap[1] = ap[2];
834:           ap[2] = dk[1];
835:         }
836:       }
837:     }
838:     PetscFree(a2anew);
839:   }

841:   /* for each row k */
842:   for (k = 0; k<mbs; k++){

844:     /*initialize k-th row with elements nonzero in row perm(k) of A */
845:     jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1];
846:     ap = aa + jmin*4;
847:     for (j = jmin; j < jmax; j++){
848:       vj = perm_ptr[aj[j]];         /* block col. index */
849:       rtmp_ptr = rtmp + vj*4;
850:       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
851:     }

853:     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
854:     PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));
855:     i = jl[k]; /* first row to be added to k_th row  */

857:     while (i < k){
858:       nexti = jl[i]; /* next row to be added to k_th row */

860:       /* compute multiplier */
861:       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */

863:       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
864:       diag = ba + i*4;
865:       u    = ba + ili*4;
866:       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
867:       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
868:       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
869:       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
870: 
871:       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
872:       dk[0] += uik[0]*u[0] + uik[1]*u[1];
873:       dk[1] += uik[2]*u[0] + uik[3]*u[1];
874:       dk[2] += uik[0]*u[2] + uik[1]*u[3];
875:       dk[3] += uik[2]*u[2] + uik[3]*u[3];

877:       /* update -U(i,k): ba[ili] = uik */
878:       PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));

880:       /* add multiple of row i to k-th row ... */
881:       jmin = ili + 1; jmax = bi[i+1];
882:       if (jmin < jmax){
883:         for (j=jmin; j<jmax; j++) {
884:           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
885:           rtmp_ptr = rtmp + bj[j]*4;
886:           u = ba + j*4;
887:           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
888:           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
889:           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
890:           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
891:         }
892: 
893:         /* ... add i to row list for next nonzero entry */
894:         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
895:         j     = bj[jmin];
896:         jl[i] = jl[j]; jl[j] = i; /* update jl */
897:       }
898:       i = nexti;
899:     }

901:     /* save nonzero entries in k-th row of U ... */

903:     /* invert diagonal block */
904:     diag = ba+k*4;
905:     PetscMemcpy(diag,dk,4*sizeof(MatScalar));
906:     Kernel_A_gets_inverse_A_2(diag);
907: 
908:     jmin = bi[k]; jmax = bi[k+1];
909:     if (jmin < jmax) {
910:       for (j=jmin; j<jmax; j++){
911:          vj = bj[j];           /* block col. index of U */
912:          u   = ba + j*4;
913:          rtmp_ptr = rtmp + vj*4;
914:          for (k1=0; k1<4; k1++){
915:            *u++        = *rtmp_ptr;
916:            *rtmp_ptr++ = 0.0;
917:          }
918:       }
919: 
920:       /* ... add k to row list for first nonzero entry in k-th row */
921:       il[k] = jmin;
922:       i     = bj[jmin];
923:       jl[k] = jl[i]; jl[i] = k;
924:     }
925:   }

927:   PetscFree(rtmp);
928:   PetscFree(il);
929:   PetscFree(dk);
930:   if (a->permute) {
931:     PetscFree(aa);
932:   }
933:   ISRestoreIndices(perm,&perm_ptr);
934:   C->factor    = FACTOR_CHOLESKY;
935:   C->assembled = PETSC_TRUE;
936:   C->preallocated = PETSC_TRUE;
937:   PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
938:   return(0);
939: }

941: /*
942:       Version for when blocks are 2 by 2 Using natural ordering
943: */
946: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,MatFactorInfo *info,Mat *B)
947: {
948:   Mat            C = *B;
949:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
951:   PetscInt       i,j,mbs=a->mbs,*bi=b->i,*bj=b->j;
952:   PetscInt       *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili;
953:   MatScalar      *ba = b->a,*aa,*ap,*dk,*uik;
954:   MatScalar      *u,*diag,*rtmp,*rtmp_ptr;

957:   /* initialization */
958:   /* il and jl record the first nonzero element in each row of the accessing 
959:      window U(0:k, k:mbs-1).
960:      jl:    list of rows to be added to uneliminated rows 
961:             i>= k: jl(i) is the first row to be added to row i
962:             i<  k: jl(i) is the row following row i in some list of rows
963:             jl(i) = mbs indicates the end of a list                        
964:      il(i): points to the first nonzero element in columns k,...,mbs-1 of 
965:             row i of U */
966:   PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);
967:   PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));
968:   PetscMalloc(2*mbs*sizeof(PetscInt),&il);
969:   jl   = il + mbs;
970:   for (i=0; i<mbs; i++) {
971:     jl[i] = mbs; il[0] = 0;
972:   }
973:   PetscMalloc(8*sizeof(MatScalar),&dk);
974:   uik  = dk + 4;
975: 
976:   ai = a->i; aj = a->j; aa = a->a;

978:   /* for each row k */
979:   for (k = 0; k<mbs; k++){

981:     /*initialize k-th row with elements nonzero in row k of A */
982:     jmin = ai[k]; jmax = ai[k+1];
983:     ap = aa + jmin*4;
984:     for (j = jmin; j < jmax; j++){
985:       vj = aj[j];         /* block col. index */
986:       rtmp_ptr = rtmp + vj*4;
987:       for (i=0; i<4; i++) *rtmp_ptr++ = *ap++;
988:     }
989: 
990:     /* modify k-th row by adding in those rows i with U(i,k) != 0 */
991:     PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));
992:     i = jl[k]; /* first row to be added to k_th row  */

994:     while (i < k){
995:       nexti = jl[i]; /* next row to be added to k_th row */

997:       /* compute multiplier */
998:       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */

1000:       /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */
1001:       diag = ba + i*4;
1002:       u    = ba + ili*4;
1003:       uik[0] = -(diag[0]*u[0] + diag[2]*u[1]);
1004:       uik[1] = -(diag[1]*u[0] + diag[3]*u[1]);
1005:       uik[2] = -(diag[0]*u[2] + diag[2]*u[3]);
1006:       uik[3] = -(diag[1]*u[2] + diag[3]*u[3]);
1007: 
1008:       /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */
1009:       dk[0] += uik[0]*u[0] + uik[1]*u[1];
1010:       dk[1] += uik[2]*u[0] + uik[3]*u[1];
1011:       dk[2] += uik[0]*u[2] + uik[1]*u[3];
1012:       dk[3] += uik[2]*u[2] + uik[3]*u[3];

1014:       /* update -U(i,k): ba[ili] = uik */
1015:       PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));

1017:       /* add multiple of row i to k-th row ... */
1018:       jmin = ili + 1; jmax = bi[i+1];
1019:       if (jmin < jmax){
1020:         for (j=jmin; j<jmax; j++) {
1021:           /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */
1022:           rtmp_ptr = rtmp + bj[j]*4;
1023:           u = ba + j*4;
1024:           rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1];
1025:           rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1];
1026:           rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3];
1027:           rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3];
1028:         }
1029: 
1030:         /* ... add i to row list for next nonzero entry */
1031:         il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
1032:         j     = bj[jmin];
1033:         jl[i] = jl[j]; jl[j] = i; /* update jl */
1034:       }
1035:       i = nexti;
1036:     }

1038:     /* save nonzero entries in k-th row of U ... */

1040:     /* invert diagonal block */
1041:     diag = ba+k*4;
1042:     PetscMemcpy(diag,dk,4*sizeof(MatScalar));
1043:     Kernel_A_gets_inverse_A_2(diag);
1044: 
1045:     jmin = bi[k]; jmax = bi[k+1];
1046:     if (jmin < jmax) {
1047:       for (j=jmin; j<jmax; j++){
1048:          vj = bj[j];           /* block col. index of U */
1049:          u   = ba + j*4;
1050:          rtmp_ptr = rtmp + vj*4;
1051:          for (k1=0; k1<4; k1++){
1052:            *u++        = *rtmp_ptr;
1053:            *rtmp_ptr++ = 0.0;
1054:          }
1055:       }
1056: 
1057:       /* ... add k to row list for first nonzero entry in k-th row */
1058:       il[k] = jmin;
1059:       i     = bj[jmin];
1060:       jl[k] = jl[i]; jl[i] = k;
1061:     }
1062:   }

1064:   PetscFree(rtmp);
1065:   PetscFree(il);
1066:   PetscFree(dk);

1068:   C->factor    = FACTOR_CHOLESKY;
1069:   C->assembled = PETSC_TRUE;
1070:   C->preallocated = PETSC_TRUE;
1071:   PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
1072:   return(0);
1073: }

1075: /*
1076:     Numeric U^T*D*U factorization for SBAIJ format. 
1077:     Version for blocks are 1 by 1.
1078: */
1081: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1(Mat A,MatFactorInfo *info,Mat *B)
1082: {
1083:   Mat            C = *B;
1084:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ *)C->data;
1085:   IS             ip=b->row;
1087:   PetscInt       *rip,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j,*bcol;
1088:   PetscInt       *ai,*aj,*a2anew;
1089:   PetscInt       k,jmin,jmax,*jl,*il,col,nexti,ili,nz;
1090:   MatScalar      *rtmp,*ba=b->a,*bval,*aa,dk,uikdi;
1091:   PetscReal      zeropivot,rs,shiftnz;
1092:   PetscReal      shiftpd;
1093:   ChShift_Ctx    sctx;
1094:   PetscInt       newshift;

1097:   /* initialization */
1098:   shiftnz   = info->shiftnz;
1099:   shiftpd   = info->shiftpd;
1100:   zeropivot = info->zeropivot;

1102:   ISGetIndices(ip,&rip);
1103:   if (!a->permute){
1104:     ai = a->i; aj = a->j; aa = a->a;
1105:   } else {
1106:     ai = a->inew; aj = a->jnew;
1107:     nz = ai[mbs];
1108:     PetscMalloc(nz*sizeof(MatScalar),&aa);
1109:     a2anew = a->a2anew;
1110:     bval   = a->a;
1111:     for (j=0; j<nz; j++){
1112:       aa[a2anew[j]] = *(bval++);
1113:     }
1114:   }
1115: 
1116:   /* initialization */
1117:   /* il and jl record the first nonzero element in each row of the accessing 
1118:      window U(0:k, k:mbs-1).
1119:      jl:    list of rows to be added to uneliminated rows 
1120:             i>= k: jl(i) is the first row to be added to row i
1121:             i<  k: jl(i) is the row following row i in some list of rows
1122:             jl(i) = mbs indicates the end of a list                        
1123:      il(i): points to the first nonzero element in columns k,...,mbs-1 of 
1124:             row i of U */
1125:   nz   = (2*mbs+1)*sizeof(PetscInt)+mbs*sizeof(MatScalar);
1126:   PetscMalloc(nz,&il);
1127:   jl   = il + mbs;
1128:   rtmp = (MatScalar*)(jl + mbs);

1130:   sctx.shift_amount = 0;
1131:   sctx.nshift       = 0;
1132:   do {
1133:     sctx.chshift = PETSC_FALSE;
1134:     for (i=0; i<mbs; i++) {
1135:       rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1136:     }
1137: 
1138:     for (k = 0; k<mbs; k++){
1139:       /*initialize k-th row by the perm[k]-th row of A */
1140:       jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
1141:       bval = ba + bi[k];
1142:       for (j = jmin; j < jmax; j++){
1143:         col = rip[aj[j]];
1144:         rtmp[col] = aa[j];
1145:         *bval++  = 0.0; /* for in-place factorization */
1146:       }

1148:       /* shift the diagonal of the matrix */
1149:       if (sctx.nshift) rtmp[k] += sctx.shift_amount;

1151:       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1152:       dk = rtmp[k];
1153:       i = jl[k]; /* first row to be added to k_th row  */

1155:       while (i < k){
1156:         nexti = jl[i]; /* next row to be added to k_th row */

1158:         /* compute multiplier, update diag(k) and U(i,k) */
1159:         ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1160:         uikdi = - ba[ili]*ba[bi[i]];  /* diagonal(k) */
1161:         dk += uikdi*ba[ili];
1162:         ba[ili] = uikdi; /* -U(i,k) */

1164:         /* add multiple of row i to k-th row */
1165:         jmin = ili + 1; jmax = bi[i+1];
1166:         if (jmin < jmax){
1167:           for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
1168:           /* update il and jl for row i */
1169:           il[i] = jmin;
1170:           j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
1171:         }
1172:         i = nexti;
1173:       }

1175:       /* shift the diagonals when zero pivot is detected */
1176:       /* compute rs=sum of abs(off-diagonal) */
1177:       rs   = 0.0;
1178:       jmin = bi[k]+1;
1179:       nz   = bi[k+1] - jmin;
1180:       if (nz){
1181:         bcol = bj + jmin;
1182:         while (nz--){
1183:           rs += PetscAbsScalar(rtmp[*bcol]);
1184:           bcol++;
1185:         }
1186:       }

1188:       sctx.rs = rs;
1189:       sctx.pv = dk;
1190:       MatCholeskyCheckShift_inline(info,sctx,newshift);
1191:       if (newshift == 1){
1192:         break;    /* sctx.shift_amount is updated */
1193:       } else if (newshift == -1){
1194:         SETERRQ4(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %G tolerance %G * rs %G",k,PetscAbsScalar(dk),zeropivot,rs);
1195:       }

1197:       /* copy data into U(k,:) */
1198:       ba[bi[k]] = 1.0/dk; /* U(k,k) */
1199:       jmin = bi[k]+1; jmax = bi[k+1];
1200:       if (jmin < jmax) {
1201:         for (j=jmin; j<jmax; j++){
1202:           col = bj[j]; ba[j] = rtmp[col]; rtmp[col] = 0.0;
1203:         }
1204:         /* add the k-th row into il and jl */
1205:         il[k] = jmin;
1206:         i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
1207:       }
1208:     }
1209:   } while (sctx.chshift);
1210:   PetscFree(il);
1211:   if (a->permute){PetscFree(aa);}

1213:   ISRestoreIndices(ip,&rip);
1214:   C->factor       = FACTOR_CHOLESKY;
1215:   C->assembled    = PETSC_TRUE;
1216:   C->preallocated = PETSC_TRUE;
1217:   PetscLogFlops(C->rmap.N);
1218:     if (sctx.nshift){
1219:     if (shiftnz) {
1220:       PetscInfo2(0,"number of shiftnz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
1221:     } else if (shiftpd) {
1222:       PetscInfo2(0,"number of shiftpd tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
1223:     }
1224:   }
1225:   return(0);
1226: }

1228: /*
1229:   Version for when blocks are 1 by 1 Using natural ordering
1230: */
1233: PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat A,MatFactorInfo *info,Mat *B)
1234: {
1235:   Mat            C = *B;
1236:   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ *)C->data;
1238:   PetscInt       i,j,mbs = a->mbs;
1239:   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
1240:   PetscInt       k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz;
1241:   MatScalar      *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval;
1242:   PetscReal      zeropivot,rs,shiftnz;
1243:   PetscReal      shiftpd;
1244:   ChShift_Ctx    sctx;
1245:   PetscInt       newshift;

1248:   /* initialization */
1249:   shiftnz   = info->shiftnz;
1250:   shiftpd   = info->shiftpd;
1251:   zeropivot = info->zeropivot;

1253:   /* il and jl record the first nonzero element in each row of the accessing 
1254:      window U(0:k, k:mbs-1).
1255:      jl:    list of rows to be added to uneliminated rows 
1256:             i>= k: jl(i) is the first row to be added to row i
1257:             i<  k: jl(i) is the row following row i in some list of rows
1258:             jl(i) = mbs indicates the end of a list                        
1259:      il(i): points to the first nonzero element in U(i,k:mbs-1) 
1260:   */
1261:   PetscMalloc(mbs*sizeof(MatScalar),&rtmp);
1262:   PetscMalloc(2*mbs*sizeof(PetscInt),&il);
1263:   jl   = il + mbs;

1265:   sctx.shift_amount = 0;
1266:   sctx.nshift       = 0;
1267:   do {
1268:     sctx.chshift = PETSC_FALSE;
1269:     for (i=0; i<mbs; i++) {
1270:       rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
1271:     }

1273:     for (k = 0; k<mbs; k++){
1274:       /*initialize k-th row with elements nonzero in row perm(k) of A */
1275:       nz   = ai[k+1] - ai[k];
1276:       acol = aj + ai[k];
1277:       aval = aa + ai[k];
1278:       bval = ba + bi[k];
1279:       while (nz -- ){
1280:         rtmp[*acol++] = *aval++;
1281:         *bval++       = 0.0; /* for in-place factorization */
1282:       }

1284:       /* shift the diagonal of the matrix */
1285:       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
1286: 
1287:       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
1288:       dk = rtmp[k];
1289:       i  = jl[k]; /* first row to be added to k_th row  */

1291:       while (i < k){
1292:         nexti = jl[i]; /* next row to be added to k_th row */
1293:         /* compute multiplier, update D(k) and U(i,k) */
1294:         ili   = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
1295:         uikdi = - ba[ili]*ba[bi[i]];
1296:         dk   += uikdi*ba[ili];
1297:         ba[ili] = uikdi; /* -U(i,k) */

1299:         /* add multiple of row i to k-th row ... */
1300:         jmin = ili + 1;
1301:         nz   = bi[i+1] - jmin;
1302:         if (nz > 0){
1303:           bcol = bj + jmin;
1304:           bval = ba + jmin;
1305:           while (nz --) rtmp[*bcol++] += uikdi*(*bval++);
1306:           /* update il and jl for i-th row */
1307:           il[i] = jmin;
1308:           j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
1309:         }
1310:         i = nexti;
1311:       }

1313:       /* shift the diagonals when zero pivot is detected */
1314:       /* compute rs=sum of abs(off-diagonal) */
1315:       rs   = 0.0;
1316:       jmin = bi[k]+1;
1317:       nz   = bi[k+1] - jmin;
1318:       if (nz){
1319:         bcol = bj + jmin;
1320:         while (nz--){
1321:           rs += PetscAbsScalar(rtmp[*bcol]);
1322:           bcol++;
1323:         }
1324:       }

1326:       sctx.rs = rs;
1327:       sctx.pv = dk;
1328:       MatCholeskyCheckShift_inline(info,sctx,newshift);
1329:       if (newshift == 1){
1330:         break;    /* sctx.shift_amount is updated */
1331:       } else if (newshift == -1){
1332:         SETERRQ4(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot row %D value %G tolerance %G * rs %G",k,PetscAbsScalar(dk),zeropivot,rs);
1333:       }
1334: 
1335:       /* copy data into U(k,:) */
1336:       ba[bi[k]] = 1.0/dk;
1337:       jmin      = bi[k]+1;
1338:       nz        = bi[k+1] - jmin;
1339:       if (nz){
1340:         bcol = bj + jmin;
1341:         bval = ba + jmin;
1342:         while (nz--){
1343:           *bval++       = rtmp[*bcol];
1344:           rtmp[*bcol++] = 0.0;
1345:         }
1346:         /* add k-th row into il and jl */
1347:         il[k] = jmin;
1348:         i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
1349:       }
1350:     } /* end of for (k = 0; k<mbs; k++) */
1351:   } while (sctx.chshift);
1352:   PetscFree(rtmp);
1353:   PetscFree(il);
1354: 
1355:   C->factor       = FACTOR_CHOLESKY;
1356:   C->assembled    = PETSC_TRUE;
1357:   C->preallocated = PETSC_TRUE;
1358:   PetscLogFlops(C->rmap.N);
1359:   if (sctx.nshift){
1360:     if (shiftnz) {
1361:       PetscInfo2(0,"number of shiftnz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
1362:     } else if (shiftpd) {
1363:       PetscInfo2(0,"number of shiftpd tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
1364:     }
1365:   }
1366:   return(0);
1367: }

1371: PetscErrorCode MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info)
1372: {
1374:   Mat            C;

1377:   MatCholeskyFactorSymbolic(A,perm,info,&C);
1378:   MatCholeskyFactorNumeric(A,info,&C);
1379:   MatHeaderCopy(A,C);
1380:   return(0);
1381: }