Actual source code: sbaijfact2.c
1: #define PETSCMAT_DLL
3: /*
4: Factorization code for SBAIJ format.
5: */
7: #include src/mat/impls/sbaij/seq/sbaij.h
8: #include src/mat/impls/baij/seq/baij.h
9: #include src/inline/ilu.h
10: #include src/inline/dot.h
14: PetscErrorCode MatSolve_SeqSBAIJ_N(Mat A,Vec bb,Vec xx)
15: {
16: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
17: IS isrow=a->row;
18: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
19: PetscErrorCode ierr;
20: PetscInt nz,*vj,k,*r,idx,k1;
21: PetscInt bs=A->rmap.bs,bs2 = a->bs2;
22: MatScalar *aa=a->a,*v,*diag;
23: PetscScalar *x,*xk,*xj,*b,*xk_tmp,*t;
26: VecGetArray(bb,&b);
27: VecGetArray(xx,&x);
28: t = a->solve_work;
29: ISGetIndices(isrow,&r);
30: PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);
32: /* solve U^T * D * y = b by forward substitution */
33: xk = t;
34: for (k=0; k<mbs; k++) { /* t <- perm(b) */
35: idx = bs*r[k];
36: for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
37: }
38: for (k=0; k<mbs; k++){
39: v = aa + bs2*ai[k];
40: xk = t + k*bs; /* Dk*xk = k-th block of x */
41: PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
42: nz = ai[k+1] - ai[k];
43: vj = aj + ai[k];
44: xj = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
45: while (nz--) {
46: /* x(:) += U(k,:)^T*(Dk*xk) */
47: Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
48: vj++; xj = t + (*vj)*bs;
49: v += bs2;
50: }
51: /* xk = inv(Dk)*(Dk*xk) */
52: diag = aa+k*bs2; /* ptr to inv(Dk) */
53: Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
54: }
56: /* solve U*x = y by back substitution */
57: for (k=mbs-1; k>=0; k--){
58: v = aa + bs2*ai[k];
59: xk = t + k*bs; /* xk */
60: nz = ai[k+1] - ai[k];
61: vj = aj + ai[k];
62: xj = t + (*vj)*bs;
63: while (nz--) {
64: /* xk += U(k,:)*x(:) */
65: Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
66: vj++;
67: v += bs2; xj = t + (*vj)*bs;
68: }
69: idx = bs*r[k];
70: for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
71: }
73: PetscFree(xk_tmp);
74: ISRestoreIndices(isrow,&r);
75: VecRestoreArray(bb,&b);
76: VecRestoreArray(xx,&x);
77: PetscLogFlops(bs2*(2*a->nz + mbs));
78: return(0);
79: }
83: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering(Mat A,Vec bb,Vec xx)
84: {
85: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
87: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
88: PetscInt nz,*vj,k;
89: PetscInt bs=A->rmap.bs,bs2 = a->bs2;
90: MatScalar *aa=a->a,*v,*diag;
91: PetscScalar *x,*xk,*xj,*b,*xk_tmp;
94:
95: VecGetArray(bb,&b);
96: VecGetArray(xx,&x);
98: PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);
100: /* solve U^T * D * y = b by forward substitution */
101: PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar)); /* x <- b */
102: for (k=0; k<mbs; k++){
103: v = aa + bs2*ai[k];
104: xk = x + k*bs; /* Dk*xk = k-th block of x */
105: PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar)); /* xk_tmp <- xk */
106: nz = ai[k+1] - ai[k];
107: vj = aj + ai[k];
108: xj = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
109: while (nz--) {
110: /* x(:) += U(k,:)^T*(Dk*xk) */
111: Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
112: vj++; xj = x + (*vj)*bs;
113: v += bs2;
114: }
115: /* xk = inv(Dk)*(Dk*xk) */
116: diag = aa+k*bs2; /* ptr to inv(Dk) */
117: Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
118: }
120: /* solve U*x = y by back substitution */
121: for (k=mbs-1; k>=0; k--){
122: v = aa + bs2*ai[k];
123: xk = x + k*bs; /* xk */
124: nz = ai[k+1] - ai[k];
125: vj = aj + ai[k];
126: xj = x + (*vj)*bs;
127: while (nz--) {
128: /* xk += U(k,:)*x(:) */
129: Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
130: vj++;
131: v += bs2; xj = x + (*vj)*bs;
132: }
133: }
135: PetscFree(xk_tmp);
136: VecRestoreArray(bb,&b);
137: VecRestoreArray(xx,&x);
138: PetscLogFlops(bs2*(2*a->nz + mbs));
139: return(0);
140: }
144: PetscErrorCode MatSolve_SeqSBAIJ_7(Mat A,Vec bb,Vec xx)
145: {
146: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
147: IS isrow=a->row;
148: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
150: PetscInt nz,*vj,k,*r,idx;
151: MatScalar *aa=a->a,*v,*d;
152: PetscScalar *x,*b,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
155: VecGetArray(bb,&b);
156: VecGetArray(xx,&x);
157: t = a->solve_work;
158: ISGetIndices(isrow,&r);
160: /* solve U^T * D * y = b by forward substitution */
161: tp = t;
162: for (k=0; k<mbs; k++) { /* t <- perm(b) */
163: idx = 7*r[k];
164: tp[0] = b[idx];
165: tp[1] = b[idx+1];
166: tp[2] = b[idx+2];
167: tp[3] = b[idx+3];
168: tp[4] = b[idx+4];
169: tp[5] = b[idx+5];
170: tp[6] = b[idx+6];
171: tp += 7;
172: }
173:
174: for (k=0; k<mbs; k++){
175: v = aa + 49*ai[k];
176: vj = aj + ai[k];
177: tp = t + k*7;
178: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
179: nz = ai[k+1] - ai[k];
180: tp = t + (*vj)*7;
181: while (nz--) {
182: tp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
183: tp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
184: tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
185: tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
186: tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
187: tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
188: tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
189: vj++; tp = t + (*vj)*7;
190: v += 49;
191: }
193: /* xk = inv(Dk)*(Dk*xk) */
194: d = aa+k*49; /* ptr to inv(Dk) */
195: tp = t + k*7;
196: tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
197: tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
198: tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
199: tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
200: tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
201: tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
202: tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
203: }
205: /* solve U*x = y by back substitution */
206: for (k=mbs-1; k>=0; k--){
207: v = aa + 49*ai[k];
208: vj = aj + ai[k];
209: tp = t + k*7;
210: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; /* xk */
211: nz = ai[k+1] - ai[k];
212:
213: tp = t + (*vj)*7;
214: while (nz--) {
215: /* xk += U(k,:)*x(:) */
216: x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
217: x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
218: x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
219: x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
220: x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
221: x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
222: x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
223: vj++; tp = t + (*vj)*7;
224: v += 49;
225: }
226: tp = t + k*7;
227: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
228: idx = 7*r[k];
229: x[idx] = x0;
230: x[idx+1] = x1;
231: x[idx+2] = x2;
232: x[idx+3] = x3;
233: x[idx+4] = x4;
234: x[idx+5] = x5;
235: x[idx+6] = x6;
236: }
238: ISRestoreIndices(isrow,&r);
239: VecRestoreArray(bb,&b);
240: VecRestoreArray(xx,&x);
241: PetscLogFlops(49*(2*a->nz + mbs));
242: return(0);
243: }
247: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
248: {
249: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
251: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
252: MatScalar *aa=a->a,*v,*d;
253: PetscScalar *x,*xp,*b,x0,x1,x2,x3,x4,x5,x6;
254: PetscInt nz,*vj,k;
257: VecGetArray(bb,&b);
258: VecGetArray(xx,&x);
259:
260: /* solve U^T * D * y = b by forward substitution */
261: PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar)); /* x <- b */
262: for (k=0; k<mbs; k++){
263: v = aa + 49*ai[k];
264: xp = x + k*7;
265: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
266: nz = ai[k+1] - ai[k];
267: vj = aj + ai[k];
268: xp = x + (*vj)*7;
269: while (nz--) {
270: /* x(:) += U(k,:)^T*(Dk*xk) */
271: xp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
272: xp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
273: xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
274: xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
275: xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
276: xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
277: xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
278: vj++; xp = x + (*vj)*7;
279: v += 49;
280: }
281: /* xk = inv(Dk)*(Dk*xk) */
282: d = aa+k*49; /* ptr to inv(Dk) */
283: xp = x + k*7;
284: xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
285: xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
286: xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
287: xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
288: xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
289: xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
290: xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
291: }
293: /* solve U*x = y by back substitution */
294: for (k=mbs-1; k>=0; k--){
295: v = aa + 49*ai[k];
296: xp = x + k*7;
297: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
298: nz = ai[k+1] - ai[k];
299: vj = aj + ai[k];
300: xp = x + (*vj)*7;
301: while (nz--) {
302: /* xk += U(k,:)*x(:) */
303: x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
304: x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
305: x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
306: x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
307: x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
308: x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
309: x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
310: vj++;
311: v += 49; xp = x + (*vj)*7;
312: }
313: xp = x + k*7;
314: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
315: }
317: VecRestoreArray(bb,&b);
318: VecRestoreArray(xx,&x);
319: PetscLogFlops(49*(2*a->nz + mbs));
320: return(0);
321: }
325: PetscErrorCode MatSolve_SeqSBAIJ_6(Mat A,Vec bb,Vec xx)
326: {
327: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
328: IS isrow=a->row;
329: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
331: PetscInt nz,*vj,k,*r,idx;
332: MatScalar *aa=a->a,*v,*d;
333: PetscScalar *x,*b,x0,x1,x2,x3,x4,x5,*t,*tp;
336: VecGetArray(bb,&b);
337: VecGetArray(xx,&x);
338: t = a->solve_work;
339: ISGetIndices(isrow,&r);
341: /* solve U^T * D * y = b by forward substitution */
342: tp = t;
343: for (k=0; k<mbs; k++) { /* t <- perm(b) */
344: idx = 6*r[k];
345: tp[0] = b[idx];
346: tp[1] = b[idx+1];
347: tp[2] = b[idx+2];
348: tp[3] = b[idx+3];
349: tp[4] = b[idx+4];
350: tp[5] = b[idx+5];
351: tp += 6;
352: }
353:
354: for (k=0; k<mbs; k++){
355: v = aa + 36*ai[k];
356: vj = aj + ai[k];
357: tp = t + k*6;
358: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
359: nz = ai[k+1] - ai[k];
360: tp = t + (*vj)*6;
361: while (nz--) {
362: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
363: tp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
364: tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
365: tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
366: tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
367: tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
368: vj++; tp = t + (*vj)*6;
369: v += 36;
370: }
372: /* xk = inv(Dk)*(Dk*xk) */
373: d = aa+k*36; /* ptr to inv(Dk) */
374: tp = t + k*6;
375: tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
376: tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
377: tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
378: tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
379: tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
380: tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
381: }
383: /* solve U*x = y by back substitution */
384: for (k=mbs-1; k>=0; k--){
385: v = aa + 36*ai[k];
386: vj = aj + ai[k];
387: tp = t + k*6;
388: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
389: nz = ai[k+1] - ai[k];
390:
391: tp = t + (*vj)*6;
392: while (nz--) {
393: /* xk += U(k,:)*x(:) */
394: x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
395: x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
396: x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
397: x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
398: x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
399: x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
400: vj++; tp = t + (*vj)*6;
401: v += 36;
402: }
403: tp = t + k*6;
404: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
405: idx = 6*r[k];
406: x[idx] = x0;
407: x[idx+1] = x1;
408: x[idx+2] = x2;
409: x[idx+3] = x3;
410: x[idx+4] = x4;
411: x[idx+5] = x5;
412: }
414: ISRestoreIndices(isrow,&r);
415: VecRestoreArray(bb,&b);
416: VecRestoreArray(xx,&x);
417: PetscLogFlops(36*(2*a->nz + mbs));
418: return(0);
419: }
423: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
424: {
425: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
426: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
427: MatScalar *aa=a->a,*v,*d;
428: PetscScalar *x,*xp,*b,x0,x1,x2,x3,x4,x5;
430: PetscInt nz,*vj,k;
433:
434: VecGetArray(bb,&b);
435: VecGetArray(xx,&x);
436:
437: /* solve U^T * D * y = b by forward substitution */
438: PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar)); /* x <- b */
439: for (k=0; k<mbs; k++){
440: v = aa + 36*ai[k];
441: xp = x + k*6;
442: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
443: nz = ai[k+1] - ai[k];
444: vj = aj + ai[k];
445: xp = x + (*vj)*6;
446: while (nz--) {
447: /* x(:) += U(k,:)^T*(Dk*xk) */
448: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
449: xp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
450: xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
451: xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
452: xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
453: xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
454: vj++; xp = x + (*vj)*6;
455: v += 36;
456: }
457: /* xk = inv(Dk)*(Dk*xk) */
458: d = aa+k*36; /* ptr to inv(Dk) */
459: xp = x + k*6;
460: xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
461: xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
462: xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
463: xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
464: xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
465: xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
466: }
468: /* solve U*x = y by back substitution */
469: for (k=mbs-1; k>=0; k--){
470: v = aa + 36*ai[k];
471: xp = x + k*6;
472: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
473: nz = ai[k+1] - ai[k];
474: vj = aj + ai[k];
475: xp = x + (*vj)*6;
476: while (nz--) {
477: /* xk += U(k,:)*x(:) */
478: x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
479: x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
480: x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
481: x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
482: x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
483: x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
484: vj++;
485: v += 36; xp = x + (*vj)*6;
486: }
487: xp = x + k*6;
488: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
489: }
491: VecRestoreArray(bb,&b);
492: VecRestoreArray(xx,&x);
493: PetscLogFlops(36*(2*a->nz + mbs));
494: return(0);
495: }
499: PetscErrorCode MatSolve_SeqSBAIJ_5(Mat A,Vec bb,Vec xx)
500: {
501: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
502: IS isrow=a->row;
503: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
505: PetscInt nz,*vj,k,*r,idx;
506: MatScalar *aa=a->a,*v,*diag;
507: PetscScalar *x,*b,x0,x1,x2,x3,x4,*t,*tp;
510: VecGetArray(bb,&b);
511: VecGetArray(xx,&x);
512: t = a->solve_work;
513: ISGetIndices(isrow,&r);
515: /* solve U^T * D * y = b by forward substitution */
516: tp = t;
517: for (k=0; k<mbs; k++) { /* t <- perm(b) */
518: idx = 5*r[k];
519: tp[0] = b[idx];
520: tp[1] = b[idx+1];
521: tp[2] = b[idx+2];
522: tp[3] = b[idx+3];
523: tp[4] = b[idx+4];
524: tp += 5;
525: }
526:
527: for (k=0; k<mbs; k++){
528: v = aa + 25*ai[k];
529: vj = aj + ai[k];
530: tp = t + k*5;
531: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
532: nz = ai[k+1] - ai[k];
534: tp = t + (*vj)*5;
535: while (nz--) {
536: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
537: tp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
538: tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
539: tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
540: tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
541: vj++; tp = t + (*vj)*5;
542: v += 25;
543: }
545: /* xk = inv(Dk)*(Dk*xk) */
546: diag = aa+k*25; /* ptr to inv(Dk) */
547: tp = t + k*5;
548: tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
549: tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
550: tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
551: tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
552: tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
553: }
555: /* solve U*x = y by back substitution */
556: for (k=mbs-1; k>=0; k--){
557: v = aa + 25*ai[k];
558: vj = aj + ai[k];
559: tp = t + k*5;
560: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];/* xk */
561: nz = ai[k+1] - ai[k];
562:
563: tp = t + (*vj)*5;
564: while (nz--) {
565: /* xk += U(k,:)*x(:) */
566: x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
567: x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
568: x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
569: x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
570: x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
571: vj++; tp = t + (*vj)*5;
572: v += 25;
573: }
574: tp = t + k*5;
575: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
576: idx = 5*r[k];
577: x[idx] = x0;
578: x[idx+1] = x1;
579: x[idx+2] = x2;
580: x[idx+3] = x3;
581: x[idx+4] = x4;
582: }
584: ISRestoreIndices(isrow,&r);
585: VecRestoreArray(bb,&b);
586: VecRestoreArray(xx,&x);
587: PetscLogFlops(25*(2*a->nz + mbs));
588: return(0);
589: }
593: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
594: {
595: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
596: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
597: MatScalar *aa=a->a,*v,*diag;
598: PetscScalar *x,*xp,*b,x0,x1,x2,x3,x4;
600: PetscInt nz,*vj,k;
603:
604: VecGetArray(bb,&b);
605: VecGetArray(xx,&x);
607: /* solve U^T * D * y = b by forward substitution */
608: PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar)); /* x <- b */
609: for (k=0; k<mbs; k++){
610: v = aa + 25*ai[k];
611: xp = x + k*5;
612: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
613: nz = ai[k+1] - ai[k];
614: vj = aj + ai[k];
615: xp = x + (*vj)*5;
616: while (nz--) {
617: /* x(:) += U(k,:)^T*(Dk*xk) */
618: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
619: xp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
620: xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
621: xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
622: xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
623: vj++; xp = x + (*vj)*5;
624: v += 25;
625: }
626: /* xk = inv(Dk)*(Dk*xk) */
627: diag = aa+k*25; /* ptr to inv(Dk) */
628: xp = x + k*5;
629: xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
630: xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
631: xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
632: xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
633: xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
634: }
636: /* solve U*x = y by back substitution */
637: for (k=mbs-1; k>=0; k--){
638: v = aa + 25*ai[k];
639: xp = x + k*5;
640: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* xk */
641: nz = ai[k+1] - ai[k];
642: vj = aj + ai[k];
643: xp = x + (*vj)*5;
644: while (nz--) {
645: /* xk += U(k,:)*x(:) */
646: x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
647: x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
648: x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
649: x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
650: x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
651: vj++;
652: v += 25; xp = x + (*vj)*5;
653: }
654: xp = x + k*5;
655: xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
656: }
658: VecRestoreArray(bb,&b);
659: VecRestoreArray(xx,&x);
660: PetscLogFlops(25*(2*a->nz + mbs));
661: return(0);
662: }
666: PetscErrorCode MatSolve_SeqSBAIJ_4(Mat A,Vec bb,Vec xx)
667: {
668: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
669: IS isrow=a->row;
670: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
672: PetscInt nz,*vj,k,*r,idx;
673: MatScalar *aa=a->a,*v,*diag;
674: PetscScalar *x,*b,x0,x1,x2,x3,*t,*tp;
677: VecGetArray(bb,&b);
678: VecGetArray(xx,&x);
679: t = a->solve_work;
680: ISGetIndices(isrow,&r);
682: /* solve U^T * D * y = b by forward substitution */
683: tp = t;
684: for (k=0; k<mbs; k++) { /* t <- perm(b) */
685: idx = 4*r[k];
686: tp[0] = b[idx];
687: tp[1] = b[idx+1];
688: tp[2] = b[idx+2];
689: tp[3] = b[idx+3];
690: tp += 4;
691: }
692:
693: for (k=0; k<mbs; k++){
694: v = aa + 16*ai[k];
695: vj = aj + ai[k];
696: tp = t + k*4;
697: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
698: nz = ai[k+1] - ai[k];
700: tp = t + (*vj)*4;
701: while (nz--) {
702: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
703: tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
704: tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
705: tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
706: vj++; tp = t + (*vj)*4;
707: v += 16;
708: }
710: /* xk = inv(Dk)*(Dk*xk) */
711: diag = aa+k*16; /* ptr to inv(Dk) */
712: tp = t + k*4;
713: tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
714: tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
715: tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
716: tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
717: }
719: /* solve U*x = y by back substitution */
720: for (k=mbs-1; k>=0; k--){
721: v = aa + 16*ai[k];
722: vj = aj + ai[k];
723: tp = t + k*4;
724: x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
725: nz = ai[k+1] - ai[k];
726:
727: tp = t + (*vj)*4;
728: while (nz--) {
729: /* xk += U(k,:)*x(:) */
730: x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
731: x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
732: x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
733: x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
734: vj++; tp = t + (*vj)*4;
735: v += 16;
736: }
737: tp = t + k*4;
738: tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
739: idx = 4*r[k];
740: x[idx] = x0;
741: x[idx+1] = x1;
742: x[idx+2] = x2;
743: x[idx+3] = x3;
744: }
746: ISRestoreIndices(isrow,&r);
747: VecRestoreArray(bb,&b);
748: VecRestoreArray(xx,&x);
749: PetscLogFlops(16*(2*a->nz + mbs));
750: return(0);
751: }
753: /*
754: Special case where the matrix was factored in the natural ordering.
755: This eliminates the need for the column and row permutation.
756: */
759: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
760: {
761: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
762: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
763: MatScalar *aa=a->a,*v,*diag;
764: PetscScalar *x,*xp,*b,x0,x1,x2,x3;
766: PetscInt nz,*vj,k;
769:
770: VecGetArray(bb,&b);
771: VecGetArray(xx,&x);
773: /* solve U^T * D * y = b by forward substitution */
774: PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar)); /* x <- b */
775: for (k=0; k<mbs; k++){
776: v = aa + 16*ai[k];
777: xp = x + k*4;
778: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
779: nz = ai[k+1] - ai[k];
780: vj = aj + ai[k];
781: xp = x + (*vj)*4;
782: while (nz--) {
783: /* x(:) += U(k,:)^T*(Dk*xk) */
784: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
785: xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
786: xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
787: xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
788: vj++; xp = x + (*vj)*4;
789: v += 16;
790: }
791: /* xk = inv(Dk)*(Dk*xk) */
792: diag = aa+k*16; /* ptr to inv(Dk) */
793: xp = x + k*4;
794: xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
795: xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
796: xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
797: xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
798: }
800: /* solve U*x = y by back substitution */
801: for (k=mbs-1; k>=0; k--){
802: v = aa + 16*ai[k];
803: xp = x + k*4;
804: x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
805: nz = ai[k+1] - ai[k];
806: vj = aj + ai[k];
807: xp = x + (*vj)*4;
808: while (nz--) {
809: /* xk += U(k,:)*x(:) */
810: x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
811: x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
812: x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
813: x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
814: vj++;
815: v += 16; xp = x + (*vj)*4;
816: }
817: xp = x + k*4;
818: xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
819: }
821: VecRestoreArray(bb,&b);
822: VecRestoreArray(xx,&x);
823: PetscLogFlops(16*(2*a->nz + mbs));
824: return(0);
825: }
829: PetscErrorCode MatSolve_SeqSBAIJ_3(Mat A,Vec bb,Vec xx)
830: {
831: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
832: IS isrow=a->row;
833: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
835: PetscInt nz,*vj,k,*r,idx;
836: MatScalar *aa=a->a,*v,*diag;
837: PetscScalar *x,*b,x0,x1,x2,*t,*tp;
840: VecGetArray(bb,&b);
841: VecGetArray(xx,&x);
842: t = a->solve_work;
843: ISGetIndices(isrow,&r);
845: /* solve U^T * D * y = b by forward substitution */
846: tp = t;
847: for (k=0; k<mbs; k++) { /* t <- perm(b) */
848: idx = 3*r[k];
849: tp[0] = b[idx];
850: tp[1] = b[idx+1];
851: tp[2] = b[idx+2];
852: tp += 3;
853: }
854:
855: for (k=0; k<mbs; k++){
856: v = aa + 9*ai[k];
857: vj = aj + ai[k];
858: tp = t + k*3;
859: x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
860: nz = ai[k+1] - ai[k];
862: tp = t + (*vj)*3;
863: while (nz--) {
864: tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
865: tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
866: tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
867: vj++; tp = t + (*vj)*3;
868: v += 9;
869: }
871: /* xk = inv(Dk)*(Dk*xk) */
872: diag = aa+k*9; /* ptr to inv(Dk) */
873: tp = t + k*3;
874: tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
875: tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
876: tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
877: }
879: /* solve U*x = y by back substitution */
880: for (k=mbs-1; k>=0; k--){
881: v = aa + 9*ai[k];
882: vj = aj + ai[k];
883: tp = t + k*3;
884: x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; /* xk */
885: nz = ai[k+1] - ai[k];
886:
887: tp = t + (*vj)*3;
888: while (nz--) {
889: /* xk += U(k,:)*x(:) */
890: x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
891: x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
892: x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
893: vj++; tp = t + (*vj)*3;
894: v += 9;
895: }
896: tp = t + k*3;
897: tp[0] = x0; tp[1] = x1; tp[2] = x2;
898: idx = 3*r[k];
899: x[idx] = x0;
900: x[idx+1] = x1;
901: x[idx+2] = x2;
902: }
904: ISRestoreIndices(isrow,&r);
905: VecRestoreArray(bb,&b);
906: VecRestoreArray(xx,&x);
907: PetscLogFlops(9*(2*a->nz + mbs));
908: return(0);
909: }
911: /*
912: Special case where the matrix was factored in the natural ordering.
913: This eliminates the need for the column and row permutation.
914: */
917: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
918: {
919: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
920: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
921: MatScalar *aa=a->a,*v,*diag;
922: PetscScalar *x,*xp,*b,x0,x1,x2;
924: PetscInt nz,*vj,k;
927:
928: VecGetArray(bb,&b);
929: VecGetArray(xx,&x);
931: /* solve U^T * D * y = b by forward substitution */
932: PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));
933: for (k=0; k<mbs; k++){
934: v = aa + 9*ai[k];
935: xp = x + k*3;
936: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
937: nz = ai[k+1] - ai[k];
938: vj = aj + ai[k];
939: xp = x + (*vj)*3;
940: while (nz--) {
941: /* x(:) += U(k,:)^T*(Dk*xk) */
942: xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
943: xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
944: xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
945: vj++; xp = x + (*vj)*3;
946: v += 9;
947: }
948: /* xk = inv(Dk)*(Dk*xk) */
949: diag = aa+k*9; /* ptr to inv(Dk) */
950: xp = x + k*3;
951: xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
952: xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
953: xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
954: }
956: /* solve U*x = y by back substitution */
957: for (k=mbs-1; k>=0; k--){
958: v = aa + 9*ai[k];
959: xp = x + k*3;
960: x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* xk */
961: nz = ai[k+1] - ai[k];
962: vj = aj + ai[k];
963: xp = x + (*vj)*3;
964: while (nz--) {
965: /* xk += U(k,:)*x(:) */
966: x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
967: x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
968: x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
969: vj++;
970: v += 9; xp = x + (*vj)*3;
971: }
972: xp = x + k*3;
973: xp[0] = x0; xp[1] = x1; xp[2] = x2;
974: }
976: VecRestoreArray(bb,&b);
977: VecRestoreArray(xx,&x);
978: PetscLogFlops(9*(2*a->nz + mbs));
979: return(0);
980: }
984: PetscErrorCode MatSolve_SeqSBAIJ_2(Mat A,Vec bb,Vec xx)
985: {
986: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ *)A->data;
987: IS isrow=a->row;
988: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
990: PetscInt nz,*vj,k,k2,*r,idx;
991: MatScalar *aa=a->a,*v,*diag;
992: PetscScalar *x,*b,x0,x1,*t;
995: VecGetArray(bb,&b);
996: VecGetArray(xx,&x);
997: t = a->solve_work;
998: /* printf("called MatSolve_SeqSBAIJ_2\n"); */
999: ISGetIndices(isrow,&r);
1001: /* solve U^T * D * y = perm(b) by forward substitution */
1002: for (k=0; k<mbs; k++) { /* t <- perm(b) */
1003: idx = 2*r[k];
1004: t[k*2] = b[idx];
1005: t[k*2+1] = b[idx+1];
1006: }
1007: for (k=0; k<mbs; k++){
1008: v = aa + 4*ai[k];
1009: vj = aj + ai[k];
1010: k2 = k*2;
1011: x0 = t[k2]; x1 = t[k2+1];
1012: nz = ai[k+1] - ai[k];
1013: while (nz--) {
1014: t[(*vj)*2] += v[0]*x0 + v[1]*x1;
1015: t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1016: vj++; v += 4;
1017: }
1018: diag = aa+k*4; /* ptr to inv(Dk) */
1019: t[k2] = diag[0]*x0 + diag[2]*x1;
1020: t[k2+1] = diag[1]*x0 + diag[3]*x1;
1021: }
1023: /* solve U*x = y by back substitution */
1024: for (k=mbs-1; k>=0; k--){
1025: v = aa + 4*ai[k];
1026: vj = aj + ai[k];
1027: k2 = k*2;
1028: x0 = t[k2]; x1 = t[k2+1];
1029: nz = ai[k+1] - ai[k];
1030: while (nz--) {
1031: x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1032: x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1033: vj++; v += 4;
1034: }
1035: t[k2] = x0;
1036: t[k2+1] = x1;
1037: idx = 2*r[k];
1038: x[idx] = x0;
1039: x[idx+1] = x1;
1040: }
1042: ISRestoreIndices(isrow,&r);
1043: VecRestoreArray(bb,&b);
1044: VecRestoreArray(xx,&x);
1045: PetscLogFlops(4*(2*a->nz + mbs));
1046: return(0);
1047: }
1049: /*
1050: Special case where the matrix was factored in the natural ordering.
1051: This eliminates the need for the column and row permutation.
1052: */
1055: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1056: {
1057: Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data;
1058: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1059: MatScalar *aa=a->a,*v,*diag;
1060: PetscScalar *x,*b,x0,x1;
1062: PetscInt nz,*vj,k,k2;
1065:
1066: VecGetArray(bb,&b);
1067: VecGetArray(xx,&x);
1069: /* solve U^T * D * y = b by forward substitution */
1070: PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));
1071: for (k=0; k<mbs; k++){
1072: v = aa + 4*ai[k];
1073: vj = aj + ai[k];
1074: k2 = k*2;
1075: x0 = x[k2]; x1 = x[k2+1]; /* Dk*xk = k-th block of x */
1076: nz = ai[k+1] - ai[k];
1077:
1078: while (nz--) {
1079: /* x(:) += U(k,:)^T*(Dk*xk) */
1080: x[(*vj)*2] += v[0]*x0 + v[1]*x1;
1081: x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1082: vj++; v += 4;
1083: }
1084: /* xk = inv(Dk)*(Dk*xk) */
1085: diag = aa+k*4; /* ptr to inv(Dk) */
1086: x[k2] = diag[0]*x0 + diag[2]*x1;
1087: x[k2+1] = diag[1]*x0 + diag[3]*x1;
1088: }
1090: /* solve U*x = y by back substitution */
1091: for (k=mbs-1; k>=0; k--){
1092: v = aa + 4*ai[k];
1093: vj = aj + ai[k];
1094: k2 = k*2;
1095: x0 = x[k2]; x1 = x[k2+1]; /* xk */
1096: nz = ai[k+1] - ai[k];
1097: while (nz--) {
1098: /* xk += U(k,:)*x(:) */
1099: x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1100: x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1101: vj++; v += 4;
1102: }
1103: x[k2] = x0;
1104: x[k2+1] = x1;
1105: }
1107: VecRestoreArray(bb,&b);
1108: VecRestoreArray(xx,&x);
1109: PetscLogFlops(4*(2*a->nz + mbs)); /* bs2*(2*a->nz + mbs) */
1110: return(0);
1111: }
1115: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1116: {
1117: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1118: IS isrow=a->row;
1120: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rip;
1121: MatScalar *aa=a->a,*v;
1122: PetscScalar *x,*b,xk,*t;
1123: PetscInt nz,*vj,k;
1126: if (!mbs) return(0);
1127: /* printf(" MatSolve_SeqSBAIJ_1 is called\n"); */
1129: VecGetArray(bb,&b);
1130: VecGetArray(xx,&x);
1131: t = a->solve_work;
1133: ISGetIndices(isrow,&rip);
1134:
1135: /* solve U^T*D*y = perm(b) by forward substitution */
1136: for (k=0; k<mbs; k++) t[k] = b[rip[k]];
1137: for (k=0; k<mbs; k++){
1138: v = aa + ai[k] + 1;
1139: vj = aj + ai[k] + 1;
1140: xk = t[k];
1141: nz = ai[k+1] - ai[k] - 1;
1142: while (nz--) t[*vj++] += (*v++) * xk;
1143: t[k] = xk*aa[ai[k]]; /* aa[k] = 1/D(k) */
1144: }
1146: /* solve U*x = y by back substitution */
1147: for (k=mbs-1; k>=0; k--){
1148: v = aa + ai[k] + 1;
1149: vj = aj + ai[k] + 1;
1150: xk = t[k];
1151: nz = ai[k+1] - ai[k] - 1;
1152: while (nz--) xk += (*v++) * t[*vj++];
1153: t[k] = xk;
1154: x[rip[k]] = xk;
1155: }
1157: ISRestoreIndices(isrow,&rip);
1158: VecRestoreArray(bb,&b);
1159: VecRestoreArray(xx,&x);
1160: PetscLogFlops(4*a->nz + A->rmap.N);
1161: return(0);
1162: }
1166: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1167: {
1168: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1172: if (A->rmap.bs == 1) {
1173: MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1174: } else {
1175: IS isrow=a->row;
1176: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rip,i;
1177: MatScalar *aa=a->a,*v;
1178: PetscScalar *x,*b,*t;
1179: PetscInt nz,*vj,k,n;
1180: if (bb->n > a->solves_work_n) {
1181: PetscFree(a->solves_work);
1182: PetscMalloc(bb->n*A->rmap.N*sizeof(PetscScalar),&a->solves_work);
1183: a->solves_work_n = bb->n;
1184: }
1185: n = bb->n;
1186: VecGetArray(bb->v,&b);
1187: VecGetArray(xx->v,&x);
1188: t = a->solves_work;
1190: ISGetIndices(isrow,&rip);
1191:
1192: /* solve U^T*D*y = perm(b) by forward substitution */
1193: for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rip[k]+i*mbs];} /* values are stored interlaced in t */
1194: for (k=0; k<mbs; k++){
1195: v = aa + ai[k];
1196: vj = aj + ai[k];
1197: nz = ai[k+1] - ai[k];
1198: while (nz--) {
1199: for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1200: v++;vj++;
1201: }
1202: for (i=0; i<n; i++) t[n*k+i] *= aa[k]; /* note: aa[k] = 1/D(k) */
1203: }
1204:
1205: /* solve U*x = y by back substitution */
1206: for (k=mbs-1; k>=0; k--){
1207: v = aa + ai[k];
1208: vj = aj + ai[k];
1209: nz = ai[k+1] - ai[k];
1210: while (nz--) {
1211: for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1212: v++;vj++;
1213: }
1214: for (i=0; i<n; i++) x[rip[k]+i*mbs] = t[n*k+i];
1215: }
1217: ISRestoreIndices(isrow,&rip);
1218: VecRestoreArray(bb->v,&b);
1219: VecRestoreArray(xx->v,&x);
1220: PetscLogFlops(bb->n*(4*a->nz + A->rmap.N));
1221: }
1222: return(0);
1223: }
1225: /*
1226: Special case where the matrix was ILU(0) factored in the natural
1227: ordering. This eliminates the need for the column and row permutation.
1228: */
1231: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1232: {
1233: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1235: PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j;
1236: MatScalar *aa=a->a,*v;
1237: PetscScalar *x,*b,xk;
1238: PetscInt nz,*vj,k;
1241: VecGetArray(bb,&b);
1242: VecGetArray(xx,&x);
1243:
1244: /* solve U^T*D*y = b by forward substitution */
1245: PetscMemcpy(x,b,mbs*sizeof(PetscScalar));
1246: for (k=0; k<mbs; k++){
1247: v = aa + ai[k] + 1;
1248: vj = aj + ai[k] + 1;
1249: xk = x[k];
1250: nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */
1251: while (nz--) x[*vj++] += (*v++) * xk;
1252: x[k] = xk*aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */
1253: }
1255: /* solve U*x = y by back substitution */
1256: for (k=mbs-2; k>=0; k--){
1257: v = aa + ai[k] + 1;
1258: vj = aj + ai[k] + 1;
1259: xk = x[k];
1260: nz = ai[k+1] - ai[k] - 1;
1261: while (nz--) xk += (*v++) * x[*vj++];
1262: x[k] = xk;
1263: }
1265: VecRestoreArray(bb,&b);
1266: VecRestoreArray(xx,&x);
1267: PetscLogFlops(4*a->nz + A->rmap.N);
1268: return(0);
1269: }
1271: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
1274: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat A,IS perm,MatFactorInfo *info,Mat *B)
1275: {
1276: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
1278: PetscInt *rip,i,mbs = a->mbs,*ai = a->i,*aj = a->j;
1279: PetscInt *jutmp,bs = A->rmap.bs,bs2=a->bs2;
1280: PetscInt m,reallocs = 0,*levtmp;
1281: PetscInt *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
1282: PetscInt incrlev,*lev,shift,prow,nz;
1283: PetscReal f = info->fill,levels = info->levels;
1284: PetscTruth perm_identity;
1287: /* check whether perm is the identity mapping */
1288: ISIdentity(perm,&perm_identity);
1290: if (perm_identity){
1291: a->permute = PETSC_FALSE;
1292: ai = a->i; aj = a->j;
1293: } else { /* non-trivial permutation */
1294: a->permute = PETSC_TRUE;
1295: MatReorderingSeqSBAIJ(A, perm);
1296: ai = a->inew; aj = a->jnew;
1297: }
1298:
1299: /* initialization */
1300: ISGetIndices(perm,&rip);
1301: umax = (PetscInt)(f*ai[mbs] + 1);
1302: PetscMalloc(umax*sizeof(PetscInt),&lev);
1303: umax += mbs + 1;
1304: shift = mbs + 1;
1305: PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);
1306: PetscMalloc(umax*sizeof(PetscInt),&ju);
1307: iu[0] = mbs + 1;
1308: juidx = mbs + 1;
1309: /* prowl: linked list for pivot row */
1310: PetscMalloc((3*mbs+1)*sizeof(PetscInt),&prowl);
1311: /* q: linked list for col index */
1312: q = prowl + mbs;
1313: levtmp = q + mbs;
1314:
1315: for (i=0; i<mbs; i++){
1316: prowl[i] = mbs;
1317: q[i] = 0;
1318: }
1320: /* for each row k */
1321: for (k=0; k<mbs; k++){
1322: nzk = 0;
1323: q[k] = mbs;
1324: /* copy current row into linked list */
1325: nz = ai[rip[k]+1] - ai[rip[k]];
1326: j = ai[rip[k]];
1327: while (nz--){
1328: vj = rip[aj[j++]];
1329: if (vj > k){
1330: qm = k;
1331: do {
1332: m = qm; qm = q[m];
1333: } while(qm < vj);
1334: if (qm == vj) {
1335: SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
1336: }
1337: nzk++;
1338: q[m] = vj;
1339: q[vj] = qm;
1340: levtmp[vj] = 0; /* initialize lev for nonzero element */
1341: }
1342: }
1344: /* modify nonzero structure of k-th row by computing fill-in
1345: for each row prow to be merged in */
1346: prow = k;
1347: prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
1348:
1349: while (prow < k){
1350: /* merge row prow into k-th row */
1351: jmin = iu[prow] + 1;
1352: jmax = iu[prow+1];
1353: qm = k;
1354: for (j=jmin; j<jmax; j++){
1355: incrlev = lev[j-shift] + 1;
1356: if (incrlev > levels) continue;
1357: vj = ju[j];
1358: do {
1359: m = qm; qm = q[m];
1360: } while (qm < vj);
1361: if (qm != vj){ /* a fill */
1362: nzk++; q[m] = vj; q[vj] = qm; qm = vj;
1363: levtmp[vj] = incrlev;
1364: } else {
1365: if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
1366: }
1367: }
1368: prow = prowl[prow]; /* next pivot row */
1369: }
1370:
1371: /* add k to row list for first nonzero element in k-th row */
1372: if (nzk > 1){
1373: i = q[k]; /* col value of first nonzero element in k_th row of U */
1374: prowl[k] = prowl[i]; prowl[i] = k;
1375: }
1376: iu[k+1] = iu[k] + nzk;
1378: /* allocate more space to ju and lev if needed */
1379: if (iu[k+1] > umax) {
1380: /* estimate how much additional space we will need */
1381: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
1382: /* just double the memory each time */
1383: maxadd = umax;
1384: if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
1385: umax += maxadd;
1387: /* allocate a longer ju */
1388: PetscMalloc(umax*sizeof(PetscInt),&jutmp);
1389: PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));
1390: PetscFree(ju);
1391: ju = jutmp;
1393: PetscMalloc(umax*sizeof(PetscInt),&jutmp);
1394: PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(PetscInt));
1395: PetscFree(lev);
1396: lev = jutmp;
1397: reallocs += 2; /* count how many times we realloc */
1398: }
1400: /* save nonzero structure of k-th row in ju */
1401: i=k;
1402: while (nzk --) {
1403: i = q[i];
1404: ju[juidx] = i;
1405: lev[juidx-shift] = levtmp[i];
1406: juidx++;
1407: }
1408: }
1409:
1410: #if defined(PETSC_USE_INFO)
1411: if (ai[mbs] != 0) {
1412: PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1413: PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);
1414: PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
1415: PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);
1416: PetscInfo(A,"for best performance.\n");
1417: } else {
1418: PetscInfo(A,"Empty matrix.\n");
1419: }
1420: #endif
1422: ISRestoreIndices(perm,&rip);
1423: PetscFree(prowl);
1424: PetscFree(lev);
1426: /* put together the new matrix */
1427: MatCreate(A->comm,B);
1428: MatSetSizes(*B,bs*mbs,bs*mbs,bs*mbs,bs*mbs);
1429: MatSetType(*B,A->type_name);
1430: MatSeqSBAIJSetPreallocation_SeqSBAIJ(*B,bs,0,PETSC_NULL);
1432: /* PetscLogObjectParent(*B,iperm); */
1433: b = (Mat_SeqSBAIJ*)(*B)->data;
1434: PetscFree2(b->imax,b->ilen);
1435: b->singlemalloc = PETSC_FALSE;
1436: /* the next line frees the default space generated by the Create() */
1437: PetscFree3(b->a,b->j,b->i);
1438: PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);
1439: b->j = ju;
1440: b->i = iu;
1441: b->diag = 0;
1442: b->ilen = 0;
1443: b->imax = 0;
1444:
1445: if (b->row) {
1446: ISDestroy(b->row);
1447: }
1448: if (b->icol) {
1449: ISDestroy(b->icol);
1450: }
1451: b->row = perm;
1452: b->icol = perm;
1453: PetscObjectReference((PetscObject)perm);
1454: PetscObjectReference((PetscObject)perm);
1455: PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);
1456: /* In b structure: Free imax, ilen, old a, old j.
1457: Allocate idnew, solve_work, new a, new j */
1458: PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
1459: b->maxnz = b->nz = iu[mbs];
1460:
1461: (*B)->factor = FACTOR_CHOLESKY;
1462: (*B)->info.factor_mallocs = reallocs;
1463: (*B)->info.fill_ratio_given = f;
1464: if (ai[mbs] != 0) {
1465: (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1466: } else {
1467: (*B)->info.fill_ratio_needed = 0.0;
1468: }
1470: if (perm_identity){
1471: switch (bs) {
1472: case 1:
1473: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
1474: (*B)->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1475: (*B)->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1476: (*B)->ops->solves = MatSolves_SeqSBAIJ_1;
1477: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=1\n");
1478: break;
1479: case 2:
1480: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
1481: (*B)->ops->solve = MatSolve_SeqSBAIJ_2_NaturalOrdering;
1482: (*B)->ops->solvetranspose = MatSolve_SeqSBAIJ_2_NaturalOrdering;
1483: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=2\n");
1484: break;
1485: case 3:
1486: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
1487: (*B)->ops->solve = MatSolve_SeqSBAIJ_3_NaturalOrdering;
1488: (*B)->ops->solvetranspose = MatSolve_SeqSBAIJ_3_NaturalOrdering;
1489: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=3\n");
1490: break;
1491: case 4:
1492: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
1493: (*B)->ops->solve = MatSolve_SeqSBAIJ_4_NaturalOrdering;
1494: (*B)->ops->solvetranspose = MatSolve_SeqSBAIJ_4_NaturalOrdering;
1495: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=4\n");
1496: break;
1497: case 5:
1498: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
1499: (*B)->ops->solve = MatSolve_SeqSBAIJ_5_NaturalOrdering;
1500: (*B)->ops->solvetranspose = MatSolve_SeqSBAIJ_5_NaturalOrdering;
1501: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=5\n");
1502: break;
1503: case 6:
1504: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
1505: (*B)->ops->solve = MatSolve_SeqSBAIJ_6_NaturalOrdering;
1506: (*B)->ops->solvetranspose = MatSolve_SeqSBAIJ_6_NaturalOrdering;
1507: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=6\n");
1508: break;
1509: case 7:
1510: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
1511: (*B)->ops->solve = MatSolve_SeqSBAIJ_7_NaturalOrdering;
1512: (*B)->ops->solvetranspose = MatSolve_SeqSBAIJ_7_NaturalOrdering;
1513: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=7\n");
1514: break;
1515: default:
1516: (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
1517: (*B)->ops->solve = MatSolve_SeqSBAIJ_N_NaturalOrdering;
1518: (*B)->ops->solvetranspose = MatSolve_SeqSBAIJ_N_NaturalOrdering;
1519: PetscInfo(A,"Using special in-place natural ordering factor and solve BS>7\n");
1520: break;
1521: }
1522: }
1524: return(0);
1525: }
1527: #include petscbt.h
1528: #include src/mat/utils/freespace.h
1531: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat A,IS perm,MatFactorInfo *info,Mat *fact)
1532: {
1533: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1534: Mat_SeqSBAIJ *b;
1535: Mat B;
1536: PetscErrorCode ierr;
1537: PetscTruth perm_identity;
1538: PetscInt bs=A->rmap.bs,am=a->mbs;
1539: PetscInt reallocs=0,*rip,i,*ai,*aj,*ui;
1540: PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
1541: PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,*cols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
1542: PetscReal fill=info->fill,levels=info->levels;
1543: PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
1544: PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
1545: PetscBT lnkbt;
1548: /*
1549: This code originally uses Modified Sparse Row (MSR) storage
1550: (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
1551: Then it is rewritten so the factor B takes seqsbaij format. However the associated
1552: MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
1553: thus the original code in MSR format is still used for these cases.
1554: The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
1555: MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
1556: */
1557: if (bs > 1){
1558: MatICCFactorSymbolic_SeqSBAIJ_MSR(A,perm,info,fact);
1559: return(0);
1560: }
1562: /* check whether perm is the identity mapping */
1563: ISIdentity(perm,&perm_identity);
1564:
1565: /* special case that simply copies fill pattern */
1566: if (!levels && perm_identity) {
1567: a->permute = PETSC_FALSE;
1568: ai = a->i; aj = a->j;
1569: PetscMalloc((am+1)*sizeof(PetscInt),&ui);
1570: PetscMemcpy(ui,ai,(am+1)*sizeof(PetscInt));
1571: PetscMalloc(ai[am]*sizeof(PetscInt),&uj);
1572: PetscMemcpy(uj,aj,ai[am]*sizeof(PetscInt));
1574: } else { /* case: levels>0 || (levels=0 && !perm_identity) */
1575: if (perm_identity){
1576: a->permute = PETSC_FALSE;
1577: ai = a->i; aj = a->j;
1578: } else { /* non-trivial permutation */
1579: a->permute = PETSC_TRUE;
1580: MatReorderingSeqSBAIJ(A, perm);
1581: ai = a->inew; aj = a->jnew;
1582: }
1583: ISGetIndices(perm,&rip);
1585: /* initialization */
1586: PetscMalloc((am+1)*sizeof(PetscInt),&ui);
1587: ui[0] = 0;
1589: /* jl: linked list for storing indices of the pivot rows
1590: il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
1591: PetscMalloc((2*am+1)*sizeof(PetscInt)+2*am*sizeof(PetscInt*),&jl);
1592: il = jl + am;
1593: uj_ptr = (PetscInt**)(il + am);
1594: uj_lvl_ptr = (PetscInt**)(uj_ptr + am);
1595: for (i=0; i<am; i++){
1596: jl[i] = am; il[i] = 0;
1597: }
1599: /* create and initialize a linked list for storing column indices of the active row k */
1600: nlnk = am + 1;
1601: PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);
1603: /* initial FreeSpace size is fill*(ai[am]+1) */
1604: PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);
1605: current_space = free_space;
1606: PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);
1607: current_space_lvl = free_space_lvl;
1609: for (k=0; k<am; k++){ /* for each active row k */
1610: /* initialize lnk by the column indices of row rip[k] */
1611: nzk = 0;
1612: ncols = ai[rip[k]+1] - ai[rip[k]];
1613: cols = aj+ai[rip[k]];
1614: PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);
1615: nzk += nlnk;
1617: /* update lnk by computing fill-in for each pivot row to be merged in */
1618: prow = jl[k]; /* 1st pivot row */
1619:
1620: while (prow < k){
1621: nextprow = jl[prow];
1622:
1623: /* merge prow into k-th row */
1624: jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
1625: jmax = ui[prow+1];
1626: ncols = jmax-jmin;
1627: i = jmin - ui[prow];
1628: cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
1629: j = *(uj_lvl_ptr[prow] + i - 1);
1630: cols_lvl = uj_lvl_ptr[prow]+i;
1631: PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);
1632: nzk += nlnk;
1634: /* update il and jl for prow */
1635: if (jmin < jmax){
1636: il[prow] = jmin;
1637: j = *cols; jl[prow] = jl[j]; jl[j] = prow;
1638: }
1639: prow = nextprow;
1640: }
1642: /* if free space is not available, make more free space */
1643: if (current_space->local_remaining<nzk) {
1644: i = am - k + 1; /* num of unfactored rows */
1645: i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1646: PetscFreeSpaceGet(i,¤t_space);
1647: PetscFreeSpaceGet(i,¤t_space_lvl);
1648: reallocs++;
1649: }
1651: /* copy data into free_space and free_space_lvl, then initialize lnk */
1652: PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
1654: /* add the k-th row into il and jl */
1655: if (nzk-1 > 0){
1656: i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
1657: jl[k] = jl[i]; jl[i] = k;
1658: il[k] = ui[k] + 1;
1659: }
1660: uj_ptr[k] = current_space->array;
1661: uj_lvl_ptr[k] = current_space_lvl->array;
1663: current_space->array += nzk;
1664: current_space->local_used += nzk;
1665: current_space->local_remaining -= nzk;
1666: current_space_lvl->array += nzk;
1667: current_space_lvl->local_used += nzk;
1668: current_space_lvl->local_remaining -= nzk;
1670: ui[k+1] = ui[k] + nzk;
1671: }
1673: #if defined(PETSC_USE_INFO)
1674: if (ai[am] != 0) {
1675: PetscReal af = ((PetscReal)ui[am])/((PetscReal)ai[am]);
1676: PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);
1677: PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);
1678: PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);
1679: } else {
1680: PetscInfo(A,"Empty matrix.\n");
1681: }
1682: #endif
1684: ISRestoreIndices(perm,&rip);
1685: PetscFree(jl);
1687: /* destroy list of free space and other temporary array(s) */
1688: PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);
1689: PetscFreeSpaceContiguous(&free_space,uj);
1690: PetscIncompleteLLDestroy(lnk,lnkbt);
1691: PetscFreeSpaceDestroy(free_space_lvl);
1692: } /* end of case: levels>0 || (levels=0 && !perm_identity) */
1694: /* put together the new matrix in MATSEQSBAIJ format */
1695: MatCreate(PETSC_COMM_SELF,fact);
1696: MatSetSizes(*fact,am,am,am,am);
1697: B = *fact;
1698: MatSetType(B,MATSEQSBAIJ);
1699: MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,0,PETSC_NULL);
1701: b = (Mat_SeqSBAIJ*)B->data;
1702: PetscFree2(b->imax,b->ilen);
1703: b->singlemalloc = PETSC_FALSE;
1704: /* the next line frees the default space generated by the Create() */
1705: PetscFree3(b->a,b->j,b->i);
1706: PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);
1707: b->j = uj;
1708: b->i = ui;
1709: b->diag = 0;
1710: b->ilen = 0;
1711: b->imax = 0;
1712: b->row = perm;
1713: b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1714: PetscObjectReference((PetscObject)perm);
1715: b->icol = perm;
1716: PetscObjectReference((PetscObject)perm);
1717: PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);
1718: b->maxnz = b->nz = ui[am];
1719:
1720: B->factor = FACTOR_CHOLESKY;
1721: B->info.factor_mallocs = reallocs;
1722: B->info.fill_ratio_given = fill;
1723: if (ai[am] != 0) {
1724: B->info.fill_ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
1725: } else {
1726: B->info.fill_ratio_needed = 0.0;
1727: }
1729: if (perm_identity){
1730: switch (bs) {
1731: case 1:
1732: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
1733: B->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1734: B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1735: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=1\n");
1736: break;
1737: case 2:
1738: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
1739: B->ops->solve = MatSolve_SeqSBAIJ_2_NaturalOrdering;
1740: B->ops->solvetranspose = MatSolve_SeqSBAIJ_2_NaturalOrdering;
1741: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=2\n");
1742: break;
1743: case 3:
1744: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
1745: B->ops->solve = MatSolve_SeqSBAIJ_3_NaturalOrdering;
1746: B->ops->solvetranspose = MatSolve_SeqSBAIJ_3_NaturalOrdering;
1747: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=3\n");
1748: break;
1749: case 4:
1750: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
1751: B->ops->solve = MatSolve_SeqSBAIJ_4_NaturalOrdering;
1752: B->ops->solvetranspose = MatSolve_SeqSBAIJ_4_NaturalOrdering;
1753: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=4\n");
1754: break;
1755: case 5:
1756: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
1757: B->ops->solve = MatSolve_SeqSBAIJ_5_NaturalOrdering;
1758: B->ops->solvetranspose = MatSolve_SeqSBAIJ_5_NaturalOrdering;
1759: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=5\n");
1760: break;
1761: case 6:
1762: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
1763: B->ops->solve = MatSolve_SeqSBAIJ_6_NaturalOrdering;
1764: B->ops->solvetranspose = MatSolve_SeqSBAIJ_6_NaturalOrdering;
1765: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=6\n");
1766: break;
1767: case 7:
1768: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
1769: B->ops->solve = MatSolve_SeqSBAIJ_7_NaturalOrdering;
1770: B->ops->solvetranspose = MatSolve_SeqSBAIJ_7_NaturalOrdering;
1771: PetscInfo(A,"Using special in-place natural ordering factor and solve BS=7\n");
1772: break;
1773: default:
1774: B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
1775: B->ops->solve = MatSolve_SeqSBAIJ_N_NaturalOrdering;
1776: B->ops->solvetranspose = MatSolve_SeqSBAIJ_N_NaturalOrdering;
1777: PetscInfo(A,"Using special in-place natural ordering factor and solve BS>7\n");
1778: break;
1779: }
1780: }
1781: return(0);
1782: }