Actual source code: baijfact12.c

  1: #define PETSCMAT_DLL

  3: /*
  4:     Factorization code for BAIJ format. 
  5: */
 6:  #include "src/mat/impls/baij/seq/baij.h"
 7:  #include "src/inline/ilu.h"
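/*
   Reference sketch of how the Mat_SeqBAIJ arrays are indexed below (the helper
   is illustrative only, not part of the PETSc API).  For block size 4 the
   matrix is stored as a CSR structure over 4x4 blocks: a->i holds the
   block-row pointers, a->j the block-column indices, and a->a the numerical
   values with 16 contiguous MatScalars per block, so the k-th stored block of
   block-row i begins at a->a + 16*(a->i[i] + k).
*/
static MatScalar *SeqBAIJ4_GetBlock(const PetscInt *ai,MatScalar *aa,PetscInt i,PetscInt k)
{
  /* pointer to the 16 scalars of the k-th stored block in block-row i */
  return aa + 16*(ai[i] + k);
}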

 11: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat A,MatFactorInfo *info,Mat *B)
 12: {
 13: /*
 14:     Default version for 4-by-4 blocks using the natural ordering
 15: */
 16:   Mat            C = *B;
 17:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
 19:   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
 20:   PetscInt       *ajtmpold,*ajtmp,nz,row;
 21:   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
 22:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
 23:   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
 24:   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
 25:   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
 26:   MatScalar      m13,m14,m15,m16;
 27:   MatScalar      *ba = b->a,*aa = a->a;
 28:   PetscTruth     pivotinblocks = b->pivotinblocks;

 31:   PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);

 33:   for (i=0; i<n; i++) {
 34:     nz    = bi[i+1] - bi[i];
 35:     ajtmp = bj + bi[i];
 36:     for  (j=0; j<nz; j++) {
 37:       x = rtmp+16*ajtmp[j];
 38:       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
 39:       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
 40:     }
 41:     /* load in the initial (unfactored) row */
 42:     nz       = ai[i+1] - ai[i];
 43:     ajtmpold = aj + ai[i];
 44:     v        = aa + 16*ai[i];
 45:     for (j=0; j<nz; j++) {
 46:       x    = rtmp+16*ajtmpold[j];
 47:       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
 48:       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
 49:       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
 50:       x[14] = v[14]; x[15] = v[15];
 51:       v    += 16;
 52:     }
 53:     row = *ajtmp++;
 54:     while (row < i) {
 55:       pc  = rtmp + 16*row;
 56:       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
 57:       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
 58:       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
 59:       p15 = pc[14]; p16 = pc[15];
 60:       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
 61:           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
 62:           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
 63:           || p16 != 0.0) {
 64:         pv = ba + 16*diag_offset[row];
 65:         pj = bj + diag_offset[row] + 1;
 66:         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
 67:         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
 68:         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
 69:         x15 = pv[14]; x16 = pv[15];
 70:         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
 71:         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
 72:         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
 73:         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;

 75:         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
 76:         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
 77:         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
 78:         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;

 80:         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
 81:         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
 82:         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
 83:         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;

 85:         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
 86:         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
 87:         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
 88:         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
 89:         nz = bi[row+1] - diag_offset[row] - 1;
 90:         pv += 16;
 91:         for (j=0; j<nz; j++) {
 92:           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
 93:           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
 94:           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
 95:           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
 96:           x    = rtmp + 16*pj[j];
 97:           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
 98:           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
 99:           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
100:           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;

102:           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
103:           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
104:           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
105:           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;

107:           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
108:           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
109:           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
110:           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

112:           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
113:           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
114:           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
115:           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;

117:           pv   += 16;
118:         }
119:         PetscLogFlops(128*nz+112);
120:       }
121:       row = *ajtmp++;
122:     }
123:     /* finished row so stick it into b->a */
124:     pv = ba + 16*bi[i];
125:     pj = bj + bi[i];
126:     nz = bi[i+1] - bi[i];
127:     for (j=0; j<nz; j++) {
128:       x      = rtmp+16*pj[j];
129:       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
130:       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
131:       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
132:       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
133:       pv   += 16;
134:     }
135:     /* invert diagonal block */
136:     w = ba + 16*diag_offset[i];
137:     if (pivotinblocks) {
138:       Kernel_A_gets_inverse_A_4(w);
139:     } else {
140:       Kernel_A_gets_inverse_A_4_nopivot(w);
141:     }
142:   }

144:   PetscFree(rtmp);
145:   C->factor    = FACTOR_LU;
146:   C->assembled = PETSC_TRUE;
147:   PetscLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */
148:   return(0);
149: }
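/*
   Reference (non-unrolled) form of the kernel above, kept only to make the
   fully unrolled arithmetic easier to follow; it is a sketch and assumes the
   16 scalars of each block form a 4x4 matrix stored column-major, which is
   consistent with the coefficient pattern of the unrolled statements
   (e.g. pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4).  The logged flop counts
   match this view: forming the multiplier is one 4x4 matrix product,
   16*(4 mults + 3 adds) = 112 flops, and each block update costs
   16*(4 mults + 4 subs) = 128 flops, hence PetscLogFlops(128*nz+112).
*/
static void SeqBAIJ4_MultBlock(MatScalar *m,const MatScalar *p,const MatScalar *x)
{
  /* m = p*x with 4x4 column-major blocks: the multiplier L_ij*inv(L_jj) */
  PetscInt r,c,k;
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      m[4*c+r] = 0.0;
      for (k=0; k<4; k++) m[4*c+r] += p[4*k+r]*x[4*c+k];
    }
  }
}

static void SeqBAIJ4_SubtractMult(MatScalar *x,const MatScalar *m,const MatScalar *pv)
{
  /* x -= m*pv with 4x4 column-major blocks: the update applied to each block
     to the right of the diagonal in the current row */
  PetscInt r,c,k;
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      for (k=0; k<4; k++) x[4*c+r] -= m[4*k+r]*pv[4*c+k];
    }
  }
}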


152: #if defined(PETSC_HAVE_SSE)

154: #include PETSC_HAVE_SSE

156: /* SSE version for 4-by-4 blocks using the natural ordering */
159: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat A,MatFactorInfo *info,Mat *B)
160: {
161:   Mat         C = *B;
162:   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
164:   int i,j,n = a->mbs;
165:   int         *bj = b->j,*bjtmp,*pj;
166:   int         row;
167:   int         *ajtmpold,nz,*bi=b->i;
168:   int         *diag_offset = b->diag,*ai=a->i,*aj=a->j;
169:   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
170:   MatScalar   *ba = b->a,*aa = a->a;
171:   int         nonzero=0;
172: /*    int            nonzero=0,colscale = 16; */
173:   PetscTruth  pivotinblocks = b->pivotinblocks;

176:   SSE_SCOPE_BEGIN;

178:   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
179:   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
180:   PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
181:   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
182: /*    if ((unsigned long)bj==(unsigned long)aj) { */
183: /*      colscale = 4; */
184: /*    } */
185:   for (i=0; i<n; i++) {
186:     nz    = bi[i+1] - bi[i];
187:     bjtmp = bj + bi[i];
188:     /* zero out the 4x4 block accumulators */
189:     /* zero out one register */
190:     XOR_PS(XMM7,XMM7);
191:     for  (j=0; j<nz; j++) {
192:       x = rtmp+16*bjtmp[j];
193: /*        x = rtmp+4*bjtmp[j]; */
194:       SSE_INLINE_BEGIN_1(x)
195:         /* Copy zero register to memory locations */
196:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
197:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
198:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
199:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
200:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
201:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
202:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
203:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
204:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
205:       SSE_INLINE_END_1;
206:     }
207:     /* load in the initial (unfactored) row */
208:     nz       = ai[i+1] - ai[i];
209:     ajtmpold = aj + ai[i];
210:     v        = aa + 16*ai[i];
211:     for (j=0; j<nz; j++) {
212:       x = rtmp+16*ajtmpold[j];
213: /*        x = rtmp+colscale*ajtmpold[j]; */
214:       /* Copy v block into x block */
215:       SSE_INLINE_BEGIN_2(v,x)
216:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
217:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
218:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

220:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
221:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

223:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
224:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

226:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
227:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

229:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
230:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

232:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
233:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

235:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
236:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

238:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
239:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
240:       SSE_INLINE_END_2;

242:       v += 16;
243:     }
244: /*      row = (*bjtmp++)/4; */
245:     row = *bjtmp++;
246:     while (row < i) {
247:       pc  = rtmp + 16*row;
248:       SSE_INLINE_BEGIN_1(pc)
249:         /* Load block from lower triangle */
250:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
251:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
252:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

254:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
255:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

257:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
258:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

260:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
261:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

263:         /* Compare block to zero block */

265:         SSE_COPY_PS(XMM4,XMM7)
266:         SSE_CMPNEQ_PS(XMM4,XMM0)

268:         SSE_COPY_PS(XMM5,XMM7)
269:         SSE_CMPNEQ_PS(XMM5,XMM1)

271:         SSE_COPY_PS(XMM6,XMM7)
272:         SSE_CMPNEQ_PS(XMM6,XMM2)

274:         SSE_CMPNEQ_PS(XMM7,XMM3)

276:         /* Reduce the comparisons to one SSE register */
277:         SSE_OR_PS(XMM6,XMM7)
278:         SSE_OR_PS(XMM5,XMM4)
279:         SSE_OR_PS(XMM5,XMM6)
280:       SSE_INLINE_END_1;

282:       /* Reduce the one SSE register to an integer register for branching */
283:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
284:       MOVEMASK(nonzero,XMM5);

286:       /* If block is nonzero ... */
287:       if (nonzero) {
288:         pv = ba + 16*diag_offset[row];
289:         PREFETCH_L1(&pv[16]);
290:         pj = bj + diag_offset[row] + 1;

292:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
293:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
294:         /* but the diagonal was inverted already */
295:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

297:         SSE_INLINE_BEGIN_2(pv,pc)
298:           /* Column 0, product is accumulated in XMM4 */
299:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
300:           SSE_SHUFFLE(XMM4,XMM4,0x00)
301:           SSE_MULT_PS(XMM4,XMM0)

303:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
304:           SSE_SHUFFLE(XMM5,XMM5,0x00)
305:           SSE_MULT_PS(XMM5,XMM1)
306:           SSE_ADD_PS(XMM4,XMM5)

308:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
309:           SSE_SHUFFLE(XMM6,XMM6,0x00)
310:           SSE_MULT_PS(XMM6,XMM2)
311:           SSE_ADD_PS(XMM4,XMM6)

313:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
314:           SSE_SHUFFLE(XMM7,XMM7,0x00)
315:           SSE_MULT_PS(XMM7,XMM3)
316:           SSE_ADD_PS(XMM4,XMM7)

318:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
319:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

321:           /* Column 1, product is accumulated in XMM5 */
322:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
323:           SSE_SHUFFLE(XMM5,XMM5,0x00)
324:           SSE_MULT_PS(XMM5,XMM0)

326:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
327:           SSE_SHUFFLE(XMM6,XMM6,0x00)
328:           SSE_MULT_PS(XMM6,XMM1)
329:           SSE_ADD_PS(XMM5,XMM6)

331:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
332:           SSE_SHUFFLE(XMM7,XMM7,0x00)
333:           SSE_MULT_PS(XMM7,XMM2)
334:           SSE_ADD_PS(XMM5,XMM7)

336:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
337:           SSE_SHUFFLE(XMM6,XMM6,0x00)
338:           SSE_MULT_PS(XMM6,XMM3)
339:           SSE_ADD_PS(XMM5,XMM6)

341:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
342:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

344:           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

346:           /* Column 2, product is accumulated in XMM6 */
347:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
348:           SSE_SHUFFLE(XMM6,XMM6,0x00)
349:           SSE_MULT_PS(XMM6,XMM0)

351:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
352:           SSE_SHUFFLE(XMM7,XMM7,0x00)
353:           SSE_MULT_PS(XMM7,XMM1)
354:           SSE_ADD_PS(XMM6,XMM7)

356:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
357:           SSE_SHUFFLE(XMM7,XMM7,0x00)
358:           SSE_MULT_PS(XMM7,XMM2)
359:           SSE_ADD_PS(XMM6,XMM7)

361:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
362:           SSE_SHUFFLE(XMM7,XMM7,0x00)
363:           SSE_MULT_PS(XMM7,XMM3)
364:           SSE_ADD_PS(XMM6,XMM7)
365: 
366:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
367:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

369:           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
370:           /* Column 3, product is accumulated in XMM0 */
371:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
372:           SSE_SHUFFLE(XMM7,XMM7,0x00)
373:           SSE_MULT_PS(XMM0,XMM7)

375:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
376:           SSE_SHUFFLE(XMM7,XMM7,0x00)
377:           SSE_MULT_PS(XMM1,XMM7)
378:           SSE_ADD_PS(XMM0,XMM1)

380:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
381:           SSE_SHUFFLE(XMM1,XMM1,0x00)
382:           SSE_MULT_PS(XMM1,XMM2)
383:           SSE_ADD_PS(XMM0,XMM1)

385:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
386:           SSE_SHUFFLE(XMM7,XMM7,0x00)
387:           SSE_MULT_PS(XMM3,XMM7)
388:           SSE_ADD_PS(XMM0,XMM3)

390:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
391:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

393:           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
394:           /* This is code to be maintained and read by humans, after all. */
395:           /* Copy Multiplier Col 3 into XMM3 */
396:           SSE_COPY_PS(XMM3,XMM0)
397:           /* Copy Multiplier Col 2 into XMM2 */
398:           SSE_COPY_PS(XMM2,XMM6)
399:           /* Copy Multiplier Col 1 into XMM1 */
400:           SSE_COPY_PS(XMM1,XMM5)
401:           /* Copy Multiplier Col 0 into XMM0 */
402:           SSE_COPY_PS(XMM0,XMM4)
403:         SSE_INLINE_END_2;

405:         /* Update the row: */
406:         nz = bi[row+1] - diag_offset[row] - 1;
407:         pv += 16;
408:         for (j=0; j<nz; j++) {
409:           PREFETCH_L1(&pv[16]);
410:           x = rtmp + 16*pj[j];
411: /*            x = rtmp + 4*pj[j]; */

413:           /* X:=X-M*PV, One column at a time */
414:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
415:           SSE_INLINE_BEGIN_2(x,pv)
416:             /* Load First Column of X*/
417:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
418:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

420:             /* Matrix-Vector Product: */
421:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
422:             SSE_SHUFFLE(XMM5,XMM5,0x00)
423:             SSE_MULT_PS(XMM5,XMM0)
424:             SSE_SUB_PS(XMM4,XMM5)

426:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
427:             SSE_SHUFFLE(XMM6,XMM6,0x00)
428:             SSE_MULT_PS(XMM6,XMM1)
429:             SSE_SUB_PS(XMM4,XMM6)

431:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
432:             SSE_SHUFFLE(XMM7,XMM7,0x00)
433:             SSE_MULT_PS(XMM7,XMM2)
434:             SSE_SUB_PS(XMM4,XMM7)

436:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
437:             SSE_SHUFFLE(XMM5,XMM5,0x00)
438:             SSE_MULT_PS(XMM5,XMM3)
439:             SSE_SUB_PS(XMM4,XMM5)

441:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
442:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

444:             /* Second Column */
445:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
446:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

448:             /* Matrix-Vector Product: */
449:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
450:             SSE_SHUFFLE(XMM6,XMM6,0x00)
451:             SSE_MULT_PS(XMM6,XMM0)
452:             SSE_SUB_PS(XMM5,XMM6)

454:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
455:             SSE_SHUFFLE(XMM7,XMM7,0x00)
456:             SSE_MULT_PS(XMM7,XMM1)
457:             SSE_SUB_PS(XMM5,XMM7)

459:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
460:             SSE_SHUFFLE(XMM4,XMM4,0x00)
461:             SSE_MULT_PS(XMM4,XMM2)
462:             SSE_SUB_PS(XMM5,XMM4)

464:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
465:             SSE_SHUFFLE(XMM6,XMM6,0x00)
466:             SSE_MULT_PS(XMM6,XMM3)
467:             SSE_SUB_PS(XMM5,XMM6)
468: 
469:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
470:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

472:             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

474:             /* Third Column */
475:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
476:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

478:             /* Matrix-Vector Product: */
479:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
480:             SSE_SHUFFLE(XMM7,XMM7,0x00)
481:             SSE_MULT_PS(XMM7,XMM0)
482:             SSE_SUB_PS(XMM6,XMM7)

484:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
485:             SSE_SHUFFLE(XMM4,XMM4,0x00)
486:             SSE_MULT_PS(XMM4,XMM1)
487:             SSE_SUB_PS(XMM6,XMM4)

489:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
490:             SSE_SHUFFLE(XMM5,XMM5,0x00)
491:             SSE_MULT_PS(XMM5,XMM2)
492:             SSE_SUB_PS(XMM6,XMM5)

494:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
495:             SSE_SHUFFLE(XMM7,XMM7,0x00)
496:             SSE_MULT_PS(XMM7,XMM3)
497:             SSE_SUB_PS(XMM6,XMM7)
498: 
499:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
500:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
501: 
502:             /* Fourth Column */
503:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
504:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

506:             /* Matrix-Vector Product: */
507:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
508:             SSE_SHUFFLE(XMM5,XMM5,0x00)
509:             SSE_MULT_PS(XMM5,XMM0)
510:             SSE_SUB_PS(XMM4,XMM5)

512:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
513:             SSE_SHUFFLE(XMM6,XMM6,0x00)
514:             SSE_MULT_PS(XMM6,XMM1)
515:             SSE_SUB_PS(XMM4,XMM6)

517:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
518:             SSE_SHUFFLE(XMM7,XMM7,0x00)
519:             SSE_MULT_PS(XMM7,XMM2)
520:             SSE_SUB_PS(XMM4,XMM7)

522:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
523:             SSE_SHUFFLE(XMM5,XMM5,0x00)
524:             SSE_MULT_PS(XMM5,XMM3)
525:             SSE_SUB_PS(XMM4,XMM5)
526: 
527:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
528:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
529:           SSE_INLINE_END_2;
530:           pv   += 16;
531:         }
532:         PetscLogFlops(128*nz+112);
533:       }
534:       row = *bjtmp++;
535: /*        row = (*bjtmp++)/4; */
536:     }
537:     /* finished row so stick it into b->a */
538:     pv = ba + 16*bi[i];
539:     pj = bj + bi[i];
540:     nz = bi[i+1] - bi[i];

542:     /* Copy x block back into pv block */
543:     for (j=0; j<nz; j++) {
544:       x  = rtmp+16*pj[j];
545: /*        x  = rtmp+4*pj[j]; */

547:       SSE_INLINE_BEGIN_2(x,pv)
548:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
549:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
550:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

552:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
553:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

555:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
556:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

558:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
559:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

561:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
562:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

564:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
565:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

567:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
568:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

570:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
571:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
572:       SSE_INLINE_END_2;
573:       pv += 16;
574:     }
575:     /* invert diagonal block */
576:     w = ba + 16*diag_offset[i];
577:     if (pivotinblocks) {
578:       Kernel_A_gets_inverse_A_4(w);
579:     } else {
580:       Kernel_A_gets_inverse_A_4_nopivot(w);
581:     }
582: /*      Kernel_A_gets_inverse_A_4_SSE(w); */
583:     /* Note: using Cramer's rule, the flop count below might be unfairly high or low. */
584:   }

586:   PetscFree(rtmp);
587:   C->factor    = FACTOR_LU;
588:   C->assembled = PETSC_TRUE;
589:   PetscLogFlops(1.3333*64*b->mbs);
590:   /* Flop Count from inverting diagonal blocks */
591:   SSE_SCOPE_END;
592:   return(0);
593: }
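/*
   Sketch of the zero-block test performed above with the SSE_* macros,
   rewritten with the standard <xmmintrin.h> intrinsics for readability.
   It assumes MatScalar is a 4-byte float (the SSE path packs four values per
   XMM register) and that blk points at 16 contiguous, 16-byte aligned scalars,
   as enforced by the alignment checks at the top of the routine.  The helper
   name is illustrative only, not part of the PETSc API.
*/
#include <xmmintrin.h>
static int SeqBAIJ4_BlockIsNonzero(const float *blk)
{
  __m128 zero = _mm_setzero_ps();                               /* XOR_PS(XMM7,XMM7)      */
  __m128 acc  = _mm_cmpneq_ps(zero,_mm_load_ps(blk));           /* compare block to zero  */
  acc = _mm_or_ps(acc,_mm_cmpneq_ps(zero,_mm_load_ps(blk+4)));  /* OR the comparisons     */
  acc = _mm_or_ps(acc,_mm_cmpneq_ps(zero,_mm_load_ps(blk+8)));
  acc = _mm_or_ps(acc,_mm_cmpneq_ps(zero,_mm_load_ps(blk+12)));
  return _mm_movemask_ps(acc);                                  /* MOVEMASK(nonzero,XMM5) */
}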

597: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat *B)
598: {
599:   Mat            A=*B,C=*B;
600:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
602:   int i,j,n = a->mbs;
603:   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
604:   unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
605:   unsigned int   row;
606:   int            nz,*bi=b->i;
607:   int            *diag_offset = b->diag,*ai=a->i;
608:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
609:   MatScalar      *ba = b->a,*aa = a->a;
610:   int            nonzero=0;
611: /*    int            nonzero=0,colscale = 16; */
612:   PetscTruth     pivotinblocks = b->pivotinblocks;

615:   SSE_SCOPE_BEGIN;

617:   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
618:   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
619:   PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
620:   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
621: /*    if ((unsigned long)bj==(unsigned long)aj) { */
622: /*      colscale = 4; */
623: /*    } */
624: 
625:   for (i=0; i<n; i++) {
626:     nz    = bi[i+1] - bi[i];
627:     bjtmp = bj + bi[i];
628:     /* zero out the 4x4 block accumulators */
629:     /* zero out one register */
630:     XOR_PS(XMM7,XMM7);
631:     for  (j=0; j<nz; j++) {
632:       x = rtmp+16*((unsigned int)bjtmp[j]);
633: /*        x = rtmp+4*bjtmp[j]; */
634:       SSE_INLINE_BEGIN_1(x)
635:         /* Copy zero register to memory locations */
636:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
637:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
638:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
639:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
640:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
641:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
642:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
643:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
644:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
645:       SSE_INLINE_END_1;
646:     }
647:     /* load in the initial (unfactored) row */
648:     nz    = ai[i+1] - ai[i];
649:     ajtmp = aj + ai[i];
650:     v     = aa + 16*ai[i];
651:     for (j=0; j<nz; j++) {
652:       x = rtmp+16*((unsigned int)ajtmp[j]);
653: /*        x = rtmp+colscale*ajtmp[j]; */
654:       /* Copy v block into x block */
655:       SSE_INLINE_BEGIN_2(v,x)
656:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
657:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
658:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

660:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
661:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

663:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
664:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

666:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
667:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

669:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
670:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

672:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
673:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

675:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
676:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

678:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
679:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
680:       SSE_INLINE_END_2;

682:       v += 16;
683:     }
684: /*      row = (*bjtmp++)/4; */
685:     row = (unsigned int)(*bjtmp++);
686:     while (row < i) {
687:       pc  = rtmp + 16*row;
688:       SSE_INLINE_BEGIN_1(pc)
689:         /* Load block from lower triangle */
690:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
691:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
692:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

694:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
695:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

697:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
698:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

700:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
701:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

703:         /* Compare block to zero block */

705:         SSE_COPY_PS(XMM4,XMM7)
706:         SSE_CMPNEQ_PS(XMM4,XMM0)

708:         SSE_COPY_PS(XMM5,XMM7)
709:         SSE_CMPNEQ_PS(XMM5,XMM1)

711:         SSE_COPY_PS(XMM6,XMM7)
712:         SSE_CMPNEQ_PS(XMM6,XMM2)

714:         SSE_CMPNEQ_PS(XMM7,XMM3)

716:         /* Reduce the comparisons to one SSE register */
717:         SSE_OR_PS(XMM6,XMM7)
718:         SSE_OR_PS(XMM5,XMM4)
719:         SSE_OR_PS(XMM5,XMM6)
720:       SSE_INLINE_END_1;

722:       /* Reduce the one SSE register to an integer register for branching */
723:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
724:       MOVEMASK(nonzero,XMM5);

726:       /* If block is nonzero ... */
727:       if (nonzero) {
728:         pv = ba + 16*diag_offset[row];
729:         PREFETCH_L1(&pv[16]);
730:         pj = bj + diag_offset[row] + 1;

732:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
733:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
734:         /* but the diagonal was inverted already */
735:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

737:         SSE_INLINE_BEGIN_2(pv,pc)
738:           /* Column 0, product is accumulated in XMM4 */
739:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
740:           SSE_SHUFFLE(XMM4,XMM4,0x00)
741:           SSE_MULT_PS(XMM4,XMM0)

743:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
744:           SSE_SHUFFLE(XMM5,XMM5,0x00)
745:           SSE_MULT_PS(XMM5,XMM1)
746:           SSE_ADD_PS(XMM4,XMM5)

748:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
749:           SSE_SHUFFLE(XMM6,XMM6,0x00)
750:           SSE_MULT_PS(XMM6,XMM2)
751:           SSE_ADD_PS(XMM4,XMM6)

753:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
754:           SSE_SHUFFLE(XMM7,XMM7,0x00)
755:           SSE_MULT_PS(XMM7,XMM3)
756:           SSE_ADD_PS(XMM4,XMM7)

758:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
759:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

761:           /* Column 1, product is accumulated in XMM5 */
762:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
763:           SSE_SHUFFLE(XMM5,XMM5,0x00)
764:           SSE_MULT_PS(XMM5,XMM0)

766:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
767:           SSE_SHUFFLE(XMM6,XMM6,0x00)
768:           SSE_MULT_PS(XMM6,XMM1)
769:           SSE_ADD_PS(XMM5,XMM6)

771:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
772:           SSE_SHUFFLE(XMM7,XMM7,0x00)
773:           SSE_MULT_PS(XMM7,XMM2)
774:           SSE_ADD_PS(XMM5,XMM7)

776:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
777:           SSE_SHUFFLE(XMM6,XMM6,0x00)
778:           SSE_MULT_PS(XMM6,XMM3)
779:           SSE_ADD_PS(XMM5,XMM6)

781:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
782:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

784:           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

786:           /* Column 2, product is accumulated in XMM6 */
787:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
788:           SSE_SHUFFLE(XMM6,XMM6,0x00)
789:           SSE_MULT_PS(XMM6,XMM0)

791:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
792:           SSE_SHUFFLE(XMM7,XMM7,0x00)
793:           SSE_MULT_PS(XMM7,XMM1)
794:           SSE_ADD_PS(XMM6,XMM7)

796:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
797:           SSE_SHUFFLE(XMM7,XMM7,0x00)
798:           SSE_MULT_PS(XMM7,XMM2)
799:           SSE_ADD_PS(XMM6,XMM7)

801:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
802:           SSE_SHUFFLE(XMM7,XMM7,0x00)
803:           SSE_MULT_PS(XMM7,XMM3)
804:           SSE_ADD_PS(XMM6,XMM7)
805: 
806:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
807:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

809:           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
810:           /* Column 3, product is accumulated in XMM0 */
811:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
812:           SSE_SHUFFLE(XMM7,XMM7,0x00)
813:           SSE_MULT_PS(XMM0,XMM7)

815:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
816:           SSE_SHUFFLE(XMM7,XMM7,0x00)
817:           SSE_MULT_PS(XMM1,XMM7)
818:           SSE_ADD_PS(XMM0,XMM1)

820:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
821:           SSE_SHUFFLE(XMM1,XMM1,0x00)
822:           SSE_MULT_PS(XMM1,XMM2)
823:           SSE_ADD_PS(XMM0,XMM1)

825:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
826:           SSE_SHUFFLE(XMM7,XMM7,0x00)
827:           SSE_MULT_PS(XMM3,XMM7)
828:           SSE_ADD_PS(XMM0,XMM3)

830:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
831:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

833:           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
834:           /* This is code to be maintained and read by humans, after all. */
835:           /* Copy Multiplier Col 3 into XMM3 */
836:           SSE_COPY_PS(XMM3,XMM0)
837:           /* Copy Multiplier Col 2 into XMM2 */
838:           SSE_COPY_PS(XMM2,XMM6)
839:           /* Copy Multiplier Col 1 into XMM1 */
840:           SSE_COPY_PS(XMM1,XMM5)
841:           /* Copy Multiplier Col 0 into XMM0 */
842:           SSE_COPY_PS(XMM0,XMM4)
843:         SSE_INLINE_END_2;

845:         /* Update the row: */
846:         nz = bi[row+1] - diag_offset[row] - 1;
847:         pv += 16;
848:         for (j=0; j<nz; j++) {
849:           PREFETCH_L1(&pv[16]);
850:           x = rtmp + 16*((unsigned int)pj[j]);
851: /*            x = rtmp + 4*pj[j]; */

853:           /* X:=X-M*PV, One column at a time */
854:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
855:           SSE_INLINE_BEGIN_2(x,pv)
856:             /* Load First Column of X*/
857:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
858:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

860:             /* Matrix-Vector Product: */
861:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
862:             SSE_SHUFFLE(XMM5,XMM5,0x00)
863:             SSE_MULT_PS(XMM5,XMM0)
864:             SSE_SUB_PS(XMM4,XMM5)

866:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
867:             SSE_SHUFFLE(XMM6,XMM6,0x00)
868:             SSE_MULT_PS(XMM6,XMM1)
869:             SSE_SUB_PS(XMM4,XMM6)

871:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
872:             SSE_SHUFFLE(XMM7,XMM7,0x00)
873:             SSE_MULT_PS(XMM7,XMM2)
874:             SSE_SUB_PS(XMM4,XMM7)

876:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
877:             SSE_SHUFFLE(XMM5,XMM5,0x00)
878:             SSE_MULT_PS(XMM5,XMM3)
879:             SSE_SUB_PS(XMM4,XMM5)

881:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
882:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

884:             /* Second Column */
885:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
886:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

888:             /* Matrix-Vector Product: */
889:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
890:             SSE_SHUFFLE(XMM6,XMM6,0x00)
891:             SSE_MULT_PS(XMM6,XMM0)
892:             SSE_SUB_PS(XMM5,XMM6)

894:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
895:             SSE_SHUFFLE(XMM7,XMM7,0x00)
896:             SSE_MULT_PS(XMM7,XMM1)
897:             SSE_SUB_PS(XMM5,XMM7)

899:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
900:             SSE_SHUFFLE(XMM4,XMM4,0x00)
901:             SSE_MULT_PS(XMM4,XMM2)
902:             SSE_SUB_PS(XMM5,XMM4)

904:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
905:             SSE_SHUFFLE(XMM6,XMM6,0x00)
906:             SSE_MULT_PS(XMM6,XMM3)
907:             SSE_SUB_PS(XMM5,XMM6)
908: 
909:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
910:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

912:             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

914:             /* Third Column */
915:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
916:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

918:             /* Matrix-Vector Product: */
919:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
920:             SSE_SHUFFLE(XMM7,XMM7,0x00)
921:             SSE_MULT_PS(XMM7,XMM0)
922:             SSE_SUB_PS(XMM6,XMM7)

924:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
925:             SSE_SHUFFLE(XMM4,XMM4,0x00)
926:             SSE_MULT_PS(XMM4,XMM1)
927:             SSE_SUB_PS(XMM6,XMM4)

929:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
930:             SSE_SHUFFLE(XMM5,XMM5,0x00)
931:             SSE_MULT_PS(XMM5,XMM2)
932:             SSE_SUB_PS(XMM6,XMM5)

934:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
935:             SSE_SHUFFLE(XMM7,XMM7,0x00)
936:             SSE_MULT_PS(XMM7,XMM3)
937:             SSE_SUB_PS(XMM6,XMM7)
938: 
939:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
940:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
941: 
942:             /* Fourth Column */
943:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
944:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

946:             /* Matrix-Vector Product: */
947:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
948:             SSE_SHUFFLE(XMM5,XMM5,0x00)
949:             SSE_MULT_PS(XMM5,XMM0)
950:             SSE_SUB_PS(XMM4,XMM5)

952:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
953:             SSE_SHUFFLE(XMM6,XMM6,0x00)
954:             SSE_MULT_PS(XMM6,XMM1)
955:             SSE_SUB_PS(XMM4,XMM6)

957:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
958:             SSE_SHUFFLE(XMM7,XMM7,0x00)
959:             SSE_MULT_PS(XMM7,XMM2)
960:             SSE_SUB_PS(XMM4,XMM7)

962:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
963:             SSE_SHUFFLE(XMM5,XMM5,0x00)
964:             SSE_MULT_PS(XMM5,XMM3)
965:             SSE_SUB_PS(XMM4,XMM5)
966: 
967:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
968:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
969:           SSE_INLINE_END_2;
970:           pv   += 16;
971:         }
972:         PetscLogFlops(128*nz+112);
973:       }
974:       row = (unsigned int)(*bjtmp++);
975: /*        row = (*bjtmp++)/4; */
976: /*        bjtmp++; */
977:     }
978:     /* finished row so stick it into b->a */
979:     pv = ba + 16*bi[i];
980:     pj = bj + bi[i];
981:     nz = bi[i+1] - bi[i];

983:     /* Copy x block back into pv block */
984:     for (j=0; j<nz; j++) {
985:       x  = rtmp+16*((unsigned int)pj[j]);
986: /*        x  = rtmp+4*pj[j]; */

988:       SSE_INLINE_BEGIN_2(x,pv)
989:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
990:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
991:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

993:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
994:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

996:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
997:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

999:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1000:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

1002:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1003:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

1005:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1006:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1008:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1009:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

1011:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1012:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1013:       SSE_INLINE_END_2;
1014:       pv += 16;
1015:     }
1016:     /* invert diagonal block */
1017:     w = ba + 16*diag_offset[i];
1018:     if (pivotinblocks) {
1019:       Kernel_A_gets_inverse_A_4(w);
1020:     } else {
1021:       Kernel_A_gets_inverse_A_4_nopivot(w);
1022:     }
1023: /*      Kernel_A_gets_inverse_A_4_SSE(w); */
1024:     /* Note: using Cramer's rule, the flop count below might be unfairly high or low. */
1025:   }

1027:   PetscFree(rtmp);
1028:   C->factor    = FACTOR_LU;
1029:   C->assembled = PETSC_TRUE;
1030:   PetscLogFlops(1.3333*64*b->mbs);
1031:   /* Flop Count from inverting diagonal blocks */
1032:   SSE_SCOPE_END;
1033:   return(0);
1034: }
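/*
   Note on the "usj" (unsigned short j) variants above and below: they
   reinterpret the block-column index array as unsigned shorts to halve the
   index traffic, which is only valid when every block index fits in 16 bits,
   i.e. mbs <= 65536.  The packing helper here is a hypothetical sketch of that
   step, not PETSc API; in PETSc the short indices are assumed to have been
   prepared elsewhere (e.g. during symbolic factorization).
*/
static int SeqBAIJ4_PackColumnIndices16(const int *j32,unsigned short *j16,int nz,int mbs)
{
  int k;
  if (mbs > 65536) return -1;          /* block indices would overflow 16 bits */
  for (k=0; k<nz; k++) j16[k] = (unsigned short)j32[k];
  return 0;
}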

1038: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A,MatFactorInfo *info,Mat *B)
1039: {
1040:   Mat            C = *B;
1041:   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1043:   int  i,j,n = a->mbs;
1044:   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
1045:   unsigned int   row;
1046:   int            *ajtmpold,nz,*bi=b->i;
1047:   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1048:   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1049:   MatScalar      *ba = b->a,*aa = a->a;
1050:   int            nonzero=0;
1051: /*    int            nonzero=0,colscale = 16; */
1052:   PetscTruth     pivotinblocks = b->pivotinblocks;

1055:   SSE_SCOPE_BEGIN;

1057:   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1058:   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1059:   PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);
1060:   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1061: /*    if ((unsigned long)bj==(unsigned long)aj) { */
1062: /*      colscale = 4; */
1063: /*    } */
1064:   if ((unsigned long)bj==(unsigned long)aj) {
1065:     return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(B));
1066:   }
1067: 
1068:   for (i=0; i<n; i++) {
1069:     nz    = bi[i+1] - bi[i];
1070:     bjtmp = bj + bi[i];
1071:     /* zero out the 4x4 block accumulators */
1072:     /* zero out one register */
1073:     XOR_PS(XMM7,XMM7);
1074:     for  (j=0; j<nz; j++) {
1075:       x = rtmp+16*((unsigned int)bjtmp[j]);
1076: /*        x = rtmp+4*bjtmp[j]; */
1077:       SSE_INLINE_BEGIN_1(x)
1078:         /* Copy zero register to memory locations */
1079:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1080:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1081:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1082:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1083:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1084:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1085:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1086:         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1087:         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1088:       SSE_INLINE_END_1;
1089:     }
1090:     /* load in the initial (unfactored) row */
1091:     nz       = ai[i+1] - ai[i];
1092:     ajtmpold = aj + ai[i];
1093:     v        = aa + 16*ai[i];
1094:     for (j=0; j<nz; j++) {
1095:       x = rtmp+16*ajtmpold[j];
1096: /*        x = rtmp+colscale*ajtmpold[j]; */
1097:       /* Copy v block into x block */
1098:       SSE_INLINE_BEGIN_2(v,x)
1099:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1100:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1101:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

1103:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1104:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

1106:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1107:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

1109:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1110:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

1112:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1113:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

1115:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1116:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

1118:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1119:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

1121:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1122:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1123:       SSE_INLINE_END_2;

1125:       v += 16;
1126:     }
1127: /*      row = (*bjtmp++)/4; */
1128:     row = (unsigned int)(*bjtmp++);
1129:     while (row < i) {
1130:       pc  = rtmp + 16*row;
1131:       SSE_INLINE_BEGIN_1(pc)
1132:         /* Load block from lower triangle */
1133:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1134:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1135:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

1137:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1138:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

1140:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1141:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

1143:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1144:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

1146:         /* Compare block to zero block */

1148:         SSE_COPY_PS(XMM4,XMM7)
1149:         SSE_CMPNEQ_PS(XMM4,XMM0)

1151:         SSE_COPY_PS(XMM5,XMM7)
1152:         SSE_CMPNEQ_PS(XMM5,XMM1)

1154:         SSE_COPY_PS(XMM6,XMM7)
1155:         SSE_CMPNEQ_PS(XMM6,XMM2)

1157:         SSE_CMPNEQ_PS(XMM7,XMM3)

1159:         /* Reduce the comparisons to one SSE register */
1160:         SSE_OR_PS(XMM6,XMM7)
1161:         SSE_OR_PS(XMM5,XMM4)
1162:         SSE_OR_PS(XMM5,XMM6)
1163:       SSE_INLINE_END_1;

1165:       /* Reduce the one SSE register to an integer register for branching */
1166:       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1167:       MOVEMASK(nonzero,XMM5);

1169:       /* If block is nonzero ... */
1170:       if (nonzero) {
1171:         pv = ba + 16*diag_offset[row];
1172:         PREFETCH_L1(&pv[16]);
1173:         pj = bj + diag_offset[row] + 1;

1175:         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1176:         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1177:         /* but the diagonal was inverted already */
1178:         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

1180:         SSE_INLINE_BEGIN_2(pv,pc)
1181:           /* Column 0, product is accumulated in XMM4 */
1182:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1183:           SSE_SHUFFLE(XMM4,XMM4,0x00)
1184:           SSE_MULT_PS(XMM4,XMM0)

1186:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1187:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1188:           SSE_MULT_PS(XMM5,XMM1)
1189:           SSE_ADD_PS(XMM4,XMM5)

1191:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1192:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1193:           SSE_MULT_PS(XMM6,XMM2)
1194:           SSE_ADD_PS(XMM4,XMM6)

1196:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1197:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1198:           SSE_MULT_PS(XMM7,XMM3)
1199:           SSE_ADD_PS(XMM4,XMM7)

1201:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1202:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

1204:           /* Column 1, product is accumulated in XMM5 */
1205:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1206:           SSE_SHUFFLE(XMM5,XMM5,0x00)
1207:           SSE_MULT_PS(XMM5,XMM0)

1209:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1210:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1211:           SSE_MULT_PS(XMM6,XMM1)
1212:           SSE_ADD_PS(XMM5,XMM6)

1214:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1215:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1216:           SSE_MULT_PS(XMM7,XMM2)
1217:           SSE_ADD_PS(XMM5,XMM7)

1219:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1220:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1221:           SSE_MULT_PS(XMM6,XMM3)
1222:           SSE_ADD_PS(XMM5,XMM6)

1224:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1225:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

1227:           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

1229:           /* Column 2, product is accumulated in XMM6 */
1230:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1231:           SSE_SHUFFLE(XMM6,XMM6,0x00)
1232:           SSE_MULT_PS(XMM6,XMM0)

1234:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1235:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1236:           SSE_MULT_PS(XMM7,XMM1)
1237:           SSE_ADD_PS(XMM6,XMM7)

1239:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1240:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1241:           SSE_MULT_PS(XMM7,XMM2)
1242:           SSE_ADD_PS(XMM6,XMM7)

1244:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1245:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1246:           SSE_MULT_PS(XMM7,XMM3)
1247:           SSE_ADD_PS(XMM6,XMM7)
1248: 
1249:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1250:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1252:           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1253:           /* Column 3, product is accumulated in XMM0 */
1254:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1255:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1256:           SSE_MULT_PS(XMM0,XMM7)

1258:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1259:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1260:           SSE_MULT_PS(XMM1,XMM7)
1261:           SSE_ADD_PS(XMM0,XMM1)

1263:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1264:           SSE_SHUFFLE(XMM1,XMM1,0x00)
1265:           SSE_MULT_PS(XMM1,XMM2)
1266:           SSE_ADD_PS(XMM0,XMM1)

1268:           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1269:           SSE_SHUFFLE(XMM7,XMM7,0x00)
1270:           SSE_MULT_PS(XMM3,XMM7)
1271:           SSE_ADD_PS(XMM0,XMM3)

1273:           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1274:           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

1276:           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1277:           /* This is code to be maintained and read by humans, after all. */
1278:           /* Copy Multiplier Col 3 into XMM3 */
1279:           SSE_COPY_PS(XMM3,XMM0)
1280:           /* Copy Multiplier Col 2 into XMM2 */
1281:           SSE_COPY_PS(XMM2,XMM6)
1282:           /* Copy Multiplier Col 1 into XMM1 */
1283:           SSE_COPY_PS(XMM1,XMM5)
1284:           /* Copy Multiplier Col 0 into XMM0 */
1285:           SSE_COPY_PS(XMM0,XMM4)
1286:         SSE_INLINE_END_2;

1288:         /* Update the row: */
1289:         nz = bi[row+1] - diag_offset[row] - 1;
1290:         pv += 16;
1291:         for (j=0; j<nz; j++) {
1292:           PREFETCH_L1(&pv[16]);
1293:           x = rtmp + 16*((unsigned int)pj[j]);
1294: /*            x = rtmp + 4*pj[j]; */

1296:           /* X:=X-M*PV, One column at a time */
1297:           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1298:           SSE_INLINE_BEGIN_2(x,pv)
1299:             /* Load First Column of X*/
1300:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1301:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

1303:             /* Matrix-Vector Product: */
1304:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1305:             SSE_SHUFFLE(XMM5,XMM5,0x00)
1306:             SSE_MULT_PS(XMM5,XMM0)
1307:             SSE_SUB_PS(XMM4,XMM5)

1309:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1310:             SSE_SHUFFLE(XMM6,XMM6,0x00)
1311:             SSE_MULT_PS(XMM6,XMM1)
1312:             SSE_SUB_PS(XMM4,XMM6)

1314:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1315:             SSE_SHUFFLE(XMM7,XMM7,0x00)
1316:             SSE_MULT_PS(XMM7,XMM2)
1317:             SSE_SUB_PS(XMM4,XMM7)

1319:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1320:             SSE_SHUFFLE(XMM5,XMM5,0x00)
1321:             SSE_MULT_PS(XMM5,XMM3)
1322:             SSE_SUB_PS(XMM4,XMM5)

1324:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1325:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

1327:             /* Second Column */
1328:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1329:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

1331:             /* Matrix-Vector Product: */
1332:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1333:             SSE_SHUFFLE(XMM6,XMM6,0x00)
1334:             SSE_MULT_PS(XMM6,XMM0)
1335:             SSE_SUB_PS(XMM5,XMM6)

1337:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1338:             SSE_SHUFFLE(XMM7,XMM7,0x00)
1339:             SSE_MULT_PS(XMM7,XMM1)
1340:             SSE_SUB_PS(XMM5,XMM7)

1342:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1343:             SSE_SHUFFLE(XMM4,XMM4,0x00)
1344:             SSE_MULT_PS(XMM4,XMM2)
1345:             SSE_SUB_PS(XMM5,XMM4)

1347:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1348:             SSE_SHUFFLE(XMM6,XMM6,0x00)
1349:             SSE_MULT_PS(XMM6,XMM3)
1350:             SSE_SUB_PS(XMM5,XMM6)
1351: 
1352:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1353:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

1355:             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

1357:             /* Third Column */
1358:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1359:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

1361:             /* Matrix-Vector Product: */
1362:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1363:             SSE_SHUFFLE(XMM7,XMM7,0x00)
1364:             SSE_MULT_PS(XMM7,XMM0)
1365:             SSE_SUB_PS(XMM6,XMM7)

1367:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1368:             SSE_SHUFFLE(XMM4,XMM4,0x00)
1369:             SSE_MULT_PS(XMM4,XMM1)
1370:             SSE_SUB_PS(XMM6,XMM4)

1372:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1373:             SSE_SHUFFLE(XMM5,XMM5,0x00)
1374:             SSE_MULT_PS(XMM5,XMM2)
1375:             SSE_SUB_PS(XMM6,XMM5)

1377:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1378:             SSE_SHUFFLE(XMM7,XMM7,0x00)
1379:             SSE_MULT_PS(XMM7,XMM3)
1380:             SSE_SUB_PS(XMM6,XMM7)
1381: 
1382:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1383:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1384: 
1385:             /* Fourth Column */
1386:             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1387:             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

1389:             /* Matrix-Vector Product: */
1390:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1391:             SSE_SHUFFLE(XMM5,XMM5,0x00)
1392:             SSE_MULT_PS(XMM5,XMM0)
1393:             SSE_SUB_PS(XMM4,XMM5)

1395:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1396:             SSE_SHUFFLE(XMM6,XMM6,0x00)
1397:             SSE_MULT_PS(XMM6,XMM1)
1398:             SSE_SUB_PS(XMM4,XMM6)

1400:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1401:             SSE_SHUFFLE(XMM7,XMM7,0x00)
1402:             SSE_MULT_PS(XMM7,XMM2)
1403:             SSE_SUB_PS(XMM4,XMM7)

1405:             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1406:             SSE_SHUFFLE(XMM5,XMM5,0x00)
1407:             SSE_MULT_PS(XMM5,XMM3)
1408:             SSE_SUB_PS(XMM4,XMM5)
1409: 
1410:             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1411:             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1412:           SSE_INLINE_END_2;
1413:           pv   += 16;
1414:         }
1415:         PetscLogFlops(128*nz+112);
1416:       }
1417:       row = (unsigned int)(*bjtmp++);
1418: /*        row = (*bjtmp++)/4; */
1419: /*        bjtmp++; */
1420:     }
1421:     /* finished row so stick it into b->a */
1422:     pv = ba + 16*bi[i];
1423:     pj = bj + bi[i];
1424:     nz = bi[i+1] - bi[i];

1426:     /* Copy x block back into pv block */
1427:     for (j=0; j<nz; j++) {
1428:       x  = rtmp+16*((unsigned int)pj[j]);
1429: /*        x  = rtmp+4*pj[j]; */

1431:       SSE_INLINE_BEGIN_2(x,pv)
1432:         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1433:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1434:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

1436:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1437:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

1439:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1440:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

1442:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1443:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

1445:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1446:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

1448:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1449:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

1451:         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1452:         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

1454:         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1455:         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1456:       SSE_INLINE_END_2;
1457:       pv += 16;
1458:     }
1459:     /* invert diagonal block */
1460:     w = ba + 16*diag_offset[i];
1461:     if (pivotinblocks) {
1462:       Kernel_A_gets_inverse_A_4(w);
1463:     } else {
1464:       Kernel_A_gets_inverse_A_4_nopivot(w);
1465:     }
1466: /*      Kernel_A_gets_inverse_A_4_SSE(w); */
1467:     /* Note: using Cramer's rule, the flop count below might be unfairly high or low. */
1468:   }

1470:   PetscFree(rtmp);
1471:   C->factor    = FACTOR_LU;
1472:   C->assembled = PETSC_TRUE;
1473:   PetscLogFlops(1.3333*64*b->mbs);
1474:   /* Flop Count from inverting diagonal blocks */
1475:   SSE_SCOPE_END;
1476:   return(0);
1477: }

1479: #endif