Actual source code: dgefa4.c

  1: #define PETSCMAT_DLL

  3: /*
  4:        Inverts a 4 by 4 matrix using partial pivoting.

  6:        Used by the sparse factorization routines in 
  7:      src/mat/impls/baij/seq and src/mat/impls/bdiag/seq

  9:        See also src/inline/ilu.h

 11:        This is a combination of the Linpack routines
 12:     dgefa() and dgedi() specialized for a size of 4.

 14: */
 15:  #include "petsc.h"

 19: PetscErrorCode Kernel_A_gets_inverse_A_4(MatScalar *a)
 20: {
 21:     PetscInt   i__2,i__3,kp1,j,k,l,ll,i,ipvt[4],kb,k3;
 22:     PetscInt   k4,j3;
 23:     MatScalar  *aa,*ax,*ay,work[16],stmp;
 24:     MatReal    tmp,max;

 26: /*     gaussian elimination with partial pivoting */

 29:     /* Parameter adjustments */
 30:     a       -= 5;

 32:     for (k = 1; k <= 3; ++k) {
 33:         kp1 = k + 1;
 34:         k3  = 4*k;
 35:         k4  = k3 + k;
 36: /*        find l = pivot index */

 38:         i__2 = 4 - k;
 39:         aa = &a[k4];
 40:         max = PetscAbsScalar(aa[0]);
 41:         l = 1;
 42:         for (ll=1; ll<i__2; ll++) {
 43:           tmp = PetscAbsScalar(aa[ll]);
 44:           if (tmp > max) { max = tmp; l = ll+1;}
 45:         }
 46:         l       += k - 1;
 47:         ipvt[k-1] = l;

 49:         if (a[l + k3] == 0.0) {
 50:           SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot, row %D",k-1);
 51:         }

 53: /*           interchange if necessary */

 55:         if (l != k) {
 56:           stmp      = a[l + k3];
 57:           a[l + k3] = a[k4];
 58:           a[k4]     = stmp;
 59:         }

 61: /*           compute multipliers */

 63:         stmp = -1. / a[k4];
 64:         i__2 = 4 - k;
 65:         aa = &a[1 + k4];
 66:         for (ll=0; ll<i__2; ll++) {
 67:           aa[ll] *= stmp;
 68:         }

 70: /*           row elimination with column indexing */

 72:         ax = &a[k4+1];
 73:         for (j = kp1; j <= 4; ++j) {
 74:             j3   = 4*j;
 75:             stmp = a[l + j3];
 76:             if (l != k) {
 77:               a[l + j3] = a[k + j3];
 78:               a[k + j3] = stmp;
 79:             }

 81:             i__3 = 4 - k;
 82:             ay = &a[1+k+j3];
 83:             for (ll=0; ll<i__3; ll++) {
 84:               ay[ll] += stmp*ax[ll];
 85:             }
 86:         }
 87:     }
 88:     ipvt[3] = 4;
 89:     if (a[20] == 0.0) {
 90:       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot, row %D",3);
 91:     }
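    /* The factorization phase (the dgefa part) is now complete: a[] holds the
       LU factors of the 4x4 block and ipvt[] records the row interchanges. */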

 93:     /*
 94:          Now form the inverse 
 95:     */

 97:    /*     compute inverse(u) */

 99:     for (k = 1; k <= 4; ++k) {
100:         k3    = 4*k;
101:         k4    = k3 + k;
102:         a[k4] = 1.0 / a[k4];
103:         stmp  = -a[k4];
104:         i__2  = k - 1;
105:         aa    = &a[k3 + 1];
106:         for (ll=0; ll<i__2; ll++) aa[ll] *= stmp;
107:         kp1 = k + 1;
108:         if (4 < kp1) continue;
109:         ax = aa;
110:         for (j = kp1; j <= 4; ++j) {
111:             j3        = 4*j;
112:             stmp      = a[k + j3];
113:             a[k + j3] = 0.0;
114:             ay        = &a[j3 + 1];
115:             for (ll=0; ll<k; ll++) {
116:               ay[ll] += stmp*ax[ll];
117:             }
118:         }
119:     }

121:    /*    form inverse(u)*inverse(l) */

123:     for (kb = 1; kb <= 3; ++kb) {
124:         k   = 4 - kb;
125:         k3  = 4*k;
126:         kp1 = k + 1;
127:         aa  = a + k3;
128:         for (i = kp1; i <= 4; ++i) {
129:             work[i-1] = aa[i];
130:             aa[i]   = 0.0;
131:         }
132:         for (j = kp1; j <= 4; ++j) {
133:             stmp  = work[j-1];
134:             ax    = &a[4*j + 1];
135:             ay    = &a[k3 + 1];
136:             ay[0] += stmp*ax[0];
137:             ay[1] += stmp*ax[1];
138:             ay[2] += stmp*ax[2];
139:             ay[3] += stmp*ax[3];
140:         }
141:         l = ipvt[k-1];
142:         if (l != k) {
143:             ax = &a[k3 + 1];
144:             ay = &a[4*l + 1];
145:             stmp = ax[0]; ax[0] = ay[0]; ay[0] = stmp;
146:             stmp = ax[1]; ax[1] = ay[1]; ay[1] = stmp;
147:             stmp = ax[2]; ax[2] = ay[2]; ay[2] = stmp;
148:             stmp = ax[3]; ax[3] = ay[3]; ay[3] = stmp;
149:         }
150:     }
151:     return(0);
152: }
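/*
   A minimal usage sketch (the function name and values here are illustrative,
   not part of dgefa4.c): Kernel_A_gets_inverse_A_4() overwrites a column-major
   4x4 block with its inverse in place.  MatScalar is double in a standard
   real, double-precision PETSc build.
*/
static PetscErrorCode Example_Invert4x4Block(void)
{
  /* diag(4,3,2,1), stored column by column */
  MatScalar      b[16] = {4.,0.,0.,0., 0.,3.,0.,0., 0.,0.,2.,0., 0.,0.,0.,1.};
  PetscErrorCode ierr;

  ierr = Kernel_A_gets_inverse_A_4(b);CHKERRQ(ierr);
  /* b now holds diag(1/4,1/3,1/2,1) */
  return(0);
}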

154: #if defined(PETSC_HAVE_SSE)
155: #include PETSC_HAVE_SSE

159: PetscErrorCode Kernel_A_gets_inverse_A_4_SSE(float *a)
160: {
161:   /* 
162:      This routine is converted from Intel's Small Matrix Library.
163:      See: Streaming SIMD Extensions -- Inverse of 4x4 Matrix
164:      Order Number: 245043-001
165:      March 1999
166:      http://www.intel.com

 168:      Inverse of a 4x4 matrix via Cramer's Rule:
169:      bool Invert4x4(SMLXMatrix &);
170:   */
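  /*
     Concretely, the code below forms the cofactors of the input with packed
     4-wide multiplies, computes det(A) as the dot product of a row of A with
     its cofactors, refines a reciprocal approximation of det(A) with one
     Newton-Raphson step, and scales the cofactor (adjugate) rows by 1/det(A),
     since inv(A) = adj(A)/det(A).
  */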

173:   SSE_SCOPE_BEGIN;
174:     SSE_INLINE_BEGIN_1(a)

176: /* ----------------------------------------------- */

178:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
179:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_4,XMM0)

181:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
182:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_12,XMM5)

184:       SSE_COPY_PS(XMM3,XMM0)
185:       SSE_SHUFFLE(XMM3,XMM5,0x88)

187:       SSE_SHUFFLE(XMM5,XMM0,0xDD)

189:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_2,XMM0)
190:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM0)

192:       SSE_LOADL_PS(SSE_ARG_1,FLOAT_10,XMM6)
193:       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM6)

195:       SSE_COPY_PS(XMM4,XMM0)
196:       SSE_SHUFFLE(XMM4,XMM6,0x88)

198:       SSE_SHUFFLE(XMM6,XMM0,0xDD)

200: /* ----------------------------------------------- */

202:       SSE_COPY_PS(XMM7,XMM4)
203:       SSE_MULT_PS(XMM7,XMM6)

205:       SSE_SHUFFLE(XMM7,XMM7,0xB1)

207:       SSE_COPY_PS(XMM0,XMM5)
208:       SSE_MULT_PS(XMM0,XMM7)

210:       SSE_COPY_PS(XMM2,XMM3)
211:       SSE_MULT_PS(XMM2,XMM7)

213:       SSE_SHUFFLE(XMM7,XMM7,0x4E)

215:       SSE_COPY_PS(XMM1,XMM5)
216:       SSE_MULT_PS(XMM1,XMM7)
217:       SSE_SUB_PS(XMM1,XMM0)

219:       SSE_MULT_PS(XMM7,XMM3)
220:       SSE_SUB_PS(XMM7,XMM2)

222:       SSE_SHUFFLE(XMM7,XMM7,0x4E)
223:       SSE_STORE_PS(SSE_ARG_1,FLOAT_4,XMM7)

225: /* ----------------------------------------------- */

227:       SSE_COPY_PS(XMM0,XMM5)
228:       SSE_MULT_PS(XMM0,XMM4)

230:       SSE_SHUFFLE(XMM0,XMM0,0xB1)

232:       SSE_COPY_PS(XMM2,XMM6)
233:       SSE_MULT_PS(XMM2,XMM0)
234:       SSE_ADD_PS(XMM2,XMM1)
235: 
236:       SSE_COPY_PS(XMM7,XMM3)
237:       SSE_MULT_PS(XMM7,XMM0)

239:       SSE_SHUFFLE(XMM0,XMM0,0x4E)

241:       SSE_COPY_PS(XMM1,XMM6)
242:       SSE_MULT_PS(XMM1,XMM0)
243:       SSE_SUB_PS(XMM2,XMM1)

245:       SSE_MULT_PS(XMM0,XMM3)
246:       SSE_SUB_PS(XMM0,XMM7)

248:       SSE_SHUFFLE(XMM0,XMM0,0x4E)
249:       SSE_STORE_PS(SSE_ARG_1,FLOAT_12,XMM0)

251:       /* ----------------------------------------------- */

253:       SSE_COPY_PS(XMM7,XMM5)
254:       SSE_SHUFFLE(XMM7,XMM5,0x4E)
255:       SSE_MULT_PS(XMM7,XMM6)

257:       SSE_SHUFFLE(XMM7,XMM7,0xB1)

259:       SSE_SHUFFLE(XMM4,XMM4,0x4E)

261:       SSE_COPY_PS(XMM0,XMM4)
262:       SSE_MULT_PS(XMM0,XMM7)
263:       SSE_ADD_PS(XMM0,XMM2)

265:       SSE_COPY_PS(XMM2,XMM3)
266:       SSE_MULT_PS(XMM2,XMM7)

268:       SSE_SHUFFLE(XMM7,XMM7,0x4E)

270:       SSE_COPY_PS(XMM1,XMM4)
271:       SSE_MULT_PS(XMM1,XMM7)
272:       SSE_SUB_PS(XMM0,XMM1)
273:       SSE_STORE_PS(SSE_ARG_1,FLOAT_0,XMM0)

275:       SSE_MULT_PS(XMM7,XMM3)
276:       SSE_SUB_PS(XMM7,XMM2)

278:       SSE_SHUFFLE(XMM7,XMM7,0x4E)

280:       /* ----------------------------------------------- */

282:       SSE_COPY_PS(XMM1,XMM3)
283:       SSE_MULT_PS(XMM1,XMM5)

285:       SSE_SHUFFLE(XMM1,XMM1,0xB1)

287:       SSE_COPY_PS(XMM0,XMM6)
288:       SSE_MULT_PS(XMM0,XMM1)
289:       SSE_ADD_PS(XMM0,XMM7)
290: 
291:       SSE_COPY_PS(XMM2,XMM4)
292:       SSE_MULT_PS(XMM2,XMM1)
293:       SSE_SUB_PS_M(XMM2,SSE_ARG_1,FLOAT_12)

295:       SSE_SHUFFLE(XMM1,XMM1,0x4E)

297:       SSE_COPY_PS(XMM7,XMM6)
298:       SSE_MULT_PS(XMM7,XMM1)
299:       SSE_SUB_PS(XMM7,XMM0)

301:       SSE_MULT_PS(XMM1,XMM4)
302:       SSE_SUB_PS(XMM2,XMM1)
303:       SSE_STORE_PS(SSE_ARG_1,FLOAT_12,XMM2)

305:       /* ----------------------------------------------- */

307:       SSE_COPY_PS(XMM1,XMM3)
308:       SSE_MULT_PS(XMM1,XMM6)

310:       SSE_SHUFFLE(XMM1,XMM1,0xB1)

312:       SSE_COPY_PS(XMM2,XMM4)
313:       SSE_MULT_PS(XMM2,XMM1)
314:       SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM0)
315:       SSE_SUB_PS(XMM0,XMM2)

317:       SSE_COPY_PS(XMM2,XMM5)
318:       SSE_MULT_PS(XMM2,XMM1)
319:       SSE_ADD_PS(XMM2,XMM7)

321:       SSE_SHUFFLE(XMM1,XMM1,0x4E)

323:       SSE_COPY_PS(XMM7,XMM4)
324:       SSE_MULT_PS(XMM7,XMM1)
325:       SSE_ADD_PS(XMM7,XMM0)

327:       SSE_MULT_PS(XMM1,XMM5)
328:       SSE_SUB_PS(XMM2,XMM1)

330:       /* ----------------------------------------------- */

332:       SSE_MULT_PS(XMM4,XMM3)

334:       SSE_SHUFFLE(XMM4,XMM4,0xB1)

336:       SSE_COPY_PS(XMM1,XMM6)
337:       SSE_MULT_PS(XMM1,XMM4)
338:       SSE_ADD_PS(XMM1,XMM7)

340:       SSE_COPY_PS(XMM0,XMM5)
341:       SSE_MULT_PS(XMM0,XMM4)
342:       SSE_LOAD_PS(SSE_ARG_1,FLOAT_12,XMM7)
343:       SSE_SUB_PS(XMM7,XMM0)

345:       SSE_SHUFFLE(XMM4,XMM4,0x4E)

347:       SSE_MULT_PS(XMM6,XMM4)
348:       SSE_SUB_PS(XMM1,XMM6)

350:       SSE_MULT_PS(XMM5,XMM4)
351:       SSE_ADD_PS(XMM5,XMM7)

353:       /* ----------------------------------------------- */

355:       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
356:       SSE_MULT_PS(XMM3,XMM0)

358:       SSE_COPY_PS(XMM4,XMM3)
359:       SSE_SHUFFLE(XMM4,XMM3,0x4E)
360:       SSE_ADD_PS(XMM4,XMM3)

362:       SSE_COPY_PS(XMM6,XMM4)
363:       SSE_SHUFFLE(XMM6,XMM4,0xB1)
364:       SSE_ADD_SS(XMM6,XMM4)

366:       SSE_COPY_PS(XMM3,XMM6)
367:       SSE_RECIP_SS(XMM3,XMM6)
368:       SSE_COPY_SS(XMM4,XMM3)
369:       SSE_ADD_SS(XMM4,XMM3)
370:       SSE_MULT_SS(XMM3,XMM3)
371:       SSE_MULT_SS(XMM6,XMM3)
372:       SSE_SUB_SS(XMM4,XMM6)

374:       SSE_SHUFFLE(XMM4,XMM4,0x00)

376:       SSE_MULT_PS(XMM0,XMM4)
377:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM0)
378:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM0)

380:       SSE_MULT_PS(XMM1,XMM4)
381:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM1)
382:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM1)

384:       SSE_MULT_PS(XMM2,XMM4)
385:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM2)
386:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM2)

388:       SSE_MULT_PS(XMM4,XMM5)
389:       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
390:       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)

392:       /* ----------------------------------------------- */

394:       SSE_INLINE_END_1;
395:   SSE_SCOPE_END;

397:   return(0);
398: }

400: #endif