Actual source code: blockinvert.h
1: /*
2: Kernels used in sparse ILU (and LU) and in the resulting triangular
3: solves. These are for block algorithms where the block sizes are on
4: the order of 2-6+.
6: There are TWO versions of the macros below.
7: 1) standard for MatScalar == PetscScalar use the standard BLAS for
8: block size larger than 7 and
9: 2) handcoded Fortran single precision for the matrices, since BLAS
10: does not have some arguments in single and some in double.
12: */
15: #include <petscblaslapack.h>
17: /*
18: These are C kernels,they are contained in
19: src/mat/impls/baij/seq
20: */
27: #define Kernel_A_gets_inverse_A_4_nopivot(mat) 0;\
28: {\
29: MatScalar d, di;\
30: \
31: di = mat[0];\
32: mat[0] = d = 1.0 / di;\
33: mat[4] *= -d;\
34: mat[8] *= -d;\
35: mat[12] *= -d;\
36: mat[1] *= d;\
37: mat[2] *= d;\
38: mat[3] *= d;\
39: mat[5] += mat[4] * mat[1] * di;\
40: mat[6] += mat[4] * mat[2] * di;\
41: mat[7] += mat[4] * mat[3] * di;\
42: mat[9] += mat[8] * mat[1] * di;\
43: mat[10] += mat[8] * mat[2] * di;\
44: mat[11] += mat[8] * mat[3] * di;\
45: mat[13] += mat[12] * mat[1] * di;\
46: mat[14] += mat[12] * mat[2] * di;\
47: mat[15] += mat[12] * mat[3] * di;\
48: di = mat[5];\
49: mat[5] = d = 1.0 / di;\
50: mat[1] *= -d;\
51: mat[9] *= -d;\
52: mat[13] *= -d;\
53: mat[4] *= d;\
54: mat[6] *= d;\
55: mat[7] *= d;\
56: mat[0] += mat[1] * mat[4] * di;\
57: mat[2] += mat[1] * mat[6] * di;\
58: mat[3] += mat[1] * mat[7] * di;\
59: mat[8] += mat[9] * mat[4] * di;\
60: mat[10] += mat[9] * mat[6] * di;\
61: mat[11] += mat[9] * mat[7] * di;\
62: mat[12] += mat[13] * mat[4] * di;\
63: mat[14] += mat[13] * mat[6] * di;\
64: mat[15] += mat[13] * mat[7] * di;\
65: di = mat[10];\
66: mat[10] = d = 1.0 / di;\
67: mat[2] *= -d;\
68: mat[6] *= -d;\
69: mat[14] *= -d;\
70: mat[8] *= d;\
71: mat[9] *= d;\
72: mat[11] *= d;\
73: mat[0] += mat[2] * mat[8] * di;\
74: mat[1] += mat[2] * mat[9] * di;\
75: mat[3] += mat[2] * mat[11] * di;\
76: mat[4] += mat[6] * mat[8] * di;\
77: mat[5] += mat[6] * mat[9] * di;\
78: mat[7] += mat[6] * mat[11] * di;\
79: mat[12] += mat[14] * mat[8] * di;\
80: mat[13] += mat[14] * mat[9] * di;\
81: mat[15] += mat[14] * mat[11] * di;\
82: di = mat[15];\
83: mat[15] = d = 1.0 / di;\
84: mat[3] *= -d;\
85: mat[7] *= -d;\
86: mat[11] *= -d;\
87: mat[12] *= d;\
88: mat[13] *= d;\
89: mat[14] *= d;\
90: mat[0] += mat[3] * mat[12] * di;\
91: mat[1] += mat[3] * mat[13] * di;\
92: mat[2] += mat[3] * mat[14] * di;\
93: mat[4] += mat[7] * mat[12] * di;\
94: mat[5] += mat[7] * mat[13] * di;\
95: mat[6] += mat[7] * mat[14] * di;\
96: mat[8] += mat[11] * mat[12] * di;\
97: mat[9] += mat[11] * mat[13] * di;\
98: mat[10] += mat[11] * mat[14] * di;\
99: }
102: # if defined(PETSC_HAVE_SSE)
104: # endif
111: /*
112: A = inv(A) A_gets_inverse_A
114: A - square bs by bs array stored in column major order
115: pivots - integer work array of length bs
116: W - bs by 1 work array
117: */
118: #define Kernel_A_gets_inverse_A(bs,A,pivots,W) (LINPACKdgefa((A),(bs),(pivots)) || LINPACKdgedi((A),(bs),(pivots),(W)))
120: /* -----------------------------------------------------------------------*/
122: #if !defined(PETSC_USE_REAL_MAT_SINGLE)
123: /*
124: Version that calls the BLAS directly
125: */
126: /*
127: A = A * B A_gets_A_times_B
129: A, B - square bs by bs arrays stored in column major order
130: W - square bs by bs work array
132: */
133: #define Kernel_A_gets_A_times_B(bs,A,B,W) \
134: { \
135: PetscBLASInt _bbs; \
136: PetscScalar _one = 1.0,_zero = 0.0; \
137: PetscErrorCode _ierr; \
138: _bbs = PetscBLASIntCast(bs); \
139: _PetscMemcpy((W),(A),(bs)*(bs)*sizeof(MatScalar));CHKERRQ(_ierr); \
140: BLASgemm_("N","N",&(_bbs),&(_bbs),&(_bbs),&_one,(W),&(_bbs),(B),&(_bbs),&_zero,(A),&(_bbs));\
141: }
143: /*
145: A = A - B * C A_gets_A_minus_B_times_C
147: A, B, C - square bs by bs arrays stored in column major order
148: */
149: #define Kernel_A_gets_A_minus_B_times_C(bs,A,B,C) \
150: { \
151: PetscScalar _mone = -1.0,_one = 1.0; \
152: PetscBLASInt _bbs = PetscBLASIntCast(bs); \
153: BLASgemm_("N","N",&(_bbs),&(_bbs),&(_bbs),&_mone,(B),&(_bbs),(C),&(_bbs),&_one,(A),&(_bbs));\
154: }
156: /*
158: A = A + B^T * C A_gets_A_plus_Btranspose_times_C
160: A, B, C - square bs by bs arrays stored in column major order
161: */
162: #define Kernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C) \
163: { \
164: PetscScalar _one = 1.0; \
165: PetscBLASInt _bbs = PetscBLASIntCast(bs); \
166: BLASgemm_("T","N",&(_bbs),&(_bbs),&(_bbs),&_one,(B),&(_bbs),(C),&(_bbs),&_one,(A),&(_bbs));\
167: }
169: /*
170: v = v + A^T w v_gets_v_plus_Atranspose_times_w
172: v - array of length bs
173: A - square bs by bs array
174: w - array of length bs
175: */
176: #define Kernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w) \
177: { \
178: PetscScalar _one = 1.0; \
179: PetscBLASInt _ione = 1, _bbs = PetscBLASIntCast(bs); \
180: BLASgemv_("T",&(_bbs),&(_bbs),&_one,A,&(_bbs),w,&_ione,&_one,v,&_ione); \
181: }
183: /*
184: v = v - A w v_gets_v_minus_A_times_w
186: v - array of length bs
187: A - square bs by bs array
188: w - array of length bs
189: */
190: #define Kernel_v_gets_v_minus_A_times_w(bs,v,A,w) \
191: { \
192: PetscScalar _mone = -1.0,_one = 1.0; \
193: PetscBLASInt _ione = 1,_bbs = PetscBLASIntCast(bs); \
194: BLASgemv_("N",&(_bbs),&(_bbs),&_mone,A,&(_bbs),w,&_ione,&_one,v,&_ione); \
195: }
197: /*
198: v = v - A w v_gets_v_minus_transA_times_w
200: v - array of length bs
201: A - square bs by bs array
202: w - array of length bs
203: */
204: #define Kernel_v_gets_v_minus_transA_times_w(bs,v,A,w) \
205: { \
206: PetscScalar _mone = -1.0,_one = 1.0; \
207: PetscBLASInt _ione = 1,_bbs = PetscBLASIntCast(bs); \
208: BLASgemv_("T",&(_bbs),&(_bbs),&_mone,A,&(_bbs),w,&_ione,&_one,v,&_ione); \
209: }
211: /*
212: v = v + A w v_gets_v_plus_A_times_w
214: v - array of length bs
215: A - square bs by bs array
216: w - array of length bs
217: */
218: #define Kernel_v_gets_v_plus_A_times_w(bs,v,A,w) \
219: { \
220: PetscScalar _one = 1.0; \
221: PetscBLASInt _ione = 1,_bbs = PetscBLASIntCast(bs); \
222: BLASgemv_("N",&(_bbs),&(_bbs),&_one,A,&(_bbs),w,&_ione,&_one,v,&_ione); \
223: }
225: /*
226: v = v + A w v_gets_v_plus_Ar_times_w
228: v - array of length bs
229: A - square bs by bs array
230: w - array of length bs
231: */
232: #define Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,v,A,w) \
233: { \
234: PetscScalar _one = 1.0; \
235: PetscBLASInt _ione = 1,_bbs,_bncols; \
236: _bbs = PetscBLASIntCast(bs); _bncols = PetscBLASIntCast(ncols); \
237: BLASgemv_("N",&(_bbs),&(_bncols),&_one,A,&(_bbs),v,&_ione,&_one,w,&_ione); \
238: }
240: /*
241: v = v - A w v_gets_v_minus_Ar_times_w
243: v - array of length ncol
244: A(bs,ncols)
245: w - array of length bs
246: */
247: #define Kernel_w_gets_w_minus_Ar_times_v(bs,ncols,w,A,v) \
248: { \
249: PetscScalar _one = 1.0,_mone = -1.0; \
250: PetscBLASInt _ione = 1,_bbs,_bncols; \
251: _bbs = PetscBLASIntCast(bs); _bncols = PetscBLASIntCast(ncols); \
252: BLASgemv_("N",&(_bbs),&(_bncols),&_mone,A,&(_bbs),v,&_ione,&_one,w,&_ione); \
253: }
255: /*
256: w = A v w_gets_A_times_v
258: v - array of length bs
259: A - square bs by bs array
260: w - array of length bs
261: */
262: #define Kernel_w_gets_A_times_v(bs,v,A,w) \
263: { \
264: PetscScalar _zero = 0.0,_one = 1.0; \
265: PetscBLASInt _ione = 1,_bbs = PetscBLASIntCast(bs); \
266: BLASgemv_("N",&(_bbs),&(_bbs),&_one,A,&(_bbs),v,&_ione,&_zero,w,&_ione); \
267: }
269: /*
270: w = A' v w_gets_transA_times_v
272: v - array of length bs
273: A - square bs by bs array
274: w - array of length bs
275: */
276: #define Kernel_w_gets_transA_times_v(bs,v,A,w) \
277: { \
278: PetscScalar _zero = 0.0,_one = 1.0; \
279: PetscBLASInt _ione = 1,_bbs = PetscBLASIntCast(bs); \
280: BLASgemv_("T",&(_bbs),&(_bbs),&_one,A,&(_bbs),v,&_ione,&_zero,w,&_ione); \
281: }
283: /*
284: z = A*x
285: */
286: #define Kernel_w_gets_Ar_times_v(bs,ncols,x,A,z) \
287: { \
288: PetscScalar _one = 1.0,_zero = 0.0; \
289: PetscBLASInt _ione = 1,_bbs,_bncols; \
290: _bbs = PetscBLASIntCast(bs); _bncols = PetscBLASIntCast(ncols); \
291: BLASgemv_("N",&(_bbs),&_bncols,&_one,A,&(_bbs),x,&_ione,&_zero,z,&_ione); \
292: }
294: /*
295: z = trans(A)*x
296: */
297: #define Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) \
298: { \
299: PetscScalar _one = 1.0; \
300: PetscBLASInt _ione = 1,_bbs,_bncols;\
301: _bbs = PetscBLASIntCast(bs); _bncols = PetscBLASIntCast(ncols); \
302: BLASgemv_("T",&_bbs,&_bncols,&_one,A,&_bbs,x,&_ione,&_one,z,&_ione); \
303: }
305: #else
306: /*
307: Version that calls Fortran routines; can handle different precision
308: of matrix (array) and vectors
309: */
310: /*
311: These are Fortran kernels: They replace certain BLAS routines but
312: have some arguments that may be single precision,rather than double
313: These routines are provided in src/fortran/kernels/sgemv.F
314: They are pretty pitiful but get the job done. The intention is
315: that for important block sizes (currently 1,2,3,4,5,6,7) custom inlined
316: code is used.
317: */
319: #include <../src/mat/ftn-kernels/sgemv.h>
321: /*
322: A = A * B A_gets_A_times_B
324: A, B - square bs by bs arrays stored in column major order
325: W - square bs by bs work array
327: */
328: #define Kernel_A_gets_A_times_B(bs,A,B,W) \
329: { \
330: PetscErrorCode _PetscMemcpy((W),(A),(bs)*(bs)*sizeof(MatScalar));CHKERRQ(_ierr); \
331: msgemmi_(&bs,A,B,W); \
332: }
334: /*
336: A = A - B * C A_gets_A_minus_B_times_C
338: A, B, C - square bs by bs arrays stored in column major order
339: */
340: #define Kernel_A_gets_A_minus_B_times_C(bs,A,B,C) \
341: { \
342: msgemm_(&bs,A,B,C); \
343: }
345: /*
346: v = v - A w v_gets_v_minus_A_times_w
348: v - array of length bs
349: A - square bs by bs array
350: w - array of length bs
351: */
352: #define Kernel_v_gets_v_minus_A_times_w(bs,v,A,w) \
353: { \
354: msgemvm_(&bs,&bs,A,w,v); \
355: }
357: /*
358: v = v + A w v_gets_v_plus_A_times_w
360: v - array of length bs
361: A - square bs by bs array
362: w - array of length bs
363: */
364: #define Kernel_v_gets_v_plus_A_times_w(bs,v,A,w) \
365: { \
366: msgemvp_(&bs,&bs,A,w,v);\
367: }
369: /*
370: v = v + A w v_gets_v_plus_Ar_times_w
372: v - array of length bs
373: A - square bs by bs array
374: w - array of length bs
375: */
376: #define Kernel_w_gets_w_plus_Ar_times_v(bs,ncol,v,A,w) \
377: { \
378: msgemvp_(&bs,&ncol,A,v,w);\
379: }
381: /*
382: w = A v w_gets_A_times_v
384: v - array of length bs
385: A - square bs by bs array
386: w - array of length bs
387: */
388: #define Kernel_w_gets_A_times_v(bs,v,A,w) \
389: { \
390: msgemv_(&bs,&bs,A,v,w);\
391: }
392:
393: /*
394: z = A*x
395: */
396: #define Kernel_w_gets_Ar_times_v(bs,ncols,x,A,z) \
397: { \
398: msgemv_(&bs,&ncols,A,x,z);\
399: }
401: /*
402: z = trans(A)*x
403: */
404: #define Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) \
405: { \
406: msgemvt_(&bs,&ncols,A,x,z);\
407: }
409: /* These do not work yet */
410: #define Kernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C)
411: #define Kernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w)
414: #endif
416: #endif