Actual source code: vecpthread.c
2: /*
3: Implements the sequential pthread based vectors.
4: */
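/*
   Usage sketch (added illustration, not part of the original source): the
   VECSEQPTHREAD type registered at the bottom of this file is selected with
   VecSetType(), after which the standard Vec operations dispatch to the
   threaded kernels below.  Variable names here are illustrative only and
   error checking is omitted.

     Vec         x,y;
     PetscScalar dot;
     VecCreate(PETSC_COMM_SELF,&x);
     VecSetSizes(x,PETSC_DECIDE,100);
     VecSetType(x,VECSEQPTHREAD);   // ops table filled in VecCreate_SeqPThread()
     VecDuplicate(x,&y);
     VecSet(x,1.0);
     VecSet(y,2.0);
     VecDot(x,y,&dot);              // calls VecDot_SeqPThread() -> VecDot_Kernel()
     VecDestroy(&x);
     VecDestroy(&y);
*/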
5: #ifndef _GNU_SOURCE
6: #define _GNU_SOURCE
7: #endif
8: #include <sched.h>
9: #include <petscconf.h>
10: #include <private/vecimpl.h> /*I "petscvec.h" I*/
11: #include <../src/vec/vec/impls/dvecimpl.h>
12: #include <petscblaslapack.h>
13: #include <private/petscaxpy.h>
14: #include <pthread.h>
15: #include <unistd.h>
23: void* PetscThreadRun(MPI_Comm Comm,void* (*pFunc)(void*),int,pthread_t*,void**);
24: void* PetscThreadStop(MPI_Comm Comm,int,pthread_t*);
25: void* DoCoreAffinity(void);
27: typedef struct {
28: const PetscScalar *x,*y;
29: PetscInt n;
30: PetscScalar result;
31: } VecDot_KernelData;
33: typedef struct {
34: PetscScalar *x;
35: PetscScalar alpha;
36: PetscInt n;
37: } VecScale_KernelData;
39: typedef struct {
40: PetscScalar *y;
41: const PetscScalar *x;
42: PetscScalar alpha;
43: PetscInt n;
44: } VecAXPY_KernelData;
46: typedef struct {
47: PetscScalar *yy;
48: const PetscScalar *xx;
49: PetscScalar alpha;
50: PetscInt n;
51: } VecAYPX_KernelData;
53: typedef struct {
54: PetscScalar *ww;
55: const PetscScalar *yy;
56: const PetscScalar *xx;
57: PetscScalar alpha;
58: PetscInt n;
59: } VecWAXPY_KernelData;
61: typedef struct {
62: const PetscScalar *x;
63: NormType typeUse;
64: PetscInt n;
65: PetscScalar result;
66: } VecNorm_KernelData;
68: typedef struct {
69: const PetscScalar* xvalin;
70: Vec* yavecin;
71: PetscInt nelem;
72: PetscInt ntoproc;
73: PetscScalar* result;
74: } VecMDot_KernelData;
76: typedef struct {
77: const PetscScalar *x;
78: PetscInt gind;
79: PetscInt localn;
80: PetscInt localind;
81: PetscReal localmax;
82: } VecMax_KernelData;
84: typedef struct {
85: const PetscScalar *x;
86: PetscInt gind;
87: PetscInt localn;
88: PetscInt localind;
89: PetscReal localmin;
90: } VecMin_KernelData;
92: typedef struct {
93: PetscScalar *wpin,*xpin,*ypin;
94: PetscInt nlocal;
95: } VecPointwiseMult_KernelData;
97: typedef struct {
98: PetscScalar *wpin,*xpin,*ypin;
99: PetscInt nlocal;
100: } VecPointwiseDivide_KernelData;
102: typedef struct {
103: PetscScalar *xpin,*ypin;
104: PetscInt nlocal;
105: } VecSwap_KernelData;
107: typedef struct {
108: PetscScalar *xpin;
109: PetscRandom rin;
110: PetscInt nlocal;
111: } VecSetRandom_KernelData;
113: typedef struct {
114: const PetscScalar *xpin;
115: PetscScalar *ypin;
116: PetscInt nlocal;
117: } VecCopy_KernelData;
119: typedef struct {
120: PetscScalar* xavalin; //vector out
121: Vec* yavecin; //array of data vectors
122: const PetscScalar* amult; //multipliers
123: PetscInt nelem; //number of elements in vector to process
124: PetscInt ntoproc; //number of data vectors
125: PetscInt ibase; //used to properly index into other vectors
126: } VecMAXPY_KernelData;
128: typedef struct {
129: PetscScalar *xpin;
130: PetscScalar alphain;
131: PetscInt nelem;
132: } VecSet_KernelData;
134: void* PetscThreadRun(MPI_Comm Comm,void* (*funcp)(void*),int iTotThreads,pthread_t* ThreadId,void** data) {
135: PetscInt ierr;
136: int i;
137: for(i=0; i<iTotThreads; i++) {
138: pthread_create(&ThreadId[i],NULL,funcp,data[i]);
139: }
140: return NULL;
141: }
143: void* PetscThreadStop(MPI_Comm Comm,int iTotThreads,pthread_t* ThreadId) {
144: int i;
145: void* joinstatus;
146: for (i=0; i<iTotThreads; i++) {
147: pthread_join(ThreadId[i], &joinstatus);
148: }
149: return NULL;
150: }
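/*
   Note (added): PetscThreadRun()/PetscThreadStop() implement a plain fork/join
   scheme: one pthread is created per chunk of work and joined when done.  They
   are presumably what MainJob() (defined elsewhere) falls back on when the
   thread pool is disabled.  A minimal sketch of the calling pattern, with a
   hypothetical kernel func and per-thread argument array pdata:

     pthread_t threads[PetscMaxThreads];
     PetscThreadRun(PETSC_COMM_WORLD,func,PetscMaxThreads,threads,(void**)pdata);
     PetscThreadStop(PETSC_COMM_WORLD,PetscMaxThreads,threads);  // join all workers
*/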
152: /* Redefine CHKERRQP so it can be used inside the thread kernels, which return void* rather than PetscErrorCode */
153: #undef CHKERRQP
154: #define CHKERRQP(ierr) if (ierr) return (void*)(long int)ierr
156: void* VecDot_Kernel(void *arg)
157: {
158: if(PetscUseThreadPool==PETSC_FALSE) {
159: DoCoreAffinity();
160: }
161: VecDot_KernelData *data = (VecDot_KernelData*)arg;
162: const PetscScalar *x, *y;
163: PetscBLASInt one = 1, bn;
164: PetscInt n;
166: x = data->x;
167: y = data->y;
168: n = data->n;
169: bn = PetscBLASIntCast(n);
170: data->result = BLASdot_(&bn,x,&one,y,&one);
171: return(0);
172: }
176: PetscErrorCode VecDot_SeqPThread(Vec xin,Vec yin,PetscScalar *z)
177: {
178: const PetscScalar *ya,*xa;
179: PetscErrorCode ierr;
180: PetscInt i, iIndex = 0;
181: const PetscInt iNumThreads = PetscMaxThreads; //this number could be different
182: PetscInt Q = xin->map->n/(iNumThreads);
183: PetscInt R = xin->map->n-Q*(iNumThreads);
184: PetscBool S;
187: VecGetArrayRead(xin,&xa);
188: VecGetArrayRead(yin,&ya);
190: VecDot_KernelData* kerneldatap = (VecDot_KernelData*)malloc(iNumThreads*sizeof(VecDot_KernelData));
191: VecDot_KernelData** pdata = (VecDot_KernelData**)malloc(iNumThreads*sizeof(VecDot_KernelData*));
193: for (i=0; i<iNumThreads; i++) {
194: S = (PetscBool)(i<R);
195: kerneldatap[i].x = &xa[iIndex];
196: kerneldatap[i].y = &ya[iIndex];
197: kerneldatap[i].n = S?Q+1:Q;
198: iIndex += kerneldatap[i].n;
199: pdata[i] = &kerneldatap[i];
200: }
202: MainJob(VecDot_Kernel,(void**)pdata,iNumThreads);
204: //gather result
205: *z = 0.0;
206: for(i=0; i<iNumThreads; i++) {
207: *z += kerneldatap[i].result;
208: }
209: free(kerneldatap);
210: free(pdata);
212: VecRestoreArrayRead(xin,&xa);
213: VecRestoreArrayRead(yin,&ya);
215: if (xin->map->n > 0) {
216: PetscLogFlops(2.0*xin->map->n-1);
217: }
218: PetscFunctionReturn(0);
219: }
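/*
   Work partitioning (added note): every front end in this file splits the
   local length n into PetscMaxThreads contiguous chunks using
   Q = n/PetscMaxThreads and R = n - Q*PetscMaxThreads; the first R threads
   receive Q+1 entries and the remaining threads receive Q.  For example,
   n = 10 with PetscMaxThreads = 4 gives Q = 2, R = 2 and chunk sizes
   3, 3, 2, 2.
*/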
222: void* VecScale_Kernel(void *arg)
223: {
224: if(PetscUseThreadPool==PETSC_FALSE) {
225: DoCoreAffinity();
226: }
227: VecScale_KernelData *data = (VecScale_KernelData*)arg;
228: PetscScalar a,*x;
229: PetscBLASInt one = 1, bn;
230: PetscInt n;
232: x = data->x;
233: a = data->alpha;
234: n = data->n;
235: bn = PetscBLASIntCast(n);
236: BLASscal_(&bn,&a,x,&one);
237: return(0);
238: }
242: PetscErrorCode VecScale_SeqPThread(Vec xin, PetscScalar alpha)
243: {
248: if (alpha == 0.0) {
249: VecSet_Seq(xin,alpha);
250: } else if (alpha != 1.0) {
251: PetscScalar a = alpha,*xarray,*xp;
252: const PetscInt iNumThreads = PetscMaxThreads; //this number could be different
253: PetscInt i,Q = xin->map->n/(iNumThreads);
254: PetscInt R = xin->map->n-Q*(iNumThreads);
255: PetscBool S;
256: VecScale_KernelData* kerneldatap = (VecScale_KernelData*)malloc(iNumThreads*sizeof(VecScale_KernelData));
257: VecScale_KernelData** pdata = (VecScale_KernelData**)malloc(iNumThreads*sizeof(VecScale_KernelData*));
258: VecGetArray(xin,&xarray); //get address of first element in data array
259: xp = xarray;
260: for (i=0; i<iNumThreads; i++) {
261: S = (PetscBool)(i<R);
262: kerneldatap[i].x = xp;
263: kerneldatap[i].alpha = a;
264: kerneldatap[i].n = S?Q+1:Q;
265: xp += kerneldatap[i].n; //pointer arithmetic
266: pdata[i] = &kerneldatap[i];
267: }
268: MainJob(VecScale_Kernel,(void**)pdata,iNumThreads);
269: free(kerneldatap);
270: free(pdata);
271: VecRestoreArray(xin,&xarray);
272: }
273: PetscLogFlops(xin->map->n);
274: PetscFunctionReturn(0);
275: }
277: void* VecAXPY_Kernel(void *arg)
278: {
279: if(PetscUseThreadPool==PETSC_FALSE) {
280: DoCoreAffinity();
281: }
282: VecAXPY_KernelData *data = (VecAXPY_KernelData*)arg;
283: PetscScalar a,*y;
284: const PetscScalar *x;
285: PetscBLASInt one = 1, bn;
286: PetscInt n;
288: x = data->x;
289: y = data->y;
290: a = data->alpha;
291: n = data->n;
292: bn = PetscBLASIntCast(n);
293: BLASaxpy_(&bn,&a,x,&one,y,&one);
294: return(0);
295: }
299: PetscErrorCode VecAXPY_SeqPThread(Vec yin,PetscScalar alpha,Vec xin)
300: {
301: PetscErrorCode ierr;
302: const PetscScalar *xarray,*xp;
303: PetscScalar a=alpha,*yarray,*yp;
304: const PetscInt iNumThreads = PetscMaxThreads; //this number could be different
305: PetscInt i,Q = xin->map->n/(iNumThreads);
306: PetscInt R = xin->map->n-Q*(iNumThreads);
307: PetscBool S;
308: VecAXPY_KernelData* kerneldatap = (VecAXPY_KernelData*)malloc(iNumThreads*sizeof(VecAXPY_KernelData));
309: VecAXPY_KernelData** pdata = (VecAXPY_KernelData**)malloc(iNumThreads*sizeof(VecAXPY_KernelData*));
312: /* assume that the BLAS handles alpha == 1.0 efficiently since we have no fast code for it */
313: if (alpha != 0.0) {
314: VecGetArrayRead(xin,&xarray);
315: VecGetArray(yin,&yarray);
316: xp = xarray;
317: yp = yarray;
318: for (i=0; i<iNumThreads; i++) {
319: S = (PetscBool)(i<R);
320: kerneldatap[i].x = xp;
321: kerneldatap[i].y = yp;
322: kerneldatap[i].alpha = a;
323: kerneldatap[i].n = S?Q+1:Q;
324: xp += kerneldatap[i].n; //pointer arithmetic
325: yp += kerneldatap[i].n;
326: pdata[i] = &kerneldatap[i];
327: }
328: MainJob(VecAXPY_Kernel,(void**)pdata,iNumThreads);
329: VecRestoreArrayRead(xin,&xarray);
330: VecRestoreArray(yin,&yarray);
331: PetscLogFlops(2.0*yin->map->n);
332: }
333: free(kerneldatap);
334: free(pdata);
335: return(0);
336: }
338: void* VecAYPX_Kernel(void *arg)
339: {
340: if(PetscUseThreadPool==PETSC_FALSE) {
341: DoCoreAffinity();
342: }
343: VecAYPX_KernelData *data = (VecAYPX_KernelData*)arg;
344: PetscScalar a,*yy;
345: const PetscScalar *xx;
346: PetscInt i,n;
348: xx = data->xx;
349: yy = data->yy;
350: a = data->alpha;
351: n = data->n;
352: if(a==-1.0) {
353: for (i=0; i<n; i++) {
354: yy[i] = xx[i] - yy[i];
355: }
356: }
357: else {
358: for (i=0; i<n; i++) {
359: yy[i] = xx[i] + a*yy[i];
360: }
361: }
362: return(0);
363: }
367: PetscErrorCode VecAYPX_SeqPThread(Vec yin,PetscScalar alpha,Vec xin)
368: {
369: PetscErrorCode ierr;
373: if (alpha == 0.0) {
374: VecCopy(xin,yin);
375: }
376: else if (alpha == 1.0) {
377: VecAXPY_SeqPThread(yin,alpha,xin);
378: }
379: else {
380: PetscInt n = yin->map->n;
381: PetscScalar *yy;
382: const PetscScalar *xx;
383: VecGetArrayRead(xin,&xx);
384: VecGetArray(yin,&yy);
385: #if defined(PETSC_USE_FORTRAN_KERNEL_AYPX)
386: {
387: PetscScalar oalpha = alpha;
388: fortranaypx_(&n,&oalpha,xx,yy);
389: }
390: #else
391: {
392: const PetscScalar *xp = xx;
393: PetscScalar a=alpha,*yp = yy;
394: const PetscInt iNumThreads = PetscMaxThreads; //this number could be different
395: PetscInt i,Q = xin->map->n/(iNumThreads);
396: PetscInt R = xin->map->n-Q*(iNumThreads);
397: PetscBool S;
398: VecAYPX_KernelData* kerneldatap = (VecAYPX_KernelData*)malloc(iNumThreads*sizeof(VecAYPX_KernelData));
399: VecAYPX_KernelData** pdata = (VecAYPX_KernelData**)malloc(iNumThreads*sizeof(VecAYPX_KernelData*));
401: for (i=0; i<iNumThreads; i++) {
402: S = (PetscBool)(i<R);
403: kerneldatap[i].xx = xp;
404: kerneldatap[i].yy = yp;
405: kerneldatap[i].alpha = a;
406: kerneldatap[i].n = S?Q+1:Q;
407: xp += kerneldatap[i].n; //pointer arithmetic
408: yp += kerneldatap[i].n;
409: pdata[i] = &kerneldatap[i];
410: }
411: MainJob(VecAYPX_Kernel,(void**)pdata,iNumThreads);
412: free(kerneldatap);
413: free(pdata);
414: }
415: #endif
416: VecRestoreArrayRead(xin,&xx);
417: VecRestoreArray(yin,&yy);
418: if(alpha==-1.0) {
419: PetscLogFlops(1.0*n);
420: }
421: else {
422: PetscLogFlops(2.0*n);
423: }
424: }
425: return(0);
426: }
428: void* VecWAXPY_Kernel(void *arg)
429: {
430: if(PetscUseThreadPool==PETSC_FALSE) {
431: DoCoreAffinity();
432: }
433: VecWAXPY_KernelData *data = (VecWAXPY_KernelData*)arg;
434: PetscScalar a,*ww;
435: const PetscScalar *xx,*yy;
436: PetscInt i,n;
438: ww = data->ww;
439: xx = data->xx;
440: yy = data->yy;
441: a = data->alpha;
442: n = data->n;
443: if(a==-1.0) {
444: for (i=0; i<n; i++) {
445: ww[i] = yy[i] - xx[i];
446: }
447: }
448: else if(a==1.0) {
449: for (i=0; i<n; i++) {
450: ww[i] = yy[i] + xx[i];
451: }
452: }
453: else {
454: for (i=0; i<n; i++) {
455: ww[i] = a*xx[i] + yy[i];
456: }
457: }
458: return(0);
459: }
463: PetscErrorCode VecWAXPY_SeqPThread(Vec win, PetscScalar alpha,Vec xin,Vec yin)
464: {
465: PetscErrorCode ierr;
466: PetscInt n = win->map->n;
467: PetscScalar *ww;
468: const PetscScalar *yy,*xx;
472: VecGetArrayRead(xin,&xx);
473: VecGetArrayRead(yin,&yy);
474: VecGetArray(win,&ww);
475: if (alpha == 0.0) {
476: PetscMemcpy(ww,yy,n*sizeof(PetscScalar));
477: }
478: else {
479: #if defined(PETSC_USE_FORTRAN_KERNEL_WAXPY)
480: PetscScalar oalpha = alpha;
481: fortranwaxpy_(&n,&oalpha,xx,yy,ww);
482: #else
483: const PetscScalar *xp = xx,*yp = yy;
484: PetscScalar a=alpha,*wp = ww;
485: const PetscInt iNumThreads = PetscMaxThreads; //this number could be different
486: PetscInt i,Q = n/(iNumThreads);
487: PetscInt R = n-Q*(iNumThreads);
488: PetscBool S;
489: VecWAXPY_KernelData* kerneldatap = (VecWAXPY_KernelData*)malloc(iNumThreads*sizeof(VecWAXPY_KernelData));
490: VecWAXPY_KernelData** pdata = (VecWAXPY_KernelData**)malloc(iNumThreads*sizeof(VecWAXPY_KernelData*));
492: for (i=0; i<iNumThreads; i++) {
493: S = (PetscBool)(i<R);
494: kerneldatap[i].ww = wp;
495: kerneldatap[i].xx = xp;
496: kerneldatap[i].yy = yp;
497: kerneldatap[i].alpha = a;
498: kerneldatap[i].n = S?Q+1:Q;
499: wp += kerneldatap[i].n; //pointer arithmetic
500: xp += kerneldatap[i].n;
501: yp += kerneldatap[i].n;
502: pdata[i] = &kerneldatap[i];
503: }
504: MainJob(VecWAXPY_Kernel,(void**)pdata,iNumThreads);
505: free(kerneldatap);
506: free(pdata);
507: #endif
508: if (alpha == 1.0 || alpha == -1.0) {
509: PetscLogFlops(1.0*n);
510: }
511: else {
512: PetscLogFlops(2.0*n);
513: }
514: }
515: VecRestoreArrayRead(xin,&xx);
516: VecRestoreArrayRead(yin,&yy);
517: VecRestoreArray(win,&ww);
518: return(0);
519: }
521: void* VecNorm_Kernel(void *arg)
522: {
523: if(PetscUseThreadPool==PETSC_FALSE) {
524: DoCoreAffinity();
525: }
526: VecNorm_KernelData *data = (VecNorm_KernelData*)arg;
527: const PetscScalar *x;
528: NormType type;
529: PetscInt i,n;
531: x = data->x;
532: type = data->typeUse;
533: n = data->n;
534: data->result = 0.0;
535: if(type==NORM_1) {
536: PetscBLASInt one = 1, bn = PetscBLASIntCast(n);
537: data->result = BLASasum_(&bn,x,&one);
538: }
539: else if(type==NORM_INFINITY) {
540: PetscReal maxv = 0.0,tmp;
541: for(i=0; i<n; i++) {
542: tmp = PetscAbsScalar(x[i]);
543: if(tmp>maxv) {
544: maxv = tmp;
545: }
546: }
547: data->result = maxv;
548: }
549: else {
550: PetscBLASInt one = 1, bn = PetscBLASIntCast(n);
551: data->result = BLASnrm2_(&bn,x,&one);
552: }
553: return(0);
554: }
558: PetscErrorCode VecNorm_SeqPThread(Vec xin,NormType type,PetscReal* z)
559: {
560: const PetscScalar *xx;
561: PetscErrorCode ierr;
562: PetscInt n = xin->map->n;
565: if(type == NORM_1_AND_2) {
566: VecNorm_SeqPThread(xin,NORM_1,z);
567: VecNorm_SeqPThread(xin,NORM_2,z+1);
568: }
569: else {
570: VecGetArrayRead(xin,&xx);
571: const PetscScalar *xp = xx;
572: const PetscInt iNumThreads = PetscMaxThreads; //this number could be different
573: PetscInt i,Q = n/(iNumThreads);
574: PetscInt R = n-Q*(iNumThreads);
575: PetscBool S;
576: VecNorm_KernelData* kerneldatap = (VecNorm_KernelData*)malloc(iNumThreads*sizeof(VecNorm_KernelData));
577: VecNorm_KernelData** pdata = (VecNorm_KernelData**)malloc(iNumThreads*sizeof(VecNorm_KernelData*));
579: for (i=0; i<iNumThreads; i++) {
580: S = (PetscBool)(i<R);
581: kerneldatap[i].x = xp;
582: kerneldatap[i].typeUse = type;
583: kerneldatap[i].n = S?Q+1:Q;
584: xp += kerneldatap[i].n; //pointer arithmetic
585: pdata[i] = &kerneldatap[i];
586: }
587: MainJob(VecNorm_Kernel,(void**)pdata,iNumThreads);
588: //collect results
589: *z = 0.0;
590: if(type == NORM_1) {
591: for(i=0; i<iNumThreads; i++) {
592: *z += kerneldatap[i].result;
593: }
594: PetscLogFlops(PetscMax(n-1.0,0.0));
595: }
596: else if(type == NORM_2 || type == NORM_FROBENIUS) {
597: for(i=0; i<iNumThreads; i++) {
598: *z += kerneldatap[i].result*kerneldatap[i].result;
599: }
600: *z = sqrt(*z);
601: PetscLogFlops(PetscMax(2.0*n-1,0.0));
602: }
603: else {
604: PetscReal maxv = 0.0,tmp;
605: for(i=0; i<iNumThreads; i++) {
606: tmp = kerneldatap[i].result;
607: if(tmp>maxv) {
608: maxv = tmp;
609: }
610: }
611: *z = maxv;
612: }
613: free(kerneldatap);
614: free(pdata);
615: VecRestoreArrayRead(xin,&xx);
616: }
617: return(0);
618: }
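/*
   Reduction used above (added note): each thread returns the requested norm
   of its own chunk, and the partial results r_i are combined as
     NORM_1:         ||x||_1   = sum_i r_i
     NORM_2:         ||x||_2   = sqrt(sum_i r_i^2)
     NORM_INFINITY:  ||x||_inf = max_i r_i
*/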
620: void* VecMDot_Kernel(void *arg)
621: {
622: if(PetscUseThreadPool==PETSC_FALSE) {
623: DoCoreAffinity();
624: }
625: VecMDot_KernelData *data = (VecMDot_KernelData*)arg;
626: const PetscScalar *xbase = data->xvalin;
627: Vec* yin = data->yavecin;
628: PetscInt n = data->nelem;
629: PetscInt nv = data->ntoproc;
630: PetscScalar* z = data->result;
631: PetscErrorCode ierr;
632: PetscInt i,j,nv_rem,j_rem;
633: PetscScalar sum0,sum1,sum2,sum3,x0,x1,x2,x3;
634: const PetscScalar *yy0,*yy1,*yy2,*yy3,*x;
635: Vec *yy;
637: sum0 = 0.;
638: sum1 = 0.;
639: sum2 = 0.;
640: i = nv;
641: nv_rem = nv&0x3;
642: yy = yin;
643: j = n;
644: x = xbase;
646: switch (nv_rem) {
647: case 3:
648: ierr = VecGetArrayRead(yy[0],&yy0);CHKERRQP(ierr);
649: ierr = VecGetArrayRead(yy[1],&yy1);CHKERRQP(ierr);
650: ierr = VecGetArrayRead(yy[2],&yy2);CHKERRQP(ierr);
651: switch (j_rem=j&0x3) {
652: case 3:
653: x2 = x[2];
654: sum0 += x2*PetscConj(yy0[2]); sum1 += x2*PetscConj(yy1[2]);
655: sum2 += x2*PetscConj(yy2[2]);
656: case 2:
657: x1 = x[1];
658: sum0 += x1*PetscConj(yy0[1]); sum1 += x1*PetscConj(yy1[1]);
659: sum2 += x1*PetscConj(yy2[1]);
660: case 1:
661: x0 = x[0];
662: sum0 += x0*PetscConj(yy0[0]); sum1 += x0*PetscConj(yy1[0]);
663: sum2 += x0*PetscConj(yy2[0]);
664: case 0:
665: x += j_rem;
666: yy0 += j_rem;
667: yy1 += j_rem;
668: yy2 += j_rem;
669: j -= j_rem;
670: break;
671: }
672: while (j>0) {
673: x0 = x[0];
674: x1 = x[1];
675: x2 = x[2];
676: x3 = x[3];
677: x += 4;
678:
679: sum0 += x0*PetscConj(yy0[0]) + x1*PetscConj(yy0[1]) + x2*PetscConj(yy0[2]) + x3*PetscConj(yy0[3]); yy0+=4;
680: sum1 += x0*PetscConj(yy1[0]) + x1*PetscConj(yy1[1]) + x2*PetscConj(yy1[2]) + x3*PetscConj(yy1[3]); yy1+=4;
681: sum2 += x0*PetscConj(yy2[0]) + x1*PetscConj(yy2[1]) + x2*PetscConj(yy2[2]) + x3*PetscConj(yy2[3]); yy2+=4;
682: j -= 4;
683: }
684: z[0] = sum0;
685: z[1] = sum1;
686: z[2] = sum2;
687: ierr = VecRestoreArrayRead(yy[0],&yy0);CHKERRQP(ierr);
688: ierr = VecRestoreArrayRead(yy[1],&yy1);CHKERRQP(ierr);
689: ierr = VecRestoreArrayRead(yy[2],&yy2);CHKERRQP(ierr);
690: break;
691: case 2:
692: ierr = VecGetArrayRead(yy[0],&yy0);CHKERRQP(ierr);
693: ierr = VecGetArrayRead(yy[1],&yy1);CHKERRQP(ierr);
694: switch (j_rem=j&0x3) {
695: case 3:
696: x2 = x[2];
697: sum0 += x2*PetscConj(yy0[2]); sum1 += x2*PetscConj(yy1[2]);
698: case 2:
699: x1 = x[1];
700: sum0 += x1*PetscConj(yy0[1]); sum1 += x1*PetscConj(yy1[1]);
701: case 1:
702: x0 = x[0];
703: sum0 += x0*PetscConj(yy0[0]); sum1 += x0*PetscConj(yy1[0]);
704: case 0:
705: x += j_rem;
706: yy0 += j_rem;
707: yy1 += j_rem;
708: j -= j_rem;
709: break;
710: }
711: while (j>0) {
712: x0 = x[0];
713: x1 = x[1];
714: x2 = x[2];
715: x3 = x[3];
716: x += 4;
717:
718: sum0 += x0*PetscConj(yy0[0]) + x1*PetscConj(yy0[1]) + x2*PetscConj(yy0[2]) + x3*PetscConj(yy0[3]); yy0+=4;
719: sum1 += x0*PetscConj(yy1[0]) + x1*PetscConj(yy1[1]) + x2*PetscConj(yy1[2]) + x3*PetscConj(yy1[3]); yy1+=4;
720: j -= 4;
721: }
722: z[0] = sum0;
723: z[1] = sum1;
724:
725: ierr = VecRestoreArrayRead(yy[0],&yy0);CHKERRQP(ierr);
726: ierr = VecRestoreArrayRead(yy[1],&yy1);CHKERRQP(ierr);
727: break;
728: case 1:
729: ierr = VecGetArrayRead(yy[0],&yy0);CHKERRQP(ierr);
730: switch (j_rem=j&0x3) {
731: case 3:
732: x2 = x[2]; sum0 += x2*PetscConj(yy0[2]);
733: case 2:
734: x1 = x[1]; sum0 += x1*PetscConj(yy0[1]);
735: case 1:
736: x0 = x[0]; sum0 += x0*PetscConj(yy0[0]);
737: case 0:
738: x += j_rem;
739: yy0 += j_rem;
740: j -= j_rem;
741: break;
742: }
743: while (j>0) {
744: sum0 += x[0]*PetscConj(yy0[0]) + x[1]*PetscConj(yy0[1])
745: + x[2]*PetscConj(yy0[2]) + x[3]*PetscConj(yy0[3]);
746: yy0+=4;
747: j -= 4; x+=4;
748: }
749: z[0] = sum0;
751: ierr = VecRestoreArrayRead(yy[0],&yy0);CHKERRQP(ierr);
752: break;
753: case 0:
754: break;
755: }
756: z += nv_rem;
757: i -= nv_rem;
758: yy += nv_rem;
760: while (i >0) {
761: sum0 = 0.;
762: sum1 = 0.;
763: sum2 = 0.;
764: sum3 = 0.;
765: ierr = VecGetArrayRead(yy[0],&yy0);CHKERRQP(ierr);
766: ierr = VecGetArrayRead(yy[1],&yy1);CHKERRQP(ierr);
767: ierr = VecGetArrayRead(yy[2],&yy2);CHKERRQP(ierr);
768: ierr = VecGetArrayRead(yy[3],&yy3);CHKERRQP(ierr);
770: j = n;
771: x = xbase;
772: switch (j_rem=j&0x3) {
773: case 3:
774: x2 = x[2];
775: sum0 += x2*PetscConj(yy0[2]); sum1 += x2*PetscConj(yy1[2]);
776: sum2 += x2*PetscConj(yy2[2]); sum3 += x2*PetscConj(yy3[2]);
777: case 2:
778: x1 = x[1];
779: sum0 += x1*PetscConj(yy0[1]); sum1 += x1*PetscConj(yy1[1]);
780: sum2 += x1*PetscConj(yy2[1]); sum3 += x1*PetscConj(yy3[1]);
781: case 1:
782: x0 = x[0];
783: sum0 += x0*PetscConj(yy0[0]); sum1 += x0*PetscConj(yy1[0]);
784: sum2 += x0*PetscConj(yy2[0]); sum3 += x0*PetscConj(yy3[0]);
785: case 0:
786: x += j_rem;
787: yy0 += j_rem;
788: yy1 += j_rem;
789: yy2 += j_rem;
790: yy3 += j_rem;
791: j -= j_rem;
792: break;
793: }
794: while (j>0) {
795: x0 = x[0];
796: x1 = x[1];
797: x2 = x[2];
798: x3 = x[3];
799: x += 4;
801: sum0 += x0*PetscConj(yy0[0]) + x1*PetscConj(yy0[1]) + x2*PetscConj(yy0[2]) + x3*PetscConj(yy0[3]); yy0+=4;
802: sum1 += x0*PetscConj(yy1[0]) + x1*PetscConj(yy1[1]) + x2*PetscConj(yy1[2]) + x3*PetscConj(yy1[3]); yy1+=4;
803: sum2 += x0*PetscConj(yy2[0]) + x1*PetscConj(yy2[1]) + x2*PetscConj(yy2[2]) + x3*PetscConj(yy2[3]); yy2+=4;
804: sum3 += x0*PetscConj(yy3[0]) + x1*PetscConj(yy3[1]) + x2*PetscConj(yy3[2]) + x3*PetscConj(yy3[3]); yy3+=4;
805: j -= 4;
806: }
807: z[0] = sum0;
808: z[1] = sum1;
809: z[2] = sum2;
810: z[3] = sum3;
811: z += 4;
812: i -= 4;
813: ierr = VecRestoreArrayRead(yy[0],&yy0);CHKERRQP(ierr);
814: ierr = VecRestoreArrayRead(yy[1],&yy1);CHKERRQP(ierr);
815: ierr = VecRestoreArrayRead(yy[2],&yy2);CHKERRQP(ierr);
816: ierr = VecRestoreArrayRead(yy[3],&yy3);CHKERRQP(ierr);
817: yy += 4;
818: }
819: return(0);
820: }
824: PetscErrorCode VecMDot_SeqPThread(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
825: {
826: PetscErrorCode ierr;
827: PetscInt i,j=0;
828: Vec *yy = (Vec *)yin;
829: const PetscScalar *xbase;
832: const PetscInt iNumThreads = PetscMaxThreads; //this number could be different
833: PetscInt n=xin->map->n,Q = nv/(iNumThreads);
834: PetscInt R = nv-Q*(iNumThreads);
835: PetscBool S;
836: VecMDot_KernelData* kerneldatap = (VecMDot_KernelData*)malloc(iNumThreads*sizeof(VecMDot_KernelData));
837: VecMDot_KernelData** pdata = (VecMDot_KernelData**)malloc(iNumThreads*sizeof(VecMDot_KernelData*));
838: VecGetArrayRead(xin,&xbase);
839: for (i=0; i<iNumThreads; i++) {
840: S = (PetscBool)(i<R);
841: kerneldatap[i].xvalin = xbase;
842: kerneldatap[i].yavecin = &yy[j];
843: kerneldatap[i].nelem = n;
844: kerneldatap[i].ntoproc = S?Q+1:Q;
845: kerneldatap[i].result = &z[j];
846: j += kerneldatap[i].ntoproc;
847: pdata[i] = &kerneldatap[i];
848: }
849: MainJob(VecMDot_Kernel,(void**)pdata,iNumThreads);
850: free(kerneldatap);
851: free(pdata);
852: VecRestoreArrayRead(xin,&xbase);
853: PetscLogFlops(PetscMax(nv*(2.0*xin->map->n-1),0.0));
854: return(0);
855: }
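/*
   Partitioning note (added): unlike the entry-wise kernels, VecMDot_SeqPThread
   splits the work over the nv input vectors rather than over the entries of
   xin; each thread runs VecMDot_Kernel over all n entries of its subset of the
   yin[] vectors.  For example, nv = 10 vectors with PetscMaxThreads = 4 gives
   the threads 3, 3, 2 and 2 vectors each.
*/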
857: void* VecMax_Kernel(void *arg)
858: {
859: if(PetscUseThreadPool==PETSC_FALSE) {
860: DoCoreAffinity();
861: }
862: VecMax_KernelData *data = (VecMax_KernelData*)arg;
863: const PetscScalar *xx = data->x;
864: PetscInt i,j,n = data->localn;
865: PetscReal lmax,tmp;
866: if (!n) { data->localmax = PETSC_MIN_REAL; data->localind = -1; return(0); }
867: #if defined(PETSC_USE_COMPLEX)
868: lmax = PetscRealPart(*xx++); j = 0;
869: #else
870: lmax = *xx++; j = 0;
871: #endif
872: for (i=1; i<n; i++) {
873: #if defined(PETSC_USE_COMPLEX)
874: if ((tmp = PetscRealPart(*xx++)) > lmax) { j = i; lmax = tmp;}
875: #else
876: if ((tmp = *xx++) > lmax) { j = i; lmax = tmp; }
877: #endif
878: }
880: data->localmax = lmax;
881: data->localind = j;
882: return(0);
883: }
887: PetscErrorCode VecMax_SeqPThread(Vec xin,PetscInt* idx,PetscReal * z)
888: {
889: PetscInt i,j=0,n = xin->map->n;
890: PetscReal max;
891: const PetscScalar *xx,*xp;
892: PetscErrorCode ierr;
895: VecGetArrayRead(xin,&xx);
896: if (!n) {
897: max = PETSC_MIN_REAL;
898: j = -1;
899: } else {
900: const PetscInt iNumThreads = PetscMaxThreads; //this number could be different
901: PetscInt gind,Q = n/(iNumThreads);
902: PetscInt R = n-Q*(iNumThreads);
903: PetscBool S;
904: VecMax_KernelData* kerneldatap = (VecMax_KernelData*)malloc(iNumThreads*sizeof(VecMax_KernelData));
905: VecMax_KernelData** pdata = (VecMax_KernelData**)malloc(iNumThreads*sizeof(VecMax_KernelData*));
907: gind = 0;
908: xp = xx;
909: for (i=0; i<iNumThreads; i++) {
910: S = (PetscBool)(i<R);
911: kerneldatap[i].x = xp;
912: kerneldatap[i].gind = gind;
913: kerneldatap[i].localn = S?Q+1:Q;
914: xp += kerneldatap[i].localn; //pointer arithmetic
915: gind += kerneldatap[i].localn;
916: pdata[i] = &kerneldatap[i];
917: }
918: MainJob(VecMax_Kernel,(void**)pdata,iNumThreads);
919: //collect results, determine global max, global index
920: max = kerneldatap[0].localmax;
921: j = kerneldatap[0].localind;
922: for(i=1; i<iNumThreads; i++) {
923: if(kerneldatap[i].localmax>max) {
924: max = kerneldatap[i].localmax;
925: j = kerneldatap[i].gind+kerneldatap[i].localind;
926: }
927: }
928: free(kerneldatap);
929: free(pdata);
930: }
931: *z = max;
932: if (idx) *idx = j;
933: VecRestoreArrayRead(xin,&xx);
934: return(0);
935: }
937: void* VecMin_Kernel(void *arg)
938: {
939: if(PetscUseThreadPool==PETSC_FALSE) {
940: DoCoreAffinity();
941: }
942: VecMin_KernelData *data = (VecMin_KernelData*)arg;
943: const PetscScalar *xx = data->x;
944: PetscInt i,j,n = data->localn;
945: PetscReal lmin,tmp;
946: if (!n) { data->localmin = PETSC_MAX_REAL; data->localind = -1; return(0); }
947: #if defined(PETSC_USE_COMPLEX)
948: lmin = PetscRealPart(*xx++); j = 0;
949: #else
950: lmin = *xx++; j = 0;
951: #endif
952: for (i=1; i<n; i++) {
953: #if defined(PETSC_USE_COMPLEX)
954: if ((tmp = PetscRealPart(*xx++)) < lmin) { j = i; lmin = tmp;}
955: #else
956: if ((tmp = *xx++) < lmin) { j = i; lmin = tmp; }
957: #endif
958: }
960: data->localmin = lmin;
961: data->localind = j;
962: return(0);
963: }
967: PetscErrorCode VecMin_SeqPThread(Vec xin,PetscInt* idx,PetscReal * z)
968: {
969: PetscInt i,j=0,n = xin->map->n;
970: PetscReal min;
971: const PetscScalar *xx,*xp;
972: PetscErrorCode ierr;
975: VecGetArrayRead(xin,&xx);
976: if (!n) {
977: min = PETSC_MAX_REAL;
978: j = -1;
979: } else {
980: const PetscInt iNumThreads = PetscMaxThreads; //this number could be different
981: PetscInt gind,Q = n/(iNumThreads);
982: PetscInt R = n-Q*(iNumThreads);
983: PetscBool S;
984: VecMin_KernelData* kerneldatap = (VecMin_KernelData*)malloc(iNumThreads*sizeof(VecMin_KernelData));
985: VecMin_KernelData** pdata = (VecMin_KernelData**)malloc(iNumThreads*sizeof(VecMin_KernelData*));
987: gind = 0;
988: xp = xx;
989: for (i=0; i<iNumThreads; i++) {
990: S = (PetscBool)(i<R);
991: kerneldatap[i].x = xp;
992: kerneldatap[i].gind = gind;
993: kerneldatap[i].localn = S?Q+1:Q;
994: xp += kerneldatap[i].localn; //pointer arithmetic
995: gind += kerneldatap[i].localn;
996: pdata[i] = &kerneldatap[i];
997: }
998: MainJob(VecMin_Kernel,(void**)pdata,iNumThreads);
999: //collect results, determine global min, global index
1000: min = kerneldatap[0].localmin;
1001: j = kerneldatap[0].localind;
1002: for(i=1; i<iNumThreads; i++) {
1003: if(kerneldatap[i].localmin<min) {
1004: min = kerneldatap[i].localmin;
1005: j = kerneldatap[i].gind+kerneldatap[i].localind;
1006: }
1007: }
1008: free(kerneldatap);
1009: free(pdata);
1010: }
1011: *z = min;
1012: if (idx) *idx = j;
1013: VecRestoreArrayRead(xin,&xx);
1014: return(0);
1015: }
1017: void* VecPointwiseMult_Kernel(void *arg)
1018: {
1019: if(PetscUseThreadPool==PETSC_FALSE) {
1020: DoCoreAffinity();
1021: }
1022: VecPointwiseMult_KernelData *data = (VecPointwiseMult_KernelData*)arg;
1023: PetscScalar *ww = data->wpin,*xx = data->xpin,*yy = data->ypin;
1024: PetscInt n = data->nlocal,i;
1026: if (ww == xx) {
1027: for (i=0; i<n; i++) ww[i] *= yy[i];
1028: } else if (ww == yy) {
1029: for (i=0; i<n; i++) ww[i] *= xx[i];
1030: } else {
1031: #if defined(PETSC_USE_FORTRAN_KERNEL_XTIMESY)
1032: fortranxtimesy_(xx,yy,ww,&n);
1033: #else
1034: for (i=0; i<n; i++) ww[i] = xx[i] * yy[i];
1035: #endif
1036: }
1037: return(0);
1038: }
1040: #include <../src/vec/vec/impls/seq/ftn-kernels/fxtimesy.h>
1043: static PetscErrorCode VecPointwiseMult_SeqPThread(Vec win,Vec xin,Vec yin)
1044: {
1046: PetscInt n = win->map->n,i,iIndex;
1047: PetscScalar *ww,*xx,*yy; /* cannot make xx or yy const since might be ww */
1048: const PetscInt iNumThreads = PetscMaxThreads; //this number could be different
1049: PetscInt Q = n/(iNumThreads);
1050: PetscInt R = n-Q*(iNumThreads);
1051: PetscBool S;
1053: VecPointwiseMult_KernelData* kerneldatap = (VecPointwiseMult_KernelData*)malloc(iNumThreads*sizeof(VecPointwiseMult_KernelData));
1054: VecPointwiseMult_KernelData** pdata = (VecPointwiseMult_KernelData**)malloc(iNumThreads*sizeof(VecPointwiseMult_KernelData*));
1058: VecGetArrayRead(xin,(const PetscScalar**)&xx);
1059: VecGetArrayRead(yin,(const PetscScalar**)&yy);
1060: VecGetArray(win,&ww);
1062: iIndex = 0;
1063: for (i=0; i<iNumThreads; i++) {
1064: S = (PetscBool)(i<R);
1065: kerneldatap[i].wpin = ww+iIndex;
1066: kerneldatap[i].xpin = xx+iIndex;
1067: kerneldatap[i].ypin = yy+iIndex;
1068: kerneldatap[i].nlocal = S?Q+1:Q;
1069: iIndex += kerneldatap[i].nlocal;
1070: pdata[i] = &kerneldatap[i];
1071: }
1073: MainJob(VecPointwiseMult_Kernel,(void**)pdata,iNumThreads);
1074: free(kerneldatap);
1075: free(pdata);
1077: VecRestoreArrayRead(xin,(const PetscScalar**)&xx);
1078: VecRestoreArrayRead(yin,(const PetscScalar**)&yy);
1079: VecRestoreArray(win,&ww);
1080: PetscLogFlops(n);
1081: return(0);
1082: }
1084: void* VecPointwiseDivide_Kernel(void *arg)
1085: {
1086: if(PetscUseThreadPool==PETSC_FALSE) {
1087: DoCoreAffinity();
1088: }
1089: VecPointwiseDivide_KernelData *data = (VecPointwiseDivide_KernelData*)arg;
1090: PetscScalar *ww = data->wpin,*xx = data->xpin,*yy = data->ypin;
1091: PetscInt n = data->nlocal,i;
1093: for (i=0; i<n; i++) {
1094: ww[i] = xx[i] / yy[i];
1095: }
1096: return(0);
1097: }
1101: static PetscErrorCode VecPointwiseDivide_SeqPThread(Vec win,Vec xin,Vec yin)
1102: {
1104: PetscInt n = win->map->n,i,iIndex;
1105: PetscScalar *ww,*xx,*yy; /* cannot make xx or yy const since might be ww */
1106: const PetscInt iNumThreads = PetscMaxThreads; //this number could be different
1107: PetscInt Q = n/(iNumThreads);
1108: PetscInt R = n-Q*(iNumThreads);
1109: PetscBool S;
1111: VecPointwiseDivide_KernelData* kerneldatap = (VecPointwiseDivide_KernelData*)malloc(iNumThreads*sizeof(VecPointwiseDivide_KernelData));
1112: VecPointwiseDivide_KernelData** pdata = (VecPointwiseDivide_KernelData**)malloc(iNumThreads*sizeof(VecPointwiseDivide_KernelData*));
1116: VecGetArrayRead(xin,(const PetscScalar**)&xx);
1117: VecGetArrayRead(yin,(const PetscScalar**)&yy);
1118: VecGetArray(win,&ww);
1120: iIndex = 0;
1121: for (i=0; i<iNumThreads; i++) {
1122: S = (PetscBool)(i<R);
1123: kerneldatap[i].wpin = ww+iIndex;
1124: kerneldatap[i].xpin = xx+iIndex;
1125: kerneldatap[i].ypin = yy+iIndex;
1126: kerneldatap[i].nlocal = S?Q+1:Q;
1127: iIndex += kerneldatap[i].nlocal;
1128: pdata[i] = &kerneldatap[i];
1129: }
1131: MainJob(VecPointwiseDivide_Kernel,(void**)pdata,iNumThreads);
1132: free(kerneldatap);
1133: free(pdata);
1135: PetscLogFlops(n);
1136: VecRestoreArrayRead(xin,(const PetscScalar**)&xx);
1137: VecRestoreArrayRead(yin,(const PetscScalar**)&yy);
1138: VecRestoreArray(win,&ww);
1139: return(0);
1140: }
1142: void* VecSwap_Kernel(void *arg)
1143: {
1144: if(PetscUseThreadPool==PETSC_FALSE) {
1145: DoCoreAffinity();
1146: }
1147: VecSwap_KernelData *data = (VecSwap_KernelData*)arg;
1148: PetscScalar *xa = data->xpin,*ya = data->ypin;
1149: PetscBLASInt one = 1,bn = PetscBLASIntCast(data->nlocal);
1151: BLASswap_(&bn,xa,&one,ya,&one);
1152: return(0);
1153: }
1155: #include <petscblaslapack.h>
1158: static PetscErrorCode VecSwap_SeqPThread(Vec xin,Vec yin)
1159: {
1160: PetscScalar *ya, *xa;
1164: if (xin != yin) {
1165: VecGetArray(xin,&xa);
1166: VecGetArray(yin,&ya);
1167: const PetscInt iNumThreads = PetscMaxThreads; //this number could be different
1168: PetscInt n = xin->map->n,Q = n/(iNumThreads),R = n-Q*(iNumThreads),i,iIndex;
1169: PetscBool S;
1171: VecSwap_KernelData* kerneldatap = (VecSwap_KernelData*)malloc(iNumThreads*sizeof(VecSwap_KernelData));
1172: VecSwap_KernelData** pdata = (VecSwap_KernelData**)malloc(iNumThreads*sizeof(VecSwap_KernelData*));
1174: iIndex = 0;
1175: for (i=0; i<iNumThreads; i++) {
1176: S = (PetscBool)(i<R);
1177: kerneldatap[i].xpin = xa+iIndex;
1178: kerneldatap[i].ypin = ya+iIndex;
1179: kerneldatap[i].nlocal = S?Q+1:Q;
1180: iIndex += kerneldatap[i].nlocal;
1181: pdata[i] = &kerneldatap[i];
1182: }
1184: MainJob(VecSwap_Kernel,(void**)pdata,iNumThreads);
1185: free(kerneldatap);
1186: free(pdata);
1187: VecRestoreArray(xin,&xa);
1188: VecRestoreArray(yin,&ya);
1189: }
1190: return(0);
1191: }
1193: void* VecSetRandom_Kernel(void *arg)
1194: {
1195: if(PetscUseThreadPool==PETSC_FALSE) {
1196: DoCoreAffinity();
1197: }
1198: VecSetRandom_KernelData *data = (VecSetRandom_KernelData*)arg;
1199: PetscScalar *xx = data->xpin;
1200: PetscRandom r = data->rin;
1201: PetscInt i,n = data->nlocal;
1202: PetscErrorCode ierr;
1204: for(i=0; i<n; i++) {
1205: ierr = PetscRandomGetValue(r,&xx[i]);CHKERRQP(ierr);
1206: }
1207: return(0);
1208: }
1212: static PetscErrorCode VecSetRandom_SeqPThread(Vec xin,PetscRandom r)
1213: {
1215: PetscInt n = xin->map->n,i;
1216: PetscScalar *xx;
1217: const PetscInt iNumThreads = PetscMaxThreads; //this number could be different
1218: PetscInt Q = n/(iNumThreads),R = n-Q*(iNumThreads),iIndex;
1219: PetscBool S;
1221: VecSetRandom_KernelData* kerneldatap = (VecSetRandom_KernelData*)malloc(iNumThreads*sizeof(VecSetRandom_KernelData));
1222: VecSetRandom_KernelData** pdata = (VecSetRandom_KernelData**)malloc(iNumThreads*sizeof(VecSetRandom_KernelData*));
1225: VecGetArray(xin,&xx);
1227: iIndex = 0;
1228: for (i=0; i<iNumThreads; i++) {
1229: S = (PetscBool)(i<R);
1230: kerneldatap[i].xpin = xx+iIndex;
1231: kerneldatap[i].rin = r;
1232: kerneldatap[i].nlocal = S?Q+1:Q;
1233: iIndex += kerneldatap[i].nlocal;
1234: pdata[i] = &kerneldatap[i];
1235: }
1237: MainJob(VecSetRandom_Kernel,(void**)pdata,iNumThreads);
1238: free(kerneldatap);
1239: free(pdata);
1240: VecRestoreArray(xin,&xx);
1241: return(0);
1242: }
1244: void* VecCopy_Kernel(void *arg)
1245: {
1246: if(PetscUseThreadPool==PETSC_FALSE) {
1247: DoCoreAffinity();
1248: }
1249: VecCopy_KernelData *data = (VecCopy_KernelData*)arg;
1250: const PetscScalar *xa = data->xpin;
1251: PetscScalar *ya = data->ypin;
1252: PetscInt n = data->nlocal;
1253: PetscErrorCode ierr;
1255: ierr = PetscMemcpy(ya,xa,n*sizeof(PetscScalar));CHKERRQP(ierr);
1256: return(0);
1257: }
1261: static PetscErrorCode VecCopy_SeqPThread(Vec xin,Vec yin)
1262: {
1263: PetscScalar *ya;
1264: const PetscScalar *xa;
1265: PetscErrorCode ierr;
1268: if (xin != yin) {
1269: VecGetArrayRead(xin,&xa);
1270: VecGetArray(yin,&ya);
1272: PetscInt n = xin->map->n,i;
1273: const PetscInt iNumThreads = PetscMaxThreads; //this number could be different
1274: PetscInt Q = n/(iNumThreads),R = n-Q*(iNumThreads),iIndex;
1275: PetscBool S;
1277: VecCopy_KernelData* kerneldatap = (VecCopy_KernelData*)malloc(iNumThreads*sizeof(VecCopy_KernelData));
1278: VecCopy_KernelData** pdata = (VecCopy_KernelData**)malloc(iNumThreads*sizeof(VecCopy_KernelData*));
1280: iIndex = 0;
1281: for (i=0; i<iNumThreads; i++) {
1282: S = (PetscBool)(i<R);
1283: kerneldatap[i].xpin = xa+iIndex;
1284: kerneldatap[i].ypin = ya+iIndex;
1285: kerneldatap[i].nlocal = S?Q+1:Q;
1286: iIndex += kerneldatap[i].nlocal;
1287: pdata[i] = &kerneldatap[i];
1288: }
1289: MainJob(VecCopy_Kernel,(void**)pdata,iNumThreads);
1290: free(kerneldatap);
1291: free(pdata);
1293: VecRestoreArrayRead(xin,&xa);
1294: VecRestoreArray(yin,&ya);
1295: }
1296: return(0);
1297: }
1299: void* VecMAXPY_Kernel(void* arg)
1300: {
1301: if(PetscUseThreadPool==PETSC_FALSE) {
1302: DoCoreAffinity();
1303: }
1304: VecMAXPY_KernelData *data = (VecMAXPY_KernelData*)arg;
1305: PetscErrorCode ierr;
1306: PetscInt n = data->nelem,nv=data->ntoproc,ibase=data->ibase,j,j_rem;
1307: const PetscScalar *alpha=data->amult,*yy0,*yy1,*yy2,*yy3;
1308: PetscScalar *xx = data->xavalin,alpha0,alpha1,alpha2,alpha3;
1309: Vec* y = data->yavecin;
1311: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
1312: #pragma disjoint(*xx,*yy0,*yy1,*yy2,*yy3,*alpha)
1313: #endif
1315: switch (j_rem=nv&0x3) {
1316: case 3:
1317: ierr = VecGetArrayRead(y[0],&yy0);CHKERRQP(ierr);
1318: ierr = VecGetArrayRead(y[1],&yy1);CHKERRQP(ierr);
1319: ierr = VecGetArrayRead(y[2],&yy2);CHKERRQP(ierr);
1320: yy0 += ibase; yy1 += ibase; yy2 += ibase; //pointer arithmetic
1321: alpha0 = alpha[0];
1322: alpha1 = alpha[1];
1323: alpha2 = alpha[2];
1324: alpha += 3;
1325: PetscAXPY3(xx,alpha0,alpha1,alpha2,yy0,yy1,yy2,n);
1326: ierr = VecRestoreArrayRead(y[0],&yy0);CHKERRQP(ierr);
1327: ierr = VecRestoreArrayRead(y[1],&yy1);CHKERRQP(ierr);
1328: ierr = VecRestoreArrayRead(y[2],&yy2);CHKERRQP(ierr);
1329: y += 3;
1330: break;
1331: case 2:
1332: ierr = VecGetArrayRead(y[0],&yy0);CHKERRQP(ierr);
1333: ierr = VecGetArrayRead(y[1],&yy1);CHKERRQP(ierr);
1334: yy0 += ibase; yy1 += ibase;
1335: alpha0 = alpha[0];
1336: alpha1 = alpha[1];
1337: alpha +=2;
1338: PetscAXPY2(xx,alpha0,alpha1,yy0,yy1,n);
1339: ierr = VecRestoreArrayRead(y[0],&yy0);CHKERRQP(ierr);
1340: ierr = VecRestoreArrayRead(y[1],&yy1);CHKERRQP(ierr);
1341: y +=2;
1342: break;
1343: case 1:
1344: ierr = VecGetArrayRead(y[0],&yy0);CHKERRQP(ierr);
1345: yy0 += ibase;
1346: alpha0 = *alpha++;
1347: PetscAXPY(xx,alpha0,yy0,n);
1348: ierr = VecRestoreArrayRead(y[0],&yy0);CHKERRQP(ierr);
1349: y +=1;
1350: break;
1351: }
1352: for (j=j_rem; j<nv; j+=4) {
1353: ierr = VecGetArrayRead(y[0],&yy0);CHKERRQP(ierr);
1354: ierr = VecGetArrayRead(y[1],&yy1);CHKERRQP(ierr);
1355: ierr = VecGetArrayRead(y[2],&yy2);CHKERRQP(ierr);
1356: ierr = VecGetArrayRead(y[3],&yy3);CHKERRQP(ierr);
1357: yy0 += ibase; yy1 += ibase; yy2 += ibase; yy3 += ibase;
1358: alpha0 = alpha[0];
1359: alpha1 = alpha[1];
1360: alpha2 = alpha[2];
1361: alpha3 = alpha[3];
1362: alpha += 4;
1364: PetscAXPY4(xx,alpha0,alpha1,alpha2,alpha3,yy0,yy1,yy2,yy3,n);
1365: ierr = VecRestoreArrayRead(y[0],&yy0);CHKERRQP(ierr);
1366: ierr = VecRestoreArrayRead(y[1],&yy1);CHKERRQP(ierr);
1367: ierr = VecRestoreArrayRead(y[2],&yy2);CHKERRQP(ierr);
1368: ierr = VecRestoreArrayRead(y[3],&yy3);CHKERRQP(ierr);
1369: y += 4;
1370: }
1371: return(0);
1372: }
1376: PetscErrorCode VecMAXPY_SeqPThread(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
1377: {
1378: PetscErrorCode ierr;
1379: PetscInt n = xin->map->n,i,j=0;
1380: PetscScalar *xx;
1382: const PetscInt iNumThreads = PetscMaxThreads; //this number could be different
1383: PetscInt Q = n/(iNumThreads),R = n-Q*(iNumThreads);
1384: //PetscInt K = nv / 4; /*how many groups of 4 are present */
1385: //PetscInt Q = K / iNumThreads; /* how many groups of 4 to give to each thread */
1386: //PetscInt R = nv - Q*iNumThreads*4;
1387: PetscBool S;
1388: VecMAXPY_KernelData* kerneldatap = (VecMAXPY_KernelData*)malloc(iNumThreads*sizeof(VecMAXPY_KernelData));
1389: VecMAXPY_KernelData** pdata = (VecMAXPY_KernelData**)malloc(iNumThreads*sizeof(VecMAXPY_KernelData*));
1392: PetscLogFlops(nv*2.0*n);
1393: VecGetArray(xin,&xx);
1394: for(i=0;i<nv;i++) {
1395: if(y[i]->petscnative!=PETSC_TRUE) {
1396: printf("Non PETSC Native Vector!\n");
1397: }
1398: }
1399: for (i=0; i<iNumThreads; i++) {
1400: S = (PetscBool)(i<R);
1401: kerneldatap[i].xavalin = xx+j;
1402: kerneldatap[i].yavecin = &y[0];
1403: kerneldatap[i].amult = &alpha[0];
1404: kerneldatap[i].nelem = S?Q+1:Q;
1405: kerneldatap[i].ntoproc = nv;
1406: kerneldatap[i].ibase = j;
1407: j += kerneldatap[i].nelem;
1408: pdata[i] = &kerneldatap[i];
1409: }
1410: MainJob(VecMAXPY_Kernel,(void**)pdata,iNumThreads);
1411: free(kerneldatap);
1412: free(pdata);
1414: VecRestoreArray(xin,&xx);
1415: return(0);
1416: }
1418: void* VecSet_Kernel(void *arg)
1419: {
1420: if(PetscUseThreadPool==PETSC_FALSE) {
1421: DoCoreAffinity();
1422: }
1423: VecSet_KernelData *data = (VecSet_KernelData*)arg;
1424: PetscScalar *xx = data->xpin;
1425: PetscScalar alpha = data->alphain;
1426: PetscInt i,n = data->nelem;
1427: PetscErrorCode ierr;
1429: if (alpha == (PetscScalar)0.0) {
1430: ierr = PetscMemzero(xx,n*sizeof(PetscScalar));CHKERRQP(ierr);
1431: } else {
1432: for (i=0; i<n; i++) xx[i] = alpha;
1433: }
1434: return(0);
1435: }
1439: PetscErrorCode VecSet_SeqPThread(Vec xin,PetscScalar alpha)
1440: {
1441: PetscInt i,n = xin->map->n;
1442: PetscScalar *xx;
1446: VecGetArray(xin,&xx);
1447: const PetscInt iNumThreads = PetscMaxThreads; //this number could be different
1448: PetscInt Q = n/(iNumThreads),R = n-Q*(iNumThreads),iIndex;
1449: PetscBool S;
1451: VecSet_KernelData* kerneldatap = (VecSet_KernelData*)malloc(iNumThreads*sizeof(VecSet_KernelData));
1452: VecSet_KernelData** pdata = (VecSet_KernelData**)malloc(iNumThreads*sizeof(VecSet_KernelData*));
1454: iIndex = 0;
1455: for (i=0; i<iNumThreads; i++) {
1456: S = (PetscBool)(i<R);
1457: kerneldatap[i].xpin = xx+iIndex;
1458: kerneldatap[i].alphain = alpha;
1459: kerneldatap[i].nelem = S?Q+1:Q;
1460: iIndex += kerneldatap[i].nelem;
1461: pdata[i] = &kerneldatap[i];
1462: }
1463: MainJob(VecSet_Kernel,(void**)pdata,iNumThreads);
1464: free(kerneldatap);
1465: free(pdata);
1466: VecRestoreArray(xin,&xx);
1467: return(0);
1468: }
1472: PetscErrorCode VecDestroy_SeqPThread(Vec v)
1473: {
1477: VecDestroy_Seq(v);
1478: return(0);
1479: }
1481: void* DoCoreAffinity(void)
1482: {
1483: int i,icorr=0; cpu_set_t mset;
1484: pthread_t pThread = pthread_self();
1485: for(i=0; i<PetscMaxThreads; i++) {
1486: if(pthread_equal(pThread,PetscThreadPoint[i])) {
1487: icorr = ThreadCoreAffinity[i];
1488: }
1489: }
1490: CPU_ZERO(&mset);
1491: CPU_SET(icorr,&mset);
1492: sched_setaffinity(0,sizeof(cpu_set_t),&mset);
1493: return(0);
1494: }
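/*
   Note (added): DoCoreAffinity() is called at the top of each kernel when the
   thread pool is not in use; it locates the calling pthread in
   PetscThreadPoint[] and pins it to the core stored in ThreadCoreAffinity[]
   via sched_setaffinity().  The same pinning pattern in isolation (the
   variable core is illustrative only):

     cpu_set_t set;
     CPU_ZERO(&set);
     CPU_SET(core,&set);
     sched_setaffinity(0,sizeof(cpu_set_t),&set);  // 0 selects the calling thread
*/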
1499: PetscErrorCode VecCreate_SeqPThread(Vec V)
1500: {
1502: PetscMPIInt size;
1503: PetscScalar *array;
1504: PetscInt n = PetscMax(V->map->n,V->map->N);
1507: MPI_Comm_size(((PetscObject)V)->comm,&size);
1508: if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQPTHREAD on more than one process");
1509: PetscMalloc(n*sizeof(PetscScalar),&array);
1510: PetscLogObjectMemory(V, n*sizeof(PetscScalar));
1511: PetscMemzero(array,n*sizeof(PetscScalar));
1512: VecCreate_Seq_Private(V,array);
1513: PetscObjectChangeTypeName((PetscObject)V,VECSEQPTHREAD);
1514: V->ops->dot = VecDot_SeqPThread;
1515: V->ops->mdot = VecMDot_SeqPThread;
1516: V->ops->scale = VecScale_SeqPThread;
1517: V->ops->axpy = VecAXPY_SeqPThread;
1518: V->ops->aypx = VecAYPX_SeqPThread;
1519: V->ops->waxpy = VecWAXPY_SeqPThread;
1520: V->ops->norm = VecNorm_SeqPThread;
1521: V->ops->max = VecMax_SeqPThread;
1522: V->ops->min = VecMin_SeqPThread;
1523: V->ops->pointwisemult = VecPointwiseMult_SeqPThread;
1524: V->ops->pointwisedivide = VecPointwiseDivide_SeqPThread;
1525: V->ops->swap = VecSwap_SeqPThread;
1526: V->ops->setrandom = VecSetRandom_SeqPThread;
1527: V->ops->copy = VecCopy_SeqPThread;
1528: V->ops->maxpy = VecMAXPY_SeqPThread;
1529: V->ops->set = VecSet_SeqPThread;
1530: VecSet(V,0);
1531: return(0);
1532: }
1538: PetscErrorCode VecCreate_PThread(Vec v)
1539: {
1541: PetscMPIInt size;
1544: MPI_Comm_size(((PetscObject)v)->comm,&size);
1545: if (size == 1) {
1546: VecSetType(v,VECSEQPTHREAD);
1547: } else SETERRQ(((PetscObject)v)->comm,PETSC_ERR_SUP,"No parallel thread vector yet");
1548: return(0);
1549: }