Actual source code: vecpthread.c

  2: /*
  3:    Implements the sequential pthread-based vectors.
  4: */
  5: #ifndef _GNU_SOURCE
  6: #define _GNU_SOURCE
  7: #endif
  8: #include <sched.h>
  9: #include <petscconf.h>
 10: #include <private/vecimpl.h>          /*I "petscvec.h" I*/
 11: #include <../src/vec/vec/impls/dvecimpl.h>
 12: #include <petscblaslapack.h>
 13: #include <private/petscaxpy.h>
 14: #include <pthread.h>
 15: #include <unistd.h>

 23: void* PetscThreadRun(MPI_Comm Comm,void* (*pFunc)(void*),int,pthread_t*,void**);
 24: void* PetscThreadStop(MPI_Comm Comm,int,pthread_t*);
 25: void* DoCoreAffinity(void);
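/*
   Every VecXXX_SeqPThread routine below splits the local vector length n into
   PetscMaxThreads nearly equal contiguous chunks: with Q = n/nthreads and
   R = n - Q*nthreads, the first R threads receive Q+1 entries and the rest
   receive Q.  The helper below is only an illustrative sketch of that
   decomposition (it is not called anywhere in this file, and the name
   VecPThreadChunkLen_Sketch is invented for the example).
*/
static PetscInt VecPThreadChunkLen_Sketch(PetscInt n,PetscInt nthreads,PetscInt tid)
{
  PetscInt Q = n/nthreads;          /* base chunk size                     */
  PetscInt R = n - Q*nthreads;      /* leftover entries, 0 <= R < nthreads */
  return (tid < R) ? Q+1 : Q;       /* first R threads take one extra      */
}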

 27: typedef struct {
 28:   const PetscScalar *x,*y;
 29:   PetscInt          n;
 30:   PetscScalar       result;
 31: } VecDot_KernelData;

 33: typedef struct {
 34:   PetscScalar *x;
 35:   PetscScalar alpha;
 36:   PetscInt    n;
 37: } VecScale_KernelData;

 39: typedef struct {
 40:   PetscScalar *y;
 41:   const PetscScalar *x;
 42:   PetscScalar alpha;
 43:   PetscInt    n;
 44: } VecAXPY_KernelData;

 46: typedef struct {
 47:   PetscScalar *yy;
 48:   const PetscScalar *xx;
 49:   PetscScalar alpha;
 50:   PetscInt    n;
 51: } VecAYPX_KernelData;

 53: typedef struct {
 54:   PetscScalar *ww;
 55:   const PetscScalar *yy;
 56:   const PetscScalar *xx;
 57:   PetscScalar alpha;
 58:   PetscInt    n;
 59: } VecWAXPY_KernelData;

 61: typedef struct {
 62:   const PetscScalar *x;
 63:   NormType typeUse;
 64:   PetscInt    n;
 65:   PetscScalar result;
 66: } VecNorm_KernelData;

 68: typedef struct {
 69:   const PetscScalar* xvalin;
 70:   Vec*               yavecin;
 71:   PetscInt           nelem;
 72:   PetscInt           ntoproc;
 73:   PetscScalar*       result;
 74: } VecMDot_KernelData;

 76: typedef struct {
 77:   const PetscScalar *x;
 78:   PetscInt          gind;
 79:   PetscInt          localn;
 80:   PetscInt          localind;
 81:   PetscReal         localmax;
 82: } VecMax_KernelData;

 84: typedef struct {
 85:   const PetscScalar *x;
 86:   PetscInt          gind;
 87:   PetscInt          localn;
 88:   PetscInt          localind;
 89:   PetscReal         localmin;
 90: } VecMin_KernelData;

 92: typedef struct {
 93:   PetscScalar *wpin,*xpin,*ypin;
 94:   PetscInt          nlocal;
 95: } VecPointwiseMult_KernelData;

 97: typedef struct {
 98:   PetscScalar *wpin,*xpin,*ypin;
 99:   PetscInt          nlocal;
100: } VecPointwiseDivide_KernelData;

102: typedef struct {
103:   PetscScalar *xpin,*ypin;
104:   PetscInt nlocal;
105: } VecSwap_KernelData;

107: typedef struct {
108:   PetscScalar *xpin;
109:   PetscRandom   rin;
110:   PetscInt nlocal;
111: } VecSetRandom_KernelData;

113: typedef struct {
114:   const PetscScalar *xpin;
115:   PetscScalar   *ypin;
116:   PetscInt nlocal;
117: } VecCopy_KernelData;

119: typedef struct {
120:   PetscScalar*       xavalin; //vector out
121:   Vec*               yavecin; //array of data vectors
122:   const PetscScalar* amult;   //multipliers
123:   PetscInt           nelem;   //number of elements in vector to process
124:   PetscInt           ntoproc; //number of data vectors
125:   PetscInt           ibase;   //used to properly index into other vectors
126: } VecMAXPY_KernelData;

128: typedef struct {
129:   PetscScalar *xpin;
130:   PetscScalar alphain;
131:   PetscInt nelem;
132: } VecSet_KernelData;

134: void* PetscThreadRun(MPI_Comm Comm,void* (*funcp)(void*),int iTotThreads,pthread_t* ThreadId,void** data) {
 136:   int i;
137:   for(i=0; i<iTotThreads; i++) {
138:     pthread_create(&ThreadId[i],NULL,funcp,data[i]);
139:   }
140:   return NULL;
141: }

143: void* PetscThreadStop(MPI_Comm Comm,int iTotThreads,pthread_t* ThreadId) {
144:   int i;
145:   void* joinstatus;
146:   for (i=0; i<iTotThreads; i++) {
147:     pthread_join(ThreadId[i], &joinstatus);
148:   }
149:   return NULL;
150: }
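/*
   PetscThreadRun() spawns one pthread per kernel invocation and
   PetscThreadStop() joins them all again; this is (presumably) the path used
   when no thread pool is active.  A minimal sketch of the pairing, where
   data[] holds one kernel-argument pointer per thread:

       pthread_t tid[PetscMaxThreads];
       PetscThreadRun(PETSC_COMM_WORLD,VecDot_Kernel,PetscMaxThreads,tid,data);
       PetscThreadStop(PETSC_COMM_WORLD,PetscMaxThreads,tid);
*/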

 152: /* Redefine these macros so they can be used in thread kernels */
153: #undef CHKERRQP
154: #define CHKERRQP(ierr) if (ierr) return (void*)(long int)ierr
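/*
   CHKERRQP is the thread-kernel analogue of CHKERRQ(): the kernels have a
   void* return type, so a nonzero PetscErrorCode is passed back through the
   pointer value instead of being returned as an int.  Typical use inside a
   kernel (sketch):

       PetscErrorCode ierr;
       ierr = VecGetArrayRead(v,&a);CHKERRQP(ierr);
*/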

156: void* VecDot_Kernel(void *arg)
157: {
158:   if(PetscUseThreadPool==PETSC_FALSE) {
159:     DoCoreAffinity();
160:   }
161:   VecDot_KernelData *data = (VecDot_KernelData*)arg;
162:   const PetscScalar *x, *y;
163:   PetscBLASInt one = 1, bn;
164:   PetscInt    n;

166:   x = data->x;
167:   y = data->y;
168:   n = data->n;
169:   bn = PetscBLASIntCast(n);
170:   data->result = BLASdot_(&bn,x,&one,y,&one);
171:   return(0);
172: }

176: PetscErrorCode VecDot_SeqPThread(Vec xin,Vec yin,PetscScalar *z)
177: {
178:   const PetscScalar *ya,*xa;
179:   PetscErrorCode    ierr;
180:   PetscInt          i, iIndex = 0;
181:   const PetscInt    iNumThreads = PetscMaxThreads;  //this number could be different
182:   PetscInt          Q = xin->map->n/(iNumThreads);
183:   PetscInt          R = xin->map->n-Q*(iNumThreads);
184:   PetscBool         S;

187:   VecGetArrayRead(xin,&xa);
188:   VecGetArrayRead(yin,&ya);

190:   VecDot_KernelData* kerneldatap = (VecDot_KernelData*)malloc(iNumThreads*sizeof(VecDot_KernelData));
191:   VecDot_KernelData** pdata = (VecDot_KernelData**)malloc(iNumThreads*sizeof(VecDot_KernelData*));

193:   for (i=0; i<iNumThreads; i++) {
194:     S = (PetscBool)(i<R);
195:     kerneldatap[i].x = &xa[iIndex];
196:     kerneldatap[i].y = &ya[iIndex];
197:     kerneldatap[i].n = S?Q+1:Q;
198:     iIndex += kerneldatap[i].n;
199:     pdata[i] = &kerneldatap[i];
200:   }

202:   MainJob(VecDot_Kernel,(void**)pdata,iNumThreads);

204:   //gather result
205:   *z = 0.0;
206:   for(i=0; i<iNumThreads; i++) {
207:     *z += kerneldatap[i].result;
208:   }
209:   free(kerneldatap);
210:   free(pdata);

212:   VecRestoreArrayRead(xin,&xa);
213:   VecRestoreArrayRead(yin,&ya);

215:   if (xin->map->n > 0) {
216:     PetscLogFlops(2.0*xin->map->n-1);
217:   }
 218:   PetscFunctionReturn(0);
219: }


222: void* VecScale_Kernel(void *arg)
223: {
224:   if(PetscUseThreadPool==PETSC_FALSE) {
225:     DoCoreAffinity();
226:   }
227:   VecScale_KernelData *data = (VecScale_KernelData*)arg;
228:   PetscScalar a,*x;
229:   PetscBLASInt one = 1, bn;
230:   PetscInt    n;

232:   x = data->x;
233:   a = data->alpha;
234:   n = data->n;
235:   bn = PetscBLASIntCast(n);
236:   BLASscal_(&bn,&a,x,&one);
237:   return(0);
238: }

242: PetscErrorCode VecScale_SeqPThread(Vec xin, PetscScalar alpha)
243: {


248:   if (alpha == 0.0) {
249:     VecSet_Seq(xin,alpha);
250:   } else if (alpha != 1.0) {
251:     PetscScalar a = alpha,*xarray,*xp;
252:     const PetscInt    iNumThreads = PetscMaxThreads;  //this number could be different
253:     PetscInt          i,Q = xin->map->n/(iNumThreads);
254:     PetscInt          R = xin->map->n-Q*(iNumThreads);
255:     PetscBool         S;
256:     VecScale_KernelData* kerneldatap = (VecScale_KernelData*)malloc(iNumThreads*sizeof(VecScale_KernelData));
257:     VecScale_KernelData** pdata = (VecScale_KernelData**)malloc(iNumThreads*sizeof(VecScale_KernelData*));
258:     VecGetArray(xin,&xarray); //get address of first element in data array
259:     xp = xarray;
260:     for (i=0; i<iNumThreads; i++) {
261:       S = (PetscBool)(i<R);
262:       kerneldatap[i].x = xp;
263:       kerneldatap[i].alpha = a;
264:       kerneldatap[i].n = S?Q+1:Q;
265:       xp += kerneldatap[i].n; //pointer arithmetic
266:       pdata[i] = &kerneldatap[i];
267:     }
268:     MainJob(VecScale_Kernel,(void**)pdata,iNumThreads);
269:     free(kerneldatap);
270:     free(pdata);
271:     VecRestoreArray(xin,&xarray);
272:   }
273:   PetscLogFlops(xin->map->n);
 274:   PetscFunctionReturn(0);
275: }

277: void* VecAXPY_Kernel(void *arg)
278: {
279:   if(PetscUseThreadPool==PETSC_FALSE) {
280:     DoCoreAffinity();
281:   }
282:   VecAXPY_KernelData *data = (VecAXPY_KernelData*)arg;
283:   PetscScalar a,*y;
284:   const PetscScalar *x;
285:   PetscBLASInt one = 1, bn;
286:   PetscInt    n;

288:   x = data->x;
289:   y = data->y;
290:   a = data->alpha;
291:   n = data->n;
292:   bn = PetscBLASIntCast(n);
293:   BLASaxpy_(&bn,&a,x,&one,y,&one);
294:   return(0);
295: }

299: PetscErrorCode VecAXPY_SeqPThread(Vec yin,PetscScalar alpha,Vec xin)
300: {
301:   PetscErrorCode    ierr;
302:   const PetscScalar *xarray,*xp;
303:   PetscScalar       a=alpha,*yarray,*yp;
304:   const PetscInt    iNumThreads = PetscMaxThreads;  //this number could be different
305:   PetscInt          i,Q = xin->map->n/(iNumThreads);
306:   PetscInt          R = xin->map->n-Q*(iNumThreads);
307:   PetscBool         S;
308:   VecAXPY_KernelData* kerneldatap = (VecAXPY_KernelData*)malloc(iNumThreads*sizeof(VecAXPY_KernelData));
309:   VecAXPY_KernelData** pdata = (VecAXPY_KernelData**)malloc(iNumThreads*sizeof(VecAXPY_KernelData*));

312:   /* assume that the BLAS handles alpha == 1.0 efficiently since we have no fast code for it */
313:   if (alpha != 0.0) {
314:     VecGetArrayRead(xin,&xarray);
315:     VecGetArray(yin,&yarray);
316:     xp = xarray;
317:     yp = yarray;
318:     for (i=0; i<iNumThreads; i++) {
319:       S = (PetscBool)(i<R);
320:       kerneldatap[i].x = xp;
321:       kerneldatap[i].y = yp;
322:       kerneldatap[i].alpha = a;
323:       kerneldatap[i].n = S?Q+1:Q;
324:       xp += kerneldatap[i].n; //pointer arithmetic
325:       yp += kerneldatap[i].n;
326:       pdata[i] = &kerneldatap[i];
327:     }
328:     MainJob(VecAXPY_Kernel,(void**)pdata,iNumThreads);
329:     VecRestoreArrayRead(xin,&xarray);
330:     VecRestoreArray(yin,&yarray);
331:     PetscLogFlops(2.0*yin->map->n);
332:   }
333:   free(kerneldatap);
334:   free(pdata);
335:   return(0);
336: }

338: void* VecAYPX_Kernel(void *arg)
339: {
340:   if(PetscUseThreadPool==PETSC_FALSE) {
341:     DoCoreAffinity();
342:   }
343:   VecAYPX_KernelData *data = (VecAYPX_KernelData*)arg;
344:   PetscScalar a,*yy;
345:   const PetscScalar *xx;
346:   PetscInt    i,n;

348:   xx = data->xx;
349:   yy = data->yy;
350:   a = data->alpha;
351:   n = data->n;
352:   if(a==-1.0) {
353:     for (i=0; i<n; i++) {
354:       yy[i] = xx[i] - yy[i];
355:     }
356:   }
357:   else {
358:     for (i=0; i<n; i++) {
359:       yy[i] = xx[i] + a*yy[i];
360:     }
361:   }
362:   return(0);
363: }

367: PetscErrorCode VecAYPX_SeqPThread(Vec yin,PetscScalar alpha,Vec xin)
368: {
369:   PetscErrorCode    ierr;


373:   if (alpha == 0.0) {
374:     VecCopy(xin,yin);
375:   }
376:   else if (alpha == 1.0) {
377:     VecAXPY_SeqPThread(yin,alpha,xin);
378:   }
379:   else {
380:     PetscInt          n = yin->map->n;
381:     PetscScalar       *yy;
382:     const PetscScalar *xx;
383:     VecGetArrayRead(xin,&xx);
384:     VecGetArray(yin,&yy);
385:     #if defined(PETSC_USE_FORTRAN_KERNEL_AYPX)
386:     {
387:       PetscScalar oalpha = alpha;
388:       fortranaypx_(&n,&oalpha,xx,yy);
389:     }
390:     #else
391:     {
392:       const PetscScalar *xp = xx;
393:       PetscScalar       a=alpha,*yp = yy;
394:       const PetscInt    iNumThreads = PetscMaxThreads;  //this number could be different
395:       PetscInt          i,Q = xin->map->n/(iNumThreads);
396:       PetscInt          R = xin->map->n-Q*(iNumThreads);
397:       PetscBool         S;
398:       VecAYPX_KernelData* kerneldatap = (VecAYPX_KernelData*)malloc(iNumThreads*sizeof(VecAYPX_KernelData));
399:       VecAYPX_KernelData** pdata = (VecAYPX_KernelData**)malloc(iNumThreads*sizeof(VecAYPX_KernelData*));

401:       for (i=0; i<iNumThreads; i++) {
402:         S = (PetscBool)(i<R);
403:         kerneldatap[i].xx = xp;
404:         kerneldatap[i].yy = yp;
405:         kerneldatap[i].alpha = a;
406:         kerneldatap[i].n = S?Q+1:Q;
407:         xp += kerneldatap[i].n; //pointer arithmetic
408:         yp += kerneldatap[i].n;
409:         pdata[i] = &kerneldatap[i];
410:       }
411:       MainJob(VecAYPX_Kernel,(void**)pdata,iNumThreads);
412:       free(kerneldatap);
413:       free(pdata);
414:     }
415:     #endif
416:     VecRestoreArrayRead(xin,&xx);
417:     VecRestoreArray(yin,&yy);
418:     if(alpha==-1.0) {
419:       PetscLogFlops(1.0*n);
420:     }
421:     else {
422:       PetscLogFlops(2.0*n);
423:     }
424:   }
425:   return(0);
426: }

428: void* VecWAXPY_Kernel(void *arg)
429: {
430:   if(PetscUseThreadPool==PETSC_FALSE) {
431:     DoCoreAffinity();
432:   }
433:   VecWAXPY_KernelData *data = (VecWAXPY_KernelData*)arg;
434:   PetscScalar a,*ww;
435:   const PetscScalar *xx,*yy;
436:   PetscInt    i,n;

438:   ww = data->ww;
439:   xx = data->xx;
440:   yy = data->yy;
441:   a = data->alpha;
442:   n = data->n;
443:   if(a==-1.0) {
444:     for (i=0; i<n; i++) {
445:       ww[i] = yy[i] - xx[i];
446:     }
447:   }
448:   else if(a==1.0) {
449:     for (i=0; i<n; i++) {
450:       ww[i] = yy[i] + xx[i];
451:     }
452:   }
453:   else {
454:     for (i=0; i<n; i++) {
455:       ww[i] = a*xx[i] + yy[i];
456:     }
457:   }
458:   return(0);
459: }

463: PetscErrorCode VecWAXPY_SeqPThread(Vec win, PetscScalar alpha,Vec xin,Vec yin)
464: {
465:   PetscErrorCode     ierr;
466:   PetscInt           n = win->map->n;
467:   PetscScalar        *ww;
468:   const PetscScalar  *yy,*xx;


472:   VecGetArrayRead(xin,&xx);
473:   VecGetArrayRead(yin,&yy);
474:   VecGetArray(win,&ww);
475:   if (alpha == 0.0) {
476:     PetscMemcpy(ww,yy,n*sizeof(PetscScalar));
477:   }
478:   else {
479: #if defined(PETSC_USE_FORTRAN_KERNEL_WAXPY)
480:     PetscScalar oalpha = alpha;
481:     fortranwaxpy_(&n,&oalpha,xx,yy,ww);
482: #else
483:     const PetscScalar *xp = xx,*yp = yy;
484:     PetscScalar       a=alpha,*wp = ww;
485:     const PetscInt    iNumThreads = PetscMaxThreads;  //this number could be different
486:     PetscInt          i,Q = n/(iNumThreads);
487:     PetscInt          R = n-Q*(iNumThreads);
488:     PetscBool         S;
489:     VecWAXPY_KernelData* kerneldatap = (VecWAXPY_KernelData*)malloc(iNumThreads*sizeof(VecWAXPY_KernelData));
490:     VecWAXPY_KernelData** pdata = (VecWAXPY_KernelData**)malloc(iNumThreads*sizeof(VecWAXPY_KernelData*));

492:     for (i=0; i<iNumThreads; i++) {
493:       S = (PetscBool)(i<R);
494:       kerneldatap[i].ww = wp;
495:       kerneldatap[i].xx = xp;
496:       kerneldatap[i].yy = yp;
497:       kerneldatap[i].alpha = a;
498:       kerneldatap[i].n = S?Q+1:Q;
499:       wp += kerneldatap[i].n; //pointer arithmetic
500:       xp += kerneldatap[i].n;
501:       yp += kerneldatap[i].n;
502:       pdata[i] = &kerneldatap[i];
503:     }
504:     MainJob(VecWAXPY_Kernel,(void**)pdata,iNumThreads);
505:     free(kerneldatap);
506:     free(pdata);
507: #endif
508:     if (alpha == 1.0 || alpha == -1.0) {
509:       PetscLogFlops(1.0*n);
510:     }
511:     else {
512:       PetscLogFlops(2.0*n);
513:     }
514:   }
515:   VecRestoreArrayRead(xin,&xx);
516:   VecRestoreArrayRead(yin,&yy);
517:   VecRestoreArray(win,&ww);
518:   return(0);
519: }

521: void* VecNorm_Kernel(void *arg)
522: {
523:   if(PetscUseThreadPool==PETSC_FALSE) {
524:     DoCoreAffinity();
525:   }
526:   VecNorm_KernelData *data = (VecNorm_KernelData*)arg;
527:   const PetscScalar *x;
528:   NormType type;
529:   PetscInt    i,n;

531:   x = data->x;
532:   type = data->typeUse;
533:   n = data->n;
534:   data->result = 0.0;
535:   if(type==NORM_1) {
536:     PetscBLASInt one = 1, bn = PetscBLASIntCast(n);
537:     data->result = BLASasum_(&bn,x,&one);
538:   }
539:   else if(type==NORM_INFINITY) {
540:     PetscReal    maxv = 0.0,tmp;
541:     for(i=0; i<n; i++) {
542:       tmp = PetscAbsScalar(x[i]);
543:       if(tmp>maxv) {
544:         maxv = tmp;
545:       }
546:     }
547:     data->result = maxv;
548:   }
549:   else {
550:     PetscBLASInt one = 1, bn = PetscBLASIntCast(n);
551:     data->result = BLASnrm2_(&bn,x,&one);
552:   }
553:   return(0);
554: }

558: PetscErrorCode VecNorm_SeqPThread(Vec xin,NormType type,PetscReal* z)
559: {
560:   const PetscScalar *xx;
561:   PetscErrorCode    ierr;
562:   PetscInt          n = xin->map->n;

565:   if(type == NORM_1_AND_2) {
566:     VecNorm_SeqPThread(xin,NORM_1,z);
567:     VecNorm_SeqPThread(xin,NORM_2,z+1);
568:   }
569:   else {
570:    VecGetArrayRead(xin,&xx);
571:    const PetscScalar *xp = xx;
572:    const PetscInt    iNumThreads = PetscMaxThreads;  //this number could be different
573:    PetscInt          i,Q = n/(iNumThreads);
574:    PetscInt          R = n-Q*(iNumThreads);
575:    PetscBool         S;
576:    VecNorm_KernelData* kerneldatap = (VecNorm_KernelData*)malloc(iNumThreads*sizeof(VecNorm_KernelData));
577:    VecNorm_KernelData** pdata = (VecNorm_KernelData**)malloc(iNumThreads*sizeof(VecNorm_KernelData*));

579:    for (i=0; i<iNumThreads; i++) {
580:      S = (PetscBool)(i<R);
581:      kerneldatap[i].x = xp;
582:      kerneldatap[i].typeUse = type;
583:      kerneldatap[i].n = S?Q+1:Q;
584:      xp += kerneldatap[i].n; //pointer arithmetic
585:      pdata[i] = &kerneldatap[i];
586:    }
587:    MainJob(VecNorm_Kernel,(void**)pdata,iNumThreads);
588:    //collect results
589:    *z = 0.0;
590:    if(type == NORM_1) {
591:      for(i=0; i<iNumThreads; i++) {
592:        *z += kerneldatap[i].result;
593:      }
594:      PetscLogFlops(PetscMax(n-1.0,0.0));
595:    }
596:    else if(type == NORM_2 || type == NORM_FROBENIUS) {
597:      for(i=0; i<iNumThreads; i++) {
598:        *z += kerneldatap[i].result*kerneldatap[i].result;
599:      }
600:      *z = sqrt(*z);
601:      PetscLogFlops(PetscMax(2.0*n-1,0.0));
602:    }
603:    else {
604:      PetscReal    maxv = 0.0,tmp;
605:      for(i=0; i<iNumThreads; i++) {
606:        tmp = kerneldatap[i].result;
607:        if(tmp>maxv) {
608:          maxv = tmp;
609:        }
610:      }
611:      *z = maxv;
612:    }
613:    free(kerneldatap);
614:    free(pdata);
615:    VecRestoreArrayRead(xin,&xx);
616:   }
617:   return(0);
618: }
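/*
   Reduction used above: each thread computes the norm of its own chunk, so
   for NORM_1 the partial results simply add, for NORM_INFINITY the maximum is
   taken, and for NORM_2 / NORM_FROBENIUS the partial 2-norms r_i combine as

       ||x||_2 = sqrt( r_0^2 + r_1^2 + ... + r_{p-1}^2 )

   which is why each partial result is squared before being summed.
*/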

620: void* VecMDot_Kernel(void *arg)
621: {
622:   if(PetscUseThreadPool==PETSC_FALSE) {
623:     DoCoreAffinity();
624:   }
625:   VecMDot_KernelData *data = (VecMDot_KernelData*)arg;
626:   const PetscScalar  *xbase = data->xvalin;
627:   Vec*               yin = data->yavecin;
628:   PetscInt           n = data->nelem;
629:   PetscInt           nv = data->ntoproc;
630:   PetscScalar*       z = data->result;
631:   PetscErrorCode     ierr;
632:   PetscInt           i,j,nv_rem,j_rem;
633:   PetscScalar        sum0,sum1,sum2,sum3,x0,x1,x2,x3;
634:   const PetscScalar  *yy0,*yy1,*yy2,*yy3,*x;
635:   Vec                *yy;

637:   sum0 = 0.;
638:   sum1 = 0.;
639:   sum2 = 0.;
640:   i      = nv;
641:   nv_rem = nv&0x3;
642:   yy     = yin;
643:   j      = n;
644:   x      = xbase;

646:   switch (nv_rem) {
647:   case 3:
 648:     ierr = VecGetArrayRead(yy[0],&yy0);CHKERRQP(ierr);
 649:     ierr = VecGetArrayRead(yy[1],&yy1);CHKERRQP(ierr);
 650:     ierr = VecGetArrayRead(yy[2],&yy2);CHKERRQP(ierr);
651:     switch (j_rem=j&0x3) {
652:     case 3:
653:       x2 = x[2];
654:       sum0 += x2*PetscConj(yy0[2]); sum1 += x2*PetscConj(yy1[2]);
655:       sum2 += x2*PetscConj(yy2[2]);
656:     case 2:
657:       x1 = x[1];
658:       sum0 += x1*PetscConj(yy0[1]); sum1 += x1*PetscConj(yy1[1]);
659:       sum2 += x1*PetscConj(yy2[1]);
660:     case 1:
661:       x0 = x[0];
662:       sum0 += x0*PetscConj(yy0[0]); sum1 += x0*PetscConj(yy1[0]);
663:       sum2 += x0*PetscConj(yy2[0]);
664:     case 0:
665:       x   += j_rem;
666:       yy0 += j_rem;
667:       yy1 += j_rem;
668:       yy2 += j_rem;
669:       j   -= j_rem;
670:       break;
671:     }
672:     while (j>0) {
673:       x0 = x[0];
674:       x1 = x[1];
675:       x2 = x[2];
676:       x3 = x[3];
677:       x += 4;
678: 
679:       sum0 += x0*PetscConj(yy0[0]) + x1*PetscConj(yy0[1]) + x2*PetscConj(yy0[2]) + x3*PetscConj(yy0[3]); yy0+=4;
680:       sum1 += x0*PetscConj(yy1[0]) + x1*PetscConj(yy1[1]) + x2*PetscConj(yy1[2]) + x3*PetscConj(yy1[3]); yy1+=4;
681:       sum2 += x0*PetscConj(yy2[0]) + x1*PetscConj(yy2[1]) + x2*PetscConj(yy2[2]) + x3*PetscConj(yy2[3]); yy2+=4;
682:       j -= 4;
683:     }
684:     z[0] = sum0;
685:     z[1] = sum1;
686:     z[2] = sum2;
 687:     ierr = VecRestoreArrayRead(yy[0],&yy0);CHKERRQP(ierr);
 688:     ierr = VecRestoreArrayRead(yy[1],&yy1);CHKERRQP(ierr);
 689:     ierr = VecRestoreArrayRead(yy[2],&yy2);CHKERRQP(ierr);
690:     break;
691:   case 2:
 692:     ierr = VecGetArrayRead(yy[0],&yy0);CHKERRQP(ierr);
 693:     ierr = VecGetArrayRead(yy[1],&yy1);CHKERRQP(ierr);
694:     switch (j_rem=j&0x3) {
695:     case 3:
696:       x2 = x[2];
697:       sum0 += x2*PetscConj(yy0[2]); sum1 += x2*PetscConj(yy1[2]);
698:     case 2:
699:       x1 = x[1];
700:       sum0 += x1*PetscConj(yy0[1]); sum1 += x1*PetscConj(yy1[1]);
701:     case 1:
702:       x0 = x[0];
703:       sum0 += x0*PetscConj(yy0[0]); sum1 += x0*PetscConj(yy1[0]);
704:     case 0:
705:       x   += j_rem;
706:       yy0 += j_rem;
707:       yy1 += j_rem;
708:       j   -= j_rem;
709:       break;
710:     }
711:     while (j>0) {
712:       x0 = x[0];
713:       x1 = x[1];
714:       x2 = x[2];
715:       x3 = x[3];
716:       x += 4;
717: 
718:       sum0 += x0*PetscConj(yy0[0]) + x1*PetscConj(yy0[1]) + x2*PetscConj(yy0[2]) + x3*PetscConj(yy0[3]); yy0+=4;
719:       sum1 += x0*PetscConj(yy1[0]) + x1*PetscConj(yy1[1]) + x2*PetscConj(yy1[2]) + x3*PetscConj(yy1[3]); yy1+=4;
720:       j -= 4;
721:     }
722:     z[0] = sum0;
723:     z[1] = sum1;
724: 
 725:     ierr = VecRestoreArrayRead(yy[0],&yy0);CHKERRQP(ierr);
 726:     ierr = VecRestoreArrayRead(yy[1],&yy1);CHKERRQP(ierr);
727:     break;
728:   case 1:
 729:     ierr = VecGetArrayRead(yy[0],&yy0);CHKERRQP(ierr);
730:     switch (j_rem=j&0x3) {
731:     case 3:
732:       x2 = x[2]; sum0 += x2*PetscConj(yy0[2]);
733:     case 2:
734:       x1 = x[1]; sum0 += x1*PetscConj(yy0[1]);
735:     case 1:
736:       x0 = x[0]; sum0 += x0*PetscConj(yy0[0]);
737:     case 0:
738:       x   += j_rem;
739:       yy0 += j_rem;
740:       j   -= j_rem;
741:       break;
742:     }
743:     while (j>0) {
744:       sum0 += x[0]*PetscConj(yy0[0]) + x[1]*PetscConj(yy0[1])
745:             + x[2]*PetscConj(yy0[2]) + x[3]*PetscConj(yy0[3]);
746:       yy0+=4;
747:       j -= 4; x+=4;
748:     }
749:     z[0] = sum0;

 751:     ierr = VecRestoreArrayRead(yy[0],&yy0);CHKERRQP(ierr);
752:     break;
753:   case 0:
754:     break;
755:   }
756:   z  += nv_rem;
757:   i  -= nv_rem;
758:   yy += nv_rem;

760:   while (i >0) {
761:     sum0 = 0.;
762:     sum1 = 0.;
763:     sum2 = 0.;
764:     sum3 = 0.;
 765:     ierr = VecGetArrayRead(yy[0],&yy0);CHKERRQP(ierr);
 766:     ierr = VecGetArrayRead(yy[1],&yy1);CHKERRQP(ierr);
 767:     ierr = VecGetArrayRead(yy[2],&yy2);CHKERRQP(ierr);
 768:     ierr = VecGetArrayRead(yy[3],&yy3);CHKERRQP(ierr);

770:     j = n;
771:     x = xbase;
772:     switch (j_rem=j&0x3) {
773:     case 3:
774:       x2 = x[2];
775:       sum0 += x2*PetscConj(yy0[2]); sum1 += x2*PetscConj(yy1[2]);
776:       sum2 += x2*PetscConj(yy2[2]); sum3 += x2*PetscConj(yy3[2]);
777:     case 2:
778:       x1 = x[1];
779:       sum0 += x1*PetscConj(yy0[1]); sum1 += x1*PetscConj(yy1[1]);
780:       sum2 += x1*PetscConj(yy2[1]); sum3 += x1*PetscConj(yy3[1]);
781:     case 1:
782:       x0 = x[0];
783:       sum0 += x0*PetscConj(yy0[0]); sum1 += x0*PetscConj(yy1[0]);
784:       sum2 += x0*PetscConj(yy2[0]); sum3 += x0*PetscConj(yy3[0]);
785:     case 0:
786:       x   += j_rem;
787:       yy0 += j_rem;
788:       yy1 += j_rem;
789:       yy2 += j_rem;
790:       yy3 += j_rem;
791:       j   -= j_rem;
792:       break;
793:     }
794:     while (j>0) {
795:       x0 = x[0];
796:       x1 = x[1];
797:       x2 = x[2];
798:       x3 = x[3];
799:       x += 4;

801:       sum0 += x0*PetscConj(yy0[0]) + x1*PetscConj(yy0[1]) + x2*PetscConj(yy0[2]) + x3*PetscConj(yy0[3]); yy0+=4;
802:       sum1 += x0*PetscConj(yy1[0]) + x1*PetscConj(yy1[1]) + x2*PetscConj(yy1[2]) + x3*PetscConj(yy1[3]); yy1+=4;
803:       sum2 += x0*PetscConj(yy2[0]) + x1*PetscConj(yy2[1]) + x2*PetscConj(yy2[2]) + x3*PetscConj(yy2[3]); yy2+=4;
804:       sum3 += x0*PetscConj(yy3[0]) + x1*PetscConj(yy3[1]) + x2*PetscConj(yy3[2]) + x3*PetscConj(yy3[3]); yy3+=4;
805:       j -= 4;
806:     }
807:     z[0] = sum0;
808:     z[1] = sum1;
809:     z[2] = sum2;
810:     z[3] = sum3;
811:     z   += 4;
812:     i   -= 4;
 813:     ierr = VecRestoreArrayRead(yy[0],&yy0);CHKERRQP(ierr);
 814:     ierr = VecRestoreArrayRead(yy[1],&yy1);CHKERRQP(ierr);
 815:     ierr = VecRestoreArrayRead(yy[2],&yy2);CHKERRQP(ierr);
 816:     ierr = VecRestoreArrayRead(yy[3],&yy3);CHKERRQP(ierr);
817:     yy  += 4;
818:   }
819:   return(0);
820: }

824: PetscErrorCode VecMDot_SeqPThread(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
825: {
826:   PetscErrorCode    ierr;
827:   PetscInt          i,j=0;
828:   Vec               *yy = (Vec *)yin;
829:   const PetscScalar *xbase;

832:   const PetscInt    iNumThreads = PetscMaxThreads;  //this number could be different
833:   PetscInt          n=xin->map->n,Q = nv/(iNumThreads);
834:   PetscInt          R = nv-Q*(iNumThreads);
835:   PetscBool         S;
836:   VecMDot_KernelData* kerneldatap = (VecMDot_KernelData*)malloc(iNumThreads*sizeof(VecMDot_KernelData));
837:   VecMDot_KernelData** pdata = (VecMDot_KernelData**)malloc(iNumThreads*sizeof(VecMDot_KernelData*));
838:   VecGetArrayRead(xin,&xbase);
839:   for (i=0; i<iNumThreads; i++) {
840:     S = (PetscBool)(i<R);
841:     kerneldatap[i].xvalin = xbase;
842:     kerneldatap[i].yavecin = &yy[j];
843:     kerneldatap[i].nelem = n;
844:     kerneldatap[i].ntoproc = S?Q+1:Q;
845:     kerneldatap[i].result = &z[j];
846:     j += kerneldatap[i].ntoproc;
847:     pdata[i] = &kerneldatap[i];
848:   }
849:   MainJob(VecMDot_Kernel,(void**)pdata,iNumThreads);
850:   free(kerneldatap);
851:   free(pdata);
852:   VecRestoreArrayRead(xin,&xbase);
853:   PetscLogFlops(PetscMax(nv*(2.0*xin->map->n-1),0.0));
854:   return(0);
855: }
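/*
   Unlike the other routines in this file, VecMDot_SeqPThread partitions the
   work over the nv column vectors rather than over the vector length: each
   thread forms the full dot product of x with its own contiguous group of
   y vectors and writes the results directly into its slice of z.
*/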

857: void* VecMax_Kernel(void *arg)
858: {
859:   if(PetscUseThreadPool==PETSC_FALSE) {
860:     DoCoreAffinity();
861:   }
862:   VecMax_KernelData *data = (VecMax_KernelData*)arg;
863:   const PetscScalar *xx = data->x;
864:   PetscInt          i,j,n = data->localn;
865:   PetscReal         lmax,tmp;

867: #if defined(PETSC_USE_COMPLEX)
868:   lmax = PetscRealPart(*xx++); j = 0;
869: #else
870:   lmax = *xx++; j = 0;
871: #endif
872:   for (i=1; i<n; i++) {
873: #if defined(PETSC_USE_COMPLEX)
874:     if ((tmp = PetscRealPart(*xx++)) > lmax) { j = i; lmax = tmp;}
875: #else
876:     if ((tmp = *xx++) > lmax) { j = i; lmax = tmp; }
877: #endif
878:   }

880:   data->localmax = lmax;
881:   data->localind = j;
882:   return(0);
883: }

887: PetscErrorCode VecMax_SeqPThread(Vec xin,PetscInt* idx,PetscReal * z)
888: {
889:   PetscInt          i,j=0,n = xin->map->n;
890:   PetscReal         max;
891:   const PetscScalar *xx,*xp;
892:   PetscErrorCode    ierr;

895:   VecGetArrayRead(xin,&xx);
896:   if (!n) {
897:     max = PETSC_MIN_REAL;
898:     j   = -1;
899:   } else {
900:   const PetscInt    iNumThreads = PetscMaxThreads;  //this number could be different
901:   PetscInt          gind,Q = n/(iNumThreads);
902:   PetscInt          R = n-Q*(iNumThreads);
903:   PetscBool         S;
904:   VecMax_KernelData* kerneldatap = (VecMax_KernelData*)malloc(iNumThreads*sizeof(VecMax_KernelData));
905:   VecMax_KernelData** pdata = (VecMax_KernelData**)malloc(iNumThreads*sizeof(VecMax_KernelData*));

907:   gind = 0;
908:   xp = xx;
909:   for (i=0; i<iNumThreads; i++) {
910:     S = (PetscBool)(i<R);
911:     kerneldatap[i].x = xp;
912:     kerneldatap[i].gind = gind;
913:     kerneldatap[i].localn = S?Q+1:Q;
914:     xp += kerneldatap[i].localn; //pointer arithmetic
915:     gind += kerneldatap[i].localn;
916:     pdata[i] = &kerneldatap[i];
917:   }
918:   MainJob(VecMax_Kernel,(void**)pdata,iNumThreads);
919:   //collect results, determine global max, global index
920:   max = kerneldatap[0].localmax;
921:   j   = kerneldatap[0].localind;
922:   for(i=1; i<iNumThreads; i++) {
923:     if(kerneldatap[i].localmax>max) {
924:       max = kerneldatap[i].localmax;
925:       j   = kerneldatap[i].gind+kerneldatap[i].localind;
926:     }
927:   }
928:   free(kerneldatap);
929:   free(pdata);
930:   }
931:   *z   = max;
932:   if (idx) *idx = j;
933:   VecRestoreArrayRead(xin,&xx);
934:   return(0);
935: }

937: void* VecMin_Kernel(void *arg)
938: {
939:   if(PetscUseThreadPool==PETSC_FALSE) {
940:     DoCoreAffinity();
941:   }
942:   VecMin_KernelData *data = (VecMin_KernelData*)arg;
943:   const PetscScalar *xx = data->x;
944:   PetscInt          i,j,n = data->localn;
945:   PetscReal         lmin,tmp;

947: #if defined(PETSC_USE_COMPLEX)
948:   lmin = PetscRealPart(*xx++); j = 0;
949: #else
950:   lmin = *xx++; j = 0;
951: #endif
952:   for (i=1; i<n; i++) {
953: #if defined(PETSC_USE_COMPLEX)
954:     if ((tmp = PetscRealPart(*xx++)) < lmin) { j = i; lmin = tmp;}
955: #else
956:     if ((tmp = *xx++) < lmin) { j = i; lmin = tmp; }
957: #endif
958:   }

960:   data->localmin = lmin;
961:   data->localind = j;
962:   return(0);
963: }

967: PetscErrorCode VecMin_SeqPThread(Vec xin,PetscInt* idx,PetscReal * z)
968: {
969:   PetscInt          i,j=0,n = xin->map->n;
970:   PetscReal         min;
971:   const PetscScalar *xx,*xp;
972:   PetscErrorCode    ierr;

975:   VecGetArrayRead(xin,&xx);
976:   if (!n) {
977:     min = PETSC_MAX_REAL;
978:     j   = -1;
979:   } else {
980:   const PetscInt    iNumThreads = PetscMaxThreads;  //this number could be different
981:   PetscInt          gind,Q = n/(iNumThreads);
982:   PetscInt          R = n-Q*(iNumThreads);
983:   PetscBool         S;
984:   VecMin_KernelData* kerneldatap = (VecMin_KernelData*)malloc(iNumThreads*sizeof(VecMin_KernelData));
985:   VecMin_KernelData** pdata = (VecMin_KernelData**)malloc(iNumThreads*sizeof(VecMin_KernelData*));

987:   gind = 0;
988:   xp = xx;
989:   for (i=0; i<iNumThreads; i++) {
990:     S = (PetscBool)(i<R);
991:     kerneldatap[i].x = xp;
992:     kerneldatap[i].gind = gind;
993:     kerneldatap[i].localn = S?Q+1:Q;
994:     xp += kerneldatap[i].localn; //pointer arithmetic
995:     gind += kerneldatap[i].localn;
996:     pdata[i] = &kerneldatap[i];
997:   }
998:   MainJob(VecMin_Kernel,(void**)pdata,iNumThreads);
 999:   //collect results, determine global min, global index
1000:   min = kerneldatap[0].localmin;
1001:   j   = kerneldatap[0].localind;
1002:   for(i=1; i<iNumThreads; i++) {
1003:     if(kerneldatap[i].localmin<min) {
1004:       min = kerneldatap[i].localmin;
1005:       j   = kerneldatap[i].gind+kerneldatap[i].localind;
1006:     }
1007:   }
1008:   free(kerneldatap);
1009:   free(pdata);
1010:   }
1011:   *z   = min;
1012:   if (idx) *idx = j;
1013:   VecRestoreArrayRead(xin,&xx);
1014:   return(0);
1015: }

1017: void* VecPointwiseMult_Kernel(void *arg)
1018: {
1019:   if(PetscUseThreadPool==PETSC_FALSE) {
1020:     DoCoreAffinity();
1021:   }
1022:   VecPointwiseMult_KernelData *data = (VecPointwiseMult_KernelData*)arg;
1023:   PetscScalar *ww = data->wpin,*xx = data->xpin,*yy = data->ypin;
1024:   PetscInt    n = data->nlocal,i;

1026:   if (ww == xx) {
1027:     for (i=0; i<n; i++) ww[i] *= yy[i];
1028:   } else if (ww == yy) {
1029:     for (i=0; i<n; i++) ww[i] *= xx[i];
1030:   } else {
1031: #if defined(PETSC_USE_FORTRAN_KERNEL_XTIMESY)
1032:     fortranxtimesy_(xx,yy,ww,&n);
1033: #else
1034:     for (i=0; i<n; i++) ww[i] = xx[i] * yy[i];
1035: #endif
1036:   }
1037:   return(0);
1038: }

1040: #include <../src/vec/vec/impls/seq/ftn-kernels/fxtimesy.h>
1043: static PetscErrorCode VecPointwiseMult_SeqPThread(Vec win,Vec xin,Vec yin)
1044: {
1046:   PetscInt       n = win->map->n,i,iIndex;
1047:   PetscScalar    *ww,*xx,*yy; /* cannot make xx or yy const since might be ww */
1048:   const PetscInt    iNumThreads = PetscMaxThreads;  //this number could be different
1049:   PetscInt          Q = n/(iNumThreads);
1050:   PetscInt          R = n-Q*(iNumThreads);
1051:   PetscBool         S;

1053:   VecPointwiseMult_KernelData* kerneldatap = (VecPointwiseMult_KernelData*)malloc(iNumThreads*sizeof(VecPointwiseMult_KernelData));
1054:   VecPointwiseMult_KernelData** pdata = (VecPointwiseMult_KernelData**)malloc(iNumThreads*sizeof(VecPointwiseMult_KernelData*));


1058:   VecGetArrayRead(xin,(const PetscScalar**)&xx);
1059:   VecGetArrayRead(yin,(const PetscScalar**)&yy);
1060:   VecGetArray(win,&ww);

1062:   iIndex = 0;
1063:   for (i=0; i<iNumThreads; i++) {
1064:     S = (PetscBool)(i<R);
1065:     kerneldatap[i].wpin = ww+iIndex;
1066:     kerneldatap[i].xpin = xx+iIndex;
1067:     kerneldatap[i].ypin = yy+iIndex;
1068:     kerneldatap[i].nlocal = S?Q+1:Q;
1069:     iIndex += kerneldatap[i].nlocal;
1070:     pdata[i] = &kerneldatap[i];
1071:   }

1073:   MainJob(VecPointwiseMult_Kernel,(void**)pdata,iNumThreads);
1074:   free(kerneldatap);
1075:   free(pdata);

1077:   VecRestoreArrayRead(xin,(const PetscScalar**)&xx);
1078:   VecRestoreArrayRead(yin,(const PetscScalar**)&yy);
1079:   VecRestoreArray(win,&ww);
1080:   PetscLogFlops(n);
1081:   return(0);
1082: }

1084: void* VecPointwiseDivide_Kernel(void *arg)
1085: {
1086:   if(PetscUseThreadPool==PETSC_FALSE) {
1087:     DoCoreAffinity();
1088:   }
1089:   VecPointwiseDivide_KernelData *data = (VecPointwiseDivide_KernelData*)arg;
1090:   PetscScalar *ww = data->wpin,*xx = data->xpin,*yy = data->ypin;
1091:   PetscInt    n = data->nlocal,i;

1093:   for (i=0; i<n; i++) {
1094:     ww[i] = xx[i] / yy[i];
1095:   }
1096:   return(0);
1097: }

1101: static PetscErrorCode VecPointwiseDivide_SeqPThread(Vec win,Vec xin,Vec yin)
1102: {
1104:   PetscInt       n = win->map->n,i,iIndex;
1105:   PetscScalar    *ww,*xx,*yy; /* cannot make xx or yy const since might be ww */
1106:   const PetscInt    iNumThreads = PetscMaxThreads;  //this number could be different
1107:   PetscInt          Q = n/(iNumThreads);
1108:   PetscInt          R = n-Q*(iNumThreads);
1109:   PetscBool         S;

1111:   VecPointwiseDivide_KernelData* kerneldatap = (VecPointwiseDivide_KernelData*)malloc(iNumThreads*sizeof(VecPointwiseDivide_KernelData));
1112:   VecPointwiseDivide_KernelData** pdata = (VecPointwiseDivide_KernelData**)malloc(iNumThreads*sizeof(VecPointwiseDivide_KernelData*));


1116:   VecGetArrayRead(xin,(const PetscScalar**)&xx);
1117:   VecGetArrayRead(yin,(const PetscScalar**)&yy);
1118:   VecGetArray(win,&ww);

1120:   iIndex = 0;
1121:   for (i=0; i<iNumThreads; i++) {
1122:     S = (PetscBool)(i<R);
1123:     kerneldatap[i].wpin = ww+iIndex;
1124:     kerneldatap[i].xpin = xx+iIndex;
1125:     kerneldatap[i].ypin = yy+iIndex;
1126:     kerneldatap[i].nlocal = S?Q+1:Q;
1127:     iIndex += kerneldatap[i].nlocal;
1128:     pdata[i] = &kerneldatap[i];
1129:   }

1131:   MainJob(VecPointwiseDivide_Kernel,(void**)pdata,iNumThreads);
1132:   free(kerneldatap);
1133:   free(pdata);

1135:   PetscLogFlops(n);
1136:   VecRestoreArrayRead(xin,(const PetscScalar**)&xx);
1137:   VecRestoreArrayRead(yin,(const PetscScalar**)&yy);
1138:   VecRestoreArray(win,&ww);
1139:   return(0);
1140: }

1142: void* VecSwap_Kernel(void *arg)
1143: {
1144:   if(PetscUseThreadPool==PETSC_FALSE) {
1145:     DoCoreAffinity();
1146:   }
1147:   VecSwap_KernelData *data = (VecSwap_KernelData*)arg;
1148:   PetscScalar *xa = data->xpin,*ya = data->ypin;
1149:   PetscBLASInt   one = 1,bn = PetscBLASIntCast(data->nlocal);

1151:   BLASswap_(&bn,xa,&one,ya,&one);
1152:   return(0);
1153: }

1155: #include <petscblaslapack.h>
1158: static PetscErrorCode VecSwap_SeqPThread(Vec xin,Vec yin)
1159: {
1160:   PetscScalar    *ya, *xa;

1164:   if (xin != yin) {
1165:     VecGetArray(xin,&xa);
1166:     VecGetArray(yin,&ya);
1167:     const PetscInt    iNumThreads = PetscMaxThreads;  //this number could be different
1168:     PetscInt          n = xin->map->n,Q = n/(iNumThreads),R = n-Q*(iNumThreads),i,iIndex;
1169:     PetscBool         S;

1171:     VecSwap_KernelData* kerneldatap = (VecSwap_KernelData*)malloc(iNumThreads*sizeof(VecSwap_KernelData));
1172:     VecSwap_KernelData** pdata = (VecSwap_KernelData**)malloc(iNumThreads*sizeof(VecSwap_KernelData*));

1174:     iIndex = 0;
1175:     for (i=0; i<iNumThreads; i++) {
1176:       S = (PetscBool)(i<R);
1177:       kerneldatap[i].xpin = xa+iIndex;
1178:       kerneldatap[i].ypin = ya+iIndex;
1179:       kerneldatap[i].nlocal = S?Q+1:Q;
1180:       iIndex += kerneldatap[i].nlocal;
1181:       pdata[i] = &kerneldatap[i];
1182:     }

1184:     MainJob(VecSwap_Kernel,(void**)pdata,iNumThreads);
1185:     free(kerneldatap);
1186:     free(pdata);
1187:     VecRestoreArray(xin,&xa);
1188:     VecRestoreArray(yin,&ya);
1189:   }
1190:   return(0);
1191: }

1193: void* VecSetRandom_Kernel(void *arg)
1194: {
1195:   if(PetscUseThreadPool==PETSC_FALSE) {
1196:     DoCoreAffinity();
1197:   }
1198:   VecSetRandom_KernelData *data = (VecSetRandom_KernelData*)arg;
1199:   PetscScalar  *xx = data->xpin;
1200:   PetscRandom  r = data->rin;
1201:   PetscInt     i,n = data->nlocal;

1204:   for(i=0; i<n; i++) {
1205:     ierr = PetscRandomGetValue(r,&xx[i]);CHKERRQP(ierr);
1206:   }
1207:   return(0);
1208: }

1212: static PetscErrorCode VecSetRandom_SeqPThread(Vec xin,PetscRandom r)
1213: {
1215:   PetscInt       n = xin->map->n,i;
1216:   PetscScalar    *xx;
1217:   const PetscInt    iNumThreads = PetscMaxThreads;  //this number could be different
1218:   PetscInt          Q = n/(iNumThreads),R = n-Q*(iNumThreads),iIndex;
1219:   PetscBool         S;

1221:   VecSetRandom_KernelData* kerneldatap = (VecSetRandom_KernelData*)malloc(iNumThreads*sizeof(VecSetRandom_KernelData));
1222:   VecSetRandom_KernelData** pdata = (VecSetRandom_KernelData**)malloc(iNumThreads*sizeof(VecSetRandom_KernelData*));

1225:   VecGetArray(xin,&xx);

1227:   iIndex = 0;
1228:   for (i=0; i<iNumThreads; i++) {
1229:     S = (PetscBool)(i<R);
1230:     kerneldatap[i].xpin   = xx+iIndex;
1231:     kerneldatap[i].rin    = r;
1232:     kerneldatap[i].nlocal = S?Q+1:Q;
1233:     iIndex += kerneldatap[i].nlocal;
1234:     pdata[i] = &kerneldatap[i];
1235:    }

1237:   MainJob(VecSetRandom_Kernel,(void**)pdata,iNumThreads);
1238:   free(kerneldatap);
1239:   free(pdata);
1240:   VecRestoreArray(xin,&xx);
1241:   return(0);
1242: }

1244: void* VecCopy_Kernel(void *arg)
1245: {
1246:   if(PetscUseThreadPool==PETSC_FALSE) {
1247:     DoCoreAffinity();
1248:   }
1249:   VecCopy_KernelData *data = (VecCopy_KernelData*)arg;
1250:   const PetscScalar  *xa = data->xpin;
1251:   PetscScalar        *ya = data->ypin;
1252:   PetscInt           n = data->nlocal;

1255:   ierr = PetscMemcpy(ya,xa,n*sizeof(PetscScalar));CHKERRQP(ierr);
1256:   return(0);
1257: }

1261: static PetscErrorCode VecCopy_SeqPThread(Vec xin,Vec yin)
1262: {
1263:   PetscScalar       *ya;
1264:   const PetscScalar *xa;
1265:   PetscErrorCode    ierr;

1268:   if (xin != yin) {
1269:     VecGetArrayRead(xin,&xa);
1270:     VecGetArray(yin,&ya);

1272:   PetscInt       n = xin->map->n,i;
1273:   const PetscInt    iNumThreads = PetscMaxThreads;  //this number could be different
1274:   PetscInt          Q = n/(iNumThreads),R = n-Q*(iNumThreads),iIndex;
1275:   PetscBool         S;

1277:   VecCopy_KernelData* kerneldatap = (VecCopy_KernelData*)malloc(iNumThreads*sizeof(VecCopy_KernelData));
1278:   VecCopy_KernelData** pdata = (VecCopy_KernelData**)malloc(iNumThreads*sizeof(VecCopy_KernelData*));

1280:     iIndex = 0;
1281:     for (i=0; i<iNumThreads; i++) {
1282:       S = (PetscBool)(i<R);
1283:       kerneldatap[i].xpin   = xa+iIndex;
1284:       kerneldatap[i].ypin   = ya+iIndex;
1285:       kerneldatap[i].nlocal = S?Q+1:Q;
1286:       iIndex += kerneldatap[i].nlocal;
1287:       pdata[i] = &kerneldatap[i];
1288:     }
1289:     MainJob(VecCopy_Kernel,(void**)pdata,iNumThreads);
1290:     free(kerneldatap);
1291:     free(pdata);

1293:     VecRestoreArrayRead(xin,&xa);
1294:     VecRestoreArray(yin,&ya);
1295:   }
1296:   return(0);
1297: }

1299: void* VecMAXPY_Kernel(void* arg)
1300: {
1301:   if(PetscUseThreadPool==PETSC_FALSE) {
1302:     DoCoreAffinity();
1303:   }
1304:   VecMAXPY_KernelData *data = (VecMAXPY_KernelData*)arg;
1305:   PetscErrorCode    ierr;
1306:   PetscInt          n = data->nelem,nv=data->ntoproc,ibase=data->ibase,j,j_rem;
1307:   const PetscScalar *alpha=data->amult,*yy0,*yy1,*yy2,*yy3;
1308:   PetscScalar       *xx = data->xavalin,alpha0,alpha1,alpha2,alpha3;
1309:   Vec* y = data->yavecin;

1311: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
1312: #pragma disjoint(*xx,*yy0,*yy1,*yy2,*yy3,*alpha)
1313: #endif

1315:   switch (j_rem=nv&0x3) {
1316:   case 3:
1317:     ierr = VecGetArrayRead(y[0],&yy0);CHKERRQP(ierr);
1318:     ierr = VecGetArrayRead(y[1],&yy1);CHKERRQP(ierr);
1319:     ierr = VecGetArrayRead(y[2],&yy2);CHKERRQP(ierr);
1320:     yy0 += ibase; yy1 += ibase; yy2 += ibase; //pointer arithmetic
1321:     alpha0 = alpha[0];
1322:     alpha1 = alpha[1];
1323:     alpha2 = alpha[2];
1324:     alpha += 3;
1325:     PetscAXPY3(xx,alpha0,alpha1,alpha2,yy0,yy1,yy2,n);
1326:     ierr = VecRestoreArrayRead(y[0],&yy0);CHKERRQP(ierr);
1327:     ierr = VecRestoreArrayRead(y[1],&yy1);CHKERRQP(ierr);
1328:     ierr = VecRestoreArrayRead(y[2],&yy2);CHKERRQP(ierr);
1329:     y     += 3;
1330:     break;
1331:   case 2:
1332:     ierr = VecGetArrayRead(y[0],&yy0);CHKERRQP(ierr);
1333:     ierr = VecGetArrayRead(y[1],&yy1);CHKERRQP(ierr);
1334:     yy0 += ibase; yy1 += ibase;
1335:     alpha0 = alpha[0];
1336:     alpha1 = alpha[1];
1337:     alpha +=2;
1338:     PetscAXPY2(xx,alpha0,alpha1,yy0,yy1,n);
1339:     ierr = VecRestoreArrayRead(y[0],&yy0);CHKERRQP(ierr);
1340:     ierr = VecRestoreArrayRead(y[1],&yy1);CHKERRQP(ierr);
1341:     y     +=2;
1342:     break;
1343:   case 1:
1344:     ierr = VecGetArrayRead(y[0],&yy0);CHKERRQP(ierr);
1345:     yy0 += ibase;
1346:     alpha0 = *alpha++;
1347:     PetscAXPY(xx,alpha0,yy0,n);
1348:     ierr = VecRestoreArrayRead(y[0],&yy0);CHKERRQP(ierr);
1349:     y     +=1;
1350:     break;
1351:   }
1352:   for (j=j_rem; j<nv; j+=4) {
1353:     ierr = VecGetArrayRead(y[0],&yy0);CHKERRQP(ierr);
1354:     ierr = VecGetArrayRead(y[1],&yy1);CHKERRQP(ierr);
1355:     ierr = VecGetArrayRead(y[2],&yy2);CHKERRQP(ierr);
1356:     ierr = VecGetArrayRead(y[3],&yy3);CHKERRQP(ierr);
1357:     yy0 += ibase; yy1 += ibase; yy2 += ibase; yy3 += ibase;
1358:     alpha0 = alpha[0];
1359:     alpha1 = alpha[1];
1360:     alpha2 = alpha[2];
1361:     alpha3 = alpha[3];
1362:     alpha  += 4;

1364:     PetscAXPY4(xx,alpha0,alpha1,alpha2,alpha3,yy0,yy1,yy2,yy3,n);
1365:     ierr = VecRestoreArrayRead(y[0],&yy0);CHKERRQP(ierr);
1366:     ierr = VecRestoreArrayRead(y[1],&yy1);CHKERRQP(ierr);
1367:     ierr = VecRestoreArrayRead(y[2],&yy2);CHKERRQP(ierr);
1368:     ierr = VecRestoreArrayRead(y[3],&yy3);CHKERRQP(ierr);
1369:     y      += 4;
1370:   }
1371:   return(0);
1372: }

1376: PetscErrorCode VecMAXPY_SeqPThread(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
1377: {
1378:   PetscErrorCode    ierr;
1379:   PetscInt          n = xin->map->n,i,j=0;
1380:   PetscScalar       *xx;

1382:   const PetscInt    iNumThreads = PetscMaxThreads;  //this number could be different
1383:   PetscInt          Q = n/(iNumThreads),R = n-Q*(iNumThreads);
1384:   //PetscInt          K = nv / 4; /*how many groups of 4 are present */
1385:   //PetscInt          Q = K / iNumThreads; /* how many groups of 4 to give to each thread */
1386:   //PetscInt          R = nv - Q*iNumThreads*4;
1387:   PetscBool         S;
1388:   VecMAXPY_KernelData* kerneldatap = (VecMAXPY_KernelData*)malloc(iNumThreads*sizeof(VecMAXPY_KernelData));
1389:   VecMAXPY_KernelData** pdata = (VecMAXPY_KernelData**)malloc(iNumThreads*sizeof(VecMAXPY_KernelData*));

1392:   PetscLogFlops(nv*2.0*n);
1393:   VecGetArray(xin,&xx);
1394:   for(i=0;i<nv;i++) {
1395:     if(y[i]->petscnative!=PETSC_TRUE) {
1396:       printf("Non PETSC Native Vector!\n");
1397:     }
1398:   }
1399:   for (i=0; i<iNumThreads; i++) {
1400:     S = (PetscBool)(i<R);
1401:     kerneldatap[i].xavalin = xx+j;
1402:     kerneldatap[i].yavecin = &y[0];
1403:     kerneldatap[i].amult   = &alpha[0];
1404:     kerneldatap[i].nelem   = S?Q+1:Q;
1405:     kerneldatap[i].ntoproc = nv;
1406:     kerneldatap[i].ibase   = j;
1407:     j += kerneldatap[i].nelem;
1408:     pdata[i] = &kerneldatap[i];
1409:   }
1410:   MainJob(VecMAXPY_Kernel,(void**)pdata,iNumThreads);
1411:   free(kerneldatap);
1412:   free(pdata);

1414:   VecRestoreArray(xin,&xx);
1415:   return(0);
1416: }
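/*
   VecMAXPY_SeqPThread splits over the vector length again: every thread sees
   all nv input vectors but updates only a slice of x, and ibase records where
   that slice starts so the kernel can offset each y array to match.
*/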

1418: void* VecSet_Kernel(void *arg)
1419: {
1420:   if(PetscUseThreadPool==PETSC_FALSE) {
1421:     DoCoreAffinity();
1422:   }
1423:   VecSet_KernelData *data = (VecSet_KernelData*)arg;
1424:   PetscScalar        *xx = data->xpin;
1425:   PetscScalar        alpha = data->alphain;
1426:   PetscInt           i,n = data->nelem;

1429:   if (alpha == (PetscScalar)0.0) {
1430:     ierr = PetscMemzero(xx,n*sizeof(PetscScalar));CHKERRQP(ierr);
1431:   } else {
1432:     for (i=0; i<n; i++) xx[i] = alpha;
1433:   }
1434:   return(0);
1435: }

1439: PetscErrorCode VecSet_SeqPThread(Vec xin,PetscScalar alpha)
1440: {
1441:   PetscInt       i,n = xin->map->n;
1442:   PetscScalar    *xx;

1446:   VecGetArray(xin,&xx);
1447:   const PetscInt    iNumThreads = PetscMaxThreads;  //this number could be different
1448:   PetscInt          Q = n/(iNumThreads),R = n-Q*(iNumThreads),iIndex;
1449:   PetscBool         S;

1451:   VecSet_KernelData* kerneldatap = (VecSet_KernelData*)malloc(iNumThreads*sizeof(VecSet_KernelData));
1452:   VecSet_KernelData** pdata = (VecSet_KernelData**)malloc(iNumThreads*sizeof(VecSet_KernelData*));

1454:   iIndex = 0;
1455:   for (i=0; i<iNumThreads; i++) {
1456:     S = (PetscBool)(i<R);
1457:     kerneldatap[i].xpin   = xx+iIndex;
1458:     kerneldatap[i].alphain   = alpha;
1459:     kerneldatap[i].nelem = S?Q+1:Q;
1460:     iIndex += kerneldatap[i].nelem;
1461:     pdata[i] = &kerneldatap[i];
1462:   }
1463:   MainJob(VecSet_Kernel,(void**)pdata,iNumThreads);
1464:   free(kerneldatap);
1465:   free(pdata);
1466:   VecRestoreArray(xin,&xx);
1467:   return(0);
1468: }

1472: PetscErrorCode VecDestroy_SeqPThread(Vec v)
1473: {

1477:   VecDestroy_Seq(v);
1478:   return(0);
1479: }

1481: void* DoCoreAffinity(void)
1482: {
1483:   int i,icorr=0; cpu_set_t mset;
1484:   pthread_t pThread = pthread_self();
1485:   for(i=0; i<PetscMaxThreads; i++) {
1486:     if(pthread_equal(pThread,PetscThreadPoint[i])) {
1487:       icorr = ThreadCoreAffinity[i];
1488:     }
1489:   }
1490:   CPU_ZERO(&mset);
1491:   CPU_SET(icorr,&mset);
1492:   sched_setaffinity(0,sizeof(cpu_set_t),&mset);
1493:   return(0);
1494: }
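/*
   DoCoreAffinity() pins the calling worker to the core recorded for it in
   ThreadCoreAffinity[]; passing pid 0 to sched_setaffinity() means "the
   calling thread" under Linux/NPTL, so only the thread running the kernel is
   affected.  The kernels above call it only when PetscUseThreadPool is
   PETSC_FALSE, i.e. when no thread pool is in use.
*/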

1499: PetscErrorCode  VecCreate_SeqPThread(Vec V)
1500: {
1502:   PetscMPIInt    size;
1503:   PetscScalar    *array;
1504:   PetscInt       n = PetscMax(V->map->n,V->map->N);

1507:   MPI_Comm_size(((PetscObject)V)->comm,&size);
1508:   if  (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQTHREAD on more than one process");
1509:   PetscMalloc(n*sizeof(PetscScalar),&array);
1510:   PetscLogObjectMemory(V, n*sizeof(PetscScalar));
1511:   PetscMemzero(array,n*sizeof(PetscScalar));
1512:   VecCreate_Seq_Private(V,array);
1513:   PetscObjectChangeTypeName((PetscObject)V,VECSEQPTHREAD);
1514:   V->ops->dot             = VecDot_SeqPThread;
1515:   V->ops->mdot            = VecMDot_SeqPThread;
1516:   V->ops->scale           = VecScale_SeqPThread;
1517:   V->ops->axpy            = VecAXPY_SeqPThread;
1518:   V->ops->aypx            = VecAYPX_SeqPThread;
1519:   V->ops->waxpy           = VecWAXPY_SeqPThread;
1520:   V->ops->norm            = VecNorm_SeqPThread;
1521:   V->ops->max             = VecMax_SeqPThread;
1522:   V->ops->min             = VecMin_SeqPThread;
1523:   V->ops->pointwisemult   = VecPointwiseMult_SeqPThread;
1524:   V->ops->pointwisedivide = VecPointwiseDivide_SeqPThread;
1525:   V->ops->swap            = VecSwap_SeqPThread;
1526:   V->ops->setrandom       = VecSetRandom_SeqPThread;
1527:   V->ops->copy            = VecCopy_SeqPThread;
1528:   V->ops->maxpy           = VecMAXPY_SeqPThread;
1529:   V->ops->set             = VecSet_SeqPThread;
1530:   VecSet(V,0);
1531:   return(0);
1532: }
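/*
   Minimal usage sketch (assuming PETSc has been initialized and ierr is a
   PetscErrorCode; the type could equally be requested at runtime through
   VecSetFromOptions()):

       Vec x;
       ierr = VecCreate(PETSC_COMM_SELF,&x);CHKERRQ(ierr);
       ierr = VecSetSizes(x,PETSC_DECIDE,100);CHKERRQ(ierr);
       ierr = VecSetType(x,VECSEQPTHREAD);CHKERRQ(ierr);
       ierr = VecSet(x,1.0);CHKERRQ(ierr);
*/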

1538: PetscErrorCode  VecCreate_PThread(Vec v)
1539: {
1541:   PetscMPIInt    size;

1544:   MPI_Comm_size(((PetscObject)v)->comm,&size);
1545:   if (size == 1) {
1546:     VecSetType(v,VECSEQPTHREAD);
1547:   } else SETERRQ(((PetscObject)v)->comm,PETSC_ERR_SUP,"No parallel thread vector yet");
1548:   return(0);
1549: }