Actual source code: veccusp.cu

  2: /*
  3:    Implements the sequential cusp vectors.
  4: */

  6: #include <petscconf.h>
  8: #include <private/vecimpl.h>          /*I "petscvec.h" I*/
  9: #include <../src/vec/vec/impls/dvecimpl.h>
 11: #include <../src/vec/vec/impls/seq/seqcusp/cuspvecimpl.h>

 15: static PetscErrorCode VecCopy_Seq(Vec xin,Vec yin)
 16: {
 17:   PetscScalar       *ya;
 18:   const PetscScalar *xa;
 19:   PetscErrorCode    ierr;

 22:   if (xin != yin) {
 23:     VecGetArrayRead(xin,&xa);
 24:     VecGetArray(yin,&ya);
 25:     PetscMemcpy(ya,xa,xin->map->n*sizeof(PetscScalar));
 26:     VecRestoreArrayRead(xin,&xa);
 27:     VecRestoreArray(yin,&ya);
 28:   }
 29:   return(0);
 30: }

 34: static PetscErrorCode VecSetRandom_Seq(Vec xin,PetscRandom r)
 35: {
 37:   PetscInt       n = xin->map->n,i;
 38:   PetscScalar    *xx;

 41:   VecGetArray(xin,&xx);
 42:   for (i=0; i<n; i++) {PetscRandomGetValue(r,&xx[i]);}
 43:   VecRestoreArray(xin,&xx);
 44:   return(0);
 45: }

 49: static PetscErrorCode VecDestroy_Seq(Vec v)
 50: {
 51:   Vec_Seq        *vs = (Vec_Seq*)v->data;

 55:   PetscObjectDepublish(v);

 57: #if defined(PETSC_USE_LOG)
 58:   PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
 59: #endif
 60:   PetscFree(vs->array_allocated);
 61:   PetscFree(vs);
 62:   return(0);
 63: }

 67: static PetscErrorCode VecResetArray_Seq(Vec vin)
 68: {
 69:   Vec_Seq *v = (Vec_Seq *)vin->data;

 72:   v->array         = v->unplacedarray;
 73:   v->unplacedarray = 0;
 74:   return(0);
 75: }

 77: /* these following 3 public versions are necessary because we use CUSP in the regular PETSc code and these need to be called from plain C code. */
 80: PetscErrorCode VecCUSPAllocateCheck_Public(Vec v)
 81: {

 85:   VecCUSPAllocateCheck(v);
 86:   return(0);
 87: }

 91: PetscErrorCode VecCUSPCopyToGPU_Public(Vec v)
 92: {

 96:   VecCUSPCopyToGPU(v);
 97:   return(0);
 98: }

/* Index list held twice: one CPU-side container and one GPU-side container
   (both are filled from the same input by PetscCUSPIndicesCreate()) */
struct _p_PetscCUSPIndices {
  CUSPINTARRAYCPU indicesCPU;   /* CPU-side copy of the indices */
  CUSPINTARRAYGPU indicesGPU;   /* GPU-side copy of the indices */
};


/*
    PetscCUSPIndicesCreate - creates the index object consumed by VecCUSPCopyToGPUSome_Public()
                             and VecCUSPCopyFromGPUSome_Public()

   Input Parameters:
+    n - the number of indices
-    indices - integer list of indices (n entries are read)

   Output Parameter:
.    ci - the new PetscCUSPIndices object, holding the list in both a CPU and a GPU container

.seealso: PetscCUSPIndicesDestroy(), VecCUSPCopyToGPUSome_Public()
*/
PetscErrorCode PetscCUSPIndicesCreate(PetscInt n,const PetscInt *indices,PetscCUSPIndices *ci)
{
  const PetscInt   *last  = indices + n;
  PetscCUSPIndices handle = new struct _p_PetscCUSPIndices;

  /* the same input range populates both containers */
  handle->indicesCPU.assign(indices,last);
  handle->indicesGPU.assign(indices,last);
  *ci = handle;
  return(0);
}

/*
    PetscCUSPIndicesDestroy - destroys an index object created with PetscCUSPIndicesCreate()

   Input Parameters:
.    ci - address of the PetscCUSPIndices object; set to 0 on return. A null address is ignored.

.seealso: PetscCUSPIndicesCreate(), VecCUSPCopyToGPUSome_Public()
*/
PetscErrorCode PetscCUSPIndicesDestroy(PetscCUSPIndices *ci)
{
  if (ci) {
    try {
      delete *ci;
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    /* clear the caller's handle so it cannot be reused */
    *ci = 0;
  }
  return(0);
}

/*
    VecCUSPCopyToGPUSome_Public - Copies certain entries down to the GPU from the CPU of a vector

   Input Parameters:
+    v - the vector
-    ci - the requested indices, this should be created with PetscCUSPIndicesCreate()

   Notes:
   The result of VecCUSPCopyToGPUSome() is stored in ierr before it is checked; the
   check previously read an uninitialized variable.
*/
PetscErrorCode VecCUSPCopyToGPUSome_Public(Vec v, PetscCUSPIndices ci)
{
  PetscErrorCode   ierr;

  ierr = VecCUSPCopyToGPUSome(v,&ci->indicesCPU,&ci->indicesGPU);CHKERRCUSP(ierr);
  return(0);
}



/*@C
     VecCUSPCopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU

   Input Parameter:
.    v - the vector

   Notes:
   A host array of v->map->n scalars is allocated first if the vector has none.
   The copy only happens when valid_GPU_array is PETSC_CUSP_GPU; afterwards the
   flag is set to PETSC_CUSP_BOTH.
@*/
PetscErrorCode VecCUSPCopyFromGPU(Vec v)
{
  PetscErrorCode ierr;
  CUSPARRAY      *GPUvector;
  PetscScalar    *array;
  Vec_Seq        *s = (Vec_Seq*)v->data;
  PetscInt       n = v->map->n;

  if (s->array == 0){
    ierr               = PetscMalloc(n*sizeof(PetscScalar),&array);CHKERRQ(ierr);
    ierr               = PetscLogObjectMemory(v,n*sizeof(PetscScalar));CHKERRQ(ierr);
    s->array           = array;
    s->array_allocated = array;
  }
  if (v->valid_GPU_array == PETSC_CUSP_GPU){
    GPUvector = ((Vec_CUSP*)v->spptr)->GPUarray;
    ierr = PetscLogEventBegin(VEC_CUSPCopyFromGPU,v,0,0,0);CHKERRQ(ierr);
    try{
      /* write through the named host array rather than a cast of v->data */
      thrust::copy(GPUvector->begin(),GPUvector->end(),s->array);
      ierr = WaitForGPU();CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = PetscLogEventEnd(VEC_CUSPCopyFromGPU,v,0,0,0);CHKERRQ(ierr);
    v->valid_GPU_array = PETSC_CUSP_BOTH;
  }
  return(0);
}

/*
   VecCUSPCopyFromGPUSome - Copies selected entries of a vector from the GPU to the CPU

   Input Parameters:
+  v          - the vector
.  indicesCPU - the entries to write, as a CPU-side index list
-  indicesGPU - the same entries, as a GPU-side index list

   Notes:
   This function only copies *some* of the values up from the GPU to CPU,
   which means that we need to recombine the data at some point before using any of the standard functions.
   We could add another few flag-types to keep track of this, or treat things like VecGetArray VecRestoreArray
   where you have to always call in pairs.

   valid_GPU_array is deliberately left unchanged. The copy is only performed when
   the flag is PETSC_CUSP_GPU.
*/
PetscErrorCode VecCUSPCopyFromGPUSome(Vec v,CUSPINTARRAYCPU *indicesCPU,CUSPINTARRAYGPU *indicesGPU)
{
  PetscErrorCode ierr;
  Vec_Seq        *s;
  PetscInt       n = v->map->n;
  PetscScalar    *array;
  CUSPARRAY      *varray;

  ierr = VecCUSPAllocateCheck(v);CHKERRCUSP(ierr);
  s = (Vec_Seq*)v->data;
  if (s->array == 0){
    ierr               = PetscMalloc(n*sizeof(PetscScalar),&array);CHKERRQ(ierr);
    ierr               = PetscLogObjectMemory(v,n*sizeof(PetscScalar));CHKERRQ(ierr);
    s->array           = array;
    s->array_allocated = array;
  }
  if (v->valid_GPU_array == PETSC_CUSP_GPU) {
    ierr = PetscLogEventBegin(VEC_CUSPCopyFromGPUSome,v,0,0,0);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(v,&varray);CHKERRQ(ierr);
    try {
      /* gather GPU entries at indicesGPU and scatter them to the host array at indicesCPU */
      thrust::copy(
                   thrust::make_permutation_iterator(varray->begin(),indicesGPU->begin()),
                   thrust::make_permutation_iterator(varray->begin(),indicesGPU->end()),
                   thrust::make_permutation_iterator(s->array,indicesCPU->begin()));
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = VecCUSPRestoreArrayRead(v,&varray);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(VEC_CUSPCopyFromGPUSome,v,0,0,0);CHKERRQ(ierr);
  }
  /*v->valid_GPU_array = PETSC_CUSP_CPU; */
  return(0);
}

/*
  VecCUSPCopyFromGPUSome_Public - Copies certain entries up to the CPU from the GPU of a vector

  Input Parameters:
+    v - the vector
-    ci - the requested indices, this should be created with PetscCUSPIndicesCreate()

  Notes:
  The result of VecCUSPCopyFromGPUSome() is stored in ierr before it is checked.
*/
PetscErrorCode VecCUSPCopyFromGPUSome_Public(Vec v, PetscCUSPIndices ci)
{
  PetscErrorCode ierr;

  ierr = VecCUSPCopyFromGPUSome(v,&ci->indicesCPU,&ci->indicesGPU);CHKERRCUSP(ierr);
  return(0);
}


269: /*MC
270:    VECSEQCUSP - VECSEQCUSP = "seqcusp" - The basic sequential vector, modified to use CUSP

272:    Options Database Keys:
273: . -vec_type seqcusp - sets the vector type to VECSEQCUSP during a call to VecSetFromOptions()

275:   Level: beginner

277: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
278: M*/

/* AYPX (y <- alpha*y + x) added to cusp::blas; used by VecAYPX_SeqCUSP */
namespace cusp
{
namespace blas
{
namespace detail
{
  /* Binary functor: given (x,y) returns alpha*y + x for the stored scalar alpha */
  template <typename T>
  struct AYPX : public thrust::binary_function<T,T,T>
  {
    T alpha;

    AYPX(T a) : alpha(a) {}

    __host__ __device__
    T operator()(T x, T y)
    {
      return alpha * y + x;
    }
  };
} /* namespace detail */

/* Iterator form: transforms [first1,last1) together with the range starting at
   first2, writing the result back over the range starting at first2 */
template <typename ForwardIterator1, typename ForwardIterator2, typename ScalarType>
void aypx(ForwardIterator1 first1,ForwardIterator1 last1,ForwardIterator2 first2,ScalarType alpha)
{
  detail::AYPX<ScalarType> op(alpha);

  thrust::transform(first1,last1,first2,first2,op);
}

/* Array form: y <- alpha*y + x after asserting x and y have the same dimensions */
template <typename Array1, typename Array2, typename ScalarType>
void aypx(const Array1& x, Array2& y, ScalarType alpha)
{
  detail::assert_same_dimensions(x,y);
  aypx(x.begin(),x.end(),y.begin(),alpha);
}
} /* namespace blas */
} /* namespace cusp */

/*
   VecAYPX_SeqCUSP - Computes y = x + alpha*y on the GPU.

   Input Parameters:
+  yin   - the vector that is scaled and overwritten
.  alpha - the scalar multiplying y
-  xin   - the vector that is added

   Notes:
   For alpha == 0 the result is y = x, so x is copied into y. Previously this case
   left y unchanged. 2*n flops are logged only when the scaled update is executed.
*/
PetscErrorCode VecAYPX_SeqCUSP(Vec yin, PetscScalar alpha, Vec xin)
{
  PetscErrorCode ierr;
  CUSPARRAY      *xarray,*yarray;

  if (alpha != 0.0) {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
    try{
      cusp::blas::aypx(*xarray,*yarray,alpha);
      ierr = WaitForGPU();CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
    ierr = PetscLogFlops(2.0*yin->map->n);CHKERRQ(ierr);
  } else {
    /* y = x + 0*y reduces to a plain copy of x into y */
    ierr = VecCopy_SeqCUSP(xin,yin);CHKERRQ(ierr);
  }
  return(0);
}


/*
   VecAXPY_SeqCUSP - Computes y = y + alpha*x on the GPU.

   Input Parameters:
+  yin   - the vector that is updated
.  alpha - the scalar multiplying x
-  xin   - the vector that is added

   Notes:
   Nothing is done (and no flops are logged) when alpha == 0.
   All return codes are checked and propagated.
*/
PetscErrorCode VecAXPY_SeqCUSP(Vec yin,PetscScalar alpha,Vec xin)
{
  PetscErrorCode ierr;
  CUSPARRAY      *xarray,*yarray;

  if (alpha != 0.0) {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
    try {
      cusp::blas::axpy(*xarray,*yarray,alpha);
      ierr = WaitForGPU();CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
    ierr = PetscLogFlops(2.0*yin->map->n);CHKERRQ(ierr);
  }
  return(0);
}

/* Tuple functor for pointwise division: element 0 receives element 1 divided by element 2 */
struct VecCUSPPointwiseDivide
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t) / thrust::get<2>(t);
  }
};

/*
   VecPointwiseDivide_SeqCUSP - Computes w[i] = x[i] / y[i] on the GPU.

   Input Parameters:
+  xin - the numerator vector
-  yin - the denominator vector

   Output Parameter:
.  win - the result vector

   Notes:
   n flops are logged. All return codes are checked and propagated.
*/
PetscErrorCode VecPointwiseDivide_SeqCUSP(Vec win, Vec xin, Vec yin)
{
  PetscErrorCode ierr;
  CUSPARRAY      *warray,*xarray,*yarray;

  ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(win,&warray);CHKERRQ(ierr);
  try{
    /* zip (w,x,y) so the functor sees one tuple per entry */
    thrust::for_each(
        thrust::make_zip_iterator(
            thrust::make_tuple(
                warray->begin(),
                xarray->begin(),
                yarray->begin())),
        thrust::make_zip_iterator(
            thrust::make_tuple(
                warray->end(),
                xarray->end(),
                yarray->end())),
        VecCUSPPointwiseDivide());
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = PetscLogFlops(win->map->n);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(win,&warray);CHKERRQ(ierr);
  return(0);
}


/* Tuple functor: element 0 receives element 1 + element 2 * element 3 */
struct VecCUSPWAXPY
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t)*thrust::get<3>(t);
  }
};

/* Tuple functor: element 0 receives element 1 + element 2 */
struct VecCUSPSum
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t);
  }
};

/* Tuple functor: element 0 receives element 1 - element 2 */
struct VecCUSPDiff
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t) - thrust::get<2>(t);
  }
};

/*
   VecWAXPY_SeqCUSP - Computes w = y + alpha*x on the GPU.

   Input Parameters:
+  alpha - the scalar multiplying x
.  xin   - the scaled vector
-  yin   - the vector that is added

   Output Parameter:
.  win - the result vector

   Notes:
   alpha == 0 reduces to a copy of y into w. alpha == 1 and alpha == -1 use the
   multiplication-free sum/difference functors and log n flops; any other alpha
   logs 2*n flops. All return codes are checked and propagated.
*/
PetscErrorCode VecWAXPY_SeqCUSP(Vec win,PetscScalar alpha,Vec xin, Vec yin)
{
  PetscErrorCode ierr;
  PetscLogDouble flops;
  CUSPARRAY      *xarray,*yarray,*warray;

  if (alpha == 0.0) {
    ierr = VecCopy_SeqCUSP(yin,win);CHKERRQ(ierr);
  } else {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yin,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(win,&warray);CHKERRQ(ierr);
    try {
      if (alpha == 1.0) {
        /* w = y + x */
        thrust::for_each(
          thrust::make_zip_iterator(
            thrust::make_tuple(
              warray->begin(),
              yarray->begin(),
              xarray->begin())),
          thrust::make_zip_iterator(
            thrust::make_tuple(
              warray->end(),
              yarray->end(),
              xarray->end())),
          VecCUSPSum());
        flops = 1.0*win->map->n;
      } else if (alpha == -1.0) {
        /* w = y - x */
        thrust::for_each(
          thrust::make_zip_iterator(
            thrust::make_tuple(
              warray->begin(),
              yarray->begin(),
              xarray->begin())),
          thrust::make_zip_iterator(
            thrust::make_tuple(
              warray->end(),
              yarray->end(),
              xarray->end())),
          VecCUSPDiff());
        flops = 1.0*win->map->n;
      } else {
        /* w = y + alpha*x, alpha supplied through a constant iterator */
        thrust::for_each(
          thrust::make_zip_iterator(
            thrust::make_tuple(
              warray->begin(),
              yarray->begin(),
              thrust::make_constant_iterator(alpha),
              xarray->begin())),
          thrust::make_zip_iterator(
            thrust::make_tuple(
              warray->end(),
              yarray->end(),
              thrust::make_constant_iterator(alpha),
              xarray->end())),
          VecCUSPWAXPY());
        flops = 2.0*win->map->n;
      }
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = PetscLogFlops(flops);CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(win,&warray);CHKERRQ(ierr);
  }
  return(0);
}

/* These functors are for the CUSP implementation of MAXPY with the loop unrolled on the CPU */

/* Four-term update on a 9-element tuple (y,a1,x1,a2,x2,a3,x3,a4,x4) */
struct VecCUSPMAXPY4
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    /* y += a1*x1 + a2*x2 + a3*x3 + a4*x4 */
    thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t)+thrust::get<7>(t)*thrust::get<8>(t);
  }
};


/* Three-term update on a 7-element tuple (y,a1,x1,a2,x2,a3,x3) */
struct VecCUSPMAXPY3
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    /* y += a1*x1 + a2*x2 + a3*x3 */
    thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t);
  }
};

/* Two-term update on a 5-element tuple (y,a1,x1,a2,x2) */
struct VecCUSPMAXPY2
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    /* y += a1*x1 + a2*x2 */
    thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t);
  }
};
/*
   VecMAXPY_SeqCUSP - Computes x = x + sum_i alpha[i]*y[i] on the GPU.

   Input Parameters:
+  xin   - the vector that is updated
.  nv    - the number of vectors in y (and scalars in alpha)
.  alpha - the nv scalars
-  y     - the nv vectors

   Notes:
   The first nv mod 4 vectors are handled by a 1-, 2- or 3-term update; the rest
   are processed four at a time. 2*nv*n flops are logged.

   The two-vector remainder case now restores read access to y[0] and y[1], as the
   three- and four-vector paths already did. All return codes are checked.
*/
PetscErrorCode VecMAXPY_SeqCUSP(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
{
  PetscErrorCode    ierr;
  CUSPARRAY         *xarray,*yy0,*yy1,*yy2,*yy3;
  PetscInt          n = xin->map->n,j,j_rem;
  PetscScalar       alpha0,alpha1,alpha2,alpha3;

  ierr = PetscLogFlops(nv*2.0*n);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
  switch (j_rem=nv&0x3) {
  case 3:
    alpha0 = alpha[0];
    alpha1 = alpha[1];
    alpha2 = alpha[2];
    alpha += 3;
    ierr = VecCUSPGetArrayRead(y[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(y[1],&yy1);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(y[2],&yy2);CHKERRQ(ierr);
    try {
      thrust::for_each(
        thrust::make_zip_iterator(
            thrust::make_tuple(
                xarray->begin(),
                thrust::make_constant_iterator(alpha0),
                yy0->begin(),
                thrust::make_constant_iterator(alpha1),
                yy1->begin(),
                thrust::make_constant_iterator(alpha2),
                yy2->begin())),
        thrust::make_zip_iterator(
            thrust::make_tuple(
                xarray->end(),
                thrust::make_constant_iterator(alpha0),
                yy0->end(),
                thrust::make_constant_iterator(alpha1),
                yy1->end(),
                thrust::make_constant_iterator(alpha2),
                yy2->end())),
        VecCUSPMAXPY3());
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = VecCUSPRestoreArrayRead(y[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(y[1],&yy1);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(y[2],&yy2);CHKERRQ(ierr);
    y     += 3;
    break;
  case 2:
    alpha0 = alpha[0];
    alpha1 = alpha[1];
    alpha +=2;
    ierr = VecCUSPGetArrayRead(y[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(y[1],&yy1);CHKERRQ(ierr);
    try {
      thrust::for_each(
        thrust::make_zip_iterator(
            thrust::make_tuple(
                xarray->begin(),
                thrust::make_constant_iterator(alpha0),
                yy0->begin(),
                thrust::make_constant_iterator(alpha1),
                yy1->begin())),
        thrust::make_zip_iterator(
            thrust::make_tuple(
                xarray->end(),
                thrust::make_constant_iterator(alpha0),
                yy0->end(),
                thrust::make_constant_iterator(alpha1),
                yy1->end())),
        VecCUSPMAXPY2());
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    /* release the read access taken above; this pair was missing */
    ierr = VecCUSPRestoreArrayRead(y[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(y[1],&yy1);CHKERRQ(ierr);
    y     +=2;
    break;
  case 1:
    alpha0 = *alpha++;
    ierr = VecAXPY_SeqCUSP(xin,alpha0,y[0]);CHKERRQ(ierr);
    y     +=1;
    break;
  }
  /* remaining vectors, four at a time */
  for (j=j_rem; j<nv; j+=4) {
    alpha0 = alpha[0];
    alpha1 = alpha[1];
    alpha2 = alpha[2];
    alpha3 = alpha[3];
    alpha  += 4;
    ierr = VecCUSPGetArrayRead(y[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(y[1],&yy1);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(y[2],&yy2);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(y[3],&yy3);CHKERRQ(ierr);
    try {
      thrust::for_each(
        thrust::make_zip_iterator(
            thrust::make_tuple(
                xarray->begin(),
                thrust::make_constant_iterator(alpha0),
                yy0->begin(),
                thrust::make_constant_iterator(alpha1),
                yy1->begin(),
                thrust::make_constant_iterator(alpha2),
                yy2->begin(),
                thrust::make_constant_iterator(alpha3),
                yy3->begin())),
        thrust::make_zip_iterator(
            thrust::make_tuple(
                xarray->end(),
                thrust::make_constant_iterator(alpha0),
                yy0->end(),
                thrust::make_constant_iterator(alpha1),
                yy1->end(),
                thrust::make_constant_iterator(alpha2),
                yy2->end(),
                thrust::make_constant_iterator(alpha3),
                yy3->end())),
        VecCUSPMAXPY4());
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = VecCUSPRestoreArrayRead(y[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(y[1],&yy1);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(y[2],&yy2);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(y[3],&yy3);CHKERRQ(ierr);
    y      += 4;
  }
  ierr = VecCUSPRestoreArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  return(0);
}


/*
   VecDot_SeqCUSP - Computes the dot product of two vectors on the GPU.

   Input Parameters:
+  xin - the first vector
-  yin - the second vector

   Output Parameter:
.  z - the result of cusp::blas::dot(x,y)

   Notes:
   Complex scalars are not supported. A complex build now returns PETSC_ERR_SUP
   instead of leaving *z unset and restoring arrays that were never obtained.
   2*n-1 flops are logged when n > 0.
*/
PetscErrorCode VecDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
{
#if defined(PETSC_USE_COMPLEX)
  /*Not working for complex*/
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"VecDot_SeqCUSP is not implemented for complex scalars");
#else
  PetscErrorCode ierr;
  CUSPARRAY      *xarray,*yarray;

  ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(yin,&yarray);CHKERRQ(ierr);
  try {
    *z = cusp::blas::dot(*xarray,*yarray);
  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  if (xin->map->n >0) {
    ierr = PetscLogFlops(2.0*xin->map->n-1);CHKERRQ(ierr);
  }
  ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
  return(0);
#endif
}

723: /*The following few template functions are for VecMDot_SeqCUSP*/

/* Unary functor for a 2-way multi-dot: maps (x,y0,y1) to (x*y0, x*y1) */
template <typename T1,typename T2>
struct cuspmult2 : thrust::unary_function<T1,T2>
{
  __host__ __device__
  T2 operator()(T1 x)
  {
    return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x),
                              thrust::get<0>(x)*thrust::get<2>(x));
  }
};

/* Binary functor: component-wise sum of two 2-element tuples */
template <typename T>
struct cuspadd2 : thrust::binary_function<T,T,T>
{
  __host__ __device__
  T operator()(T x,T y)
  {
    return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y),
                              thrust::get<1>(x)+thrust::get<1>(y));
  }
};

/* Unary functor for a 3-way multi-dot: maps (x,y0,y1,y2) to (x*y0, x*y1, x*y2) */
template <typename T1,typename T2>
struct cuspmult3 : thrust::unary_function<T1,T2>
{
  __host__ __device__
  T2 operator()(T1 x)
  {
    return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x),
                              thrust::get<0>(x)*thrust::get<2>(x),
                              thrust::get<0>(x)*thrust::get<3>(x));
  }
};

/* Binary functor: component-wise sum of two 3-element tuples */
template <typename T>
struct cuspadd3 : thrust::binary_function<T,T,T>
{
  __host__ __device__
  T operator()(T x,T y)
  {
    return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y),
                              thrust::get<1>(x)+thrust::get<1>(y),
                              thrust::get<2>(x)+thrust::get<2>(y));
  }
};
/* Unary functor for a 4-way multi-dot: maps (x,y0,y1,y2,y3) to (x*y0, x*y1, x*y2, x*y3) */
template <typename T1,typename T2>
struct cuspmult4 : thrust::unary_function<T1,T2>
{
  __host__ __device__
  T2 operator()(T1 x)
  {
    return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x),
                              thrust::get<0>(x)*thrust::get<2>(x),
                              thrust::get<0>(x)*thrust::get<3>(x),
                              thrust::get<0>(x)*thrust::get<4>(x));
  }
};

/* Binary functor: component-wise sum of two 4-element tuples */
template <typename T>
struct cuspadd4 : thrust::binary_function<T,T,T>
{
  __host__ __device__
  T operator()(T x,T y)
  {
    return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y),
                              thrust::get<1>(x)+thrust::get<1>(y),
                              thrust::get<2>(x)+thrust::get<2>(y),
                              thrust::get<3>(x)+thrust::get<3>(y));
  }
};


/*
   VecMDot_SeqCUSP - Computes nv dot products of one vector against several vectors on the GPU.

   Input Parameters:
+  xin - the common vector
.  nv  - the number of vectors in yin
-  yin - the nv vectors

   Output Parameter:
.  z - array of nv results, z[i] = sum_k x[k]*yin[i][k]

   Notes:
   The first nv mod 4 products are computed by a 1-, 2- or 3-way fused reduction;
   the rest are processed four at a time. max(nv*(2n-1),0) flops are logged.

   Read access to xin is now restored before returning; it was obtained but never
   released. All return codes are checked and propagated.
*/
PetscErrorCode VecMDot_SeqCUSP(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
{
  PetscErrorCode    ierr;
  PetscInt          n = xin->map->n,j,j_rem;
  CUSPARRAY         *xarray,*yy0,*yy1,*yy2,*yy3;
  PetscScalar       zero=0.0;
  Vec               *yyin = (Vec*)yin;

  thrust::tuple<PetscScalar,PetscScalar> result2;
  thrust::tuple<PetscScalar,PetscScalar,PetscScalar> result3;
  thrust::tuple<PetscScalar,PetscScalar,PetscScalar,PetscScalar>result4;

  ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  switch(j_rem=nv&0x3) {
  case 3:
    ierr = VecCUSPGetArrayRead(yyin[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yyin[1],&yy1);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yyin[2],&yy2);CHKERRQ(ierr);
    try {
      result3 = thrust::transform_reduce(
                     thrust::make_zip_iterator(
                          thrust::make_tuple(
                                   xarray->begin(),
                                   yy0->begin(),
                                   yy1->begin(),
                                   yy2->begin())),
                     thrust::make_zip_iterator(
                          thrust::make_tuple(
                                   xarray->end(),
                                   yy0->end(),
                                   yy1->end(),
                                   yy2->end())),
                     cuspmult3<thrust::tuple<PetscScalar,PetscScalar,PetscScalar,PetscScalar>, thrust::tuple<PetscScalar,PetscScalar,PetscScalar> >(),
                     thrust::make_tuple(zero,zero,zero), /*init */
                     cuspadd3<thrust::tuple<PetscScalar,PetscScalar,PetscScalar> >()); /* binary function */
      z[0] = thrust::get<0>(result3);
      z[1] = thrust::get<1>(result3);
      z[2] = thrust::get<2>(result3);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    z    += 3;
    ierr = VecCUSPRestoreArrayRead(yyin[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yyin[1],&yy1);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yyin[2],&yy2);CHKERRQ(ierr);
    yyin  += 3;
    break;
  case 2:
    ierr = VecCUSPGetArrayRead(yyin[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yyin[1],&yy1);CHKERRQ(ierr);
    try {
      result2 = thrust::transform_reduce(
                    thrust::make_zip_iterator(
                        thrust::make_tuple(
                                  xarray->begin(),
                                  yy0->begin(),
                                  yy1->begin())),
                    thrust::make_zip_iterator(
                        thrust::make_tuple(
                                  xarray->end(),
                                  yy0->end(),
                                  yy1->end())),
                    cuspmult2<thrust::tuple<PetscScalar,PetscScalar,PetscScalar>, thrust::tuple<PetscScalar,PetscScalar> >(),
                    thrust::make_tuple(zero,zero), /*init */
                    cuspadd2<thrust::tuple<PetscScalar, PetscScalar> >()); /* binary function */
      z[0] = thrust::get<0>(result2);
      z[1] = thrust::get<1>(result2);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    z    += 2;
    ierr = VecCUSPRestoreArrayRead(yyin[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yyin[1],&yy1);CHKERRQ(ierr);
    yyin  += 2;
    break;
  case 1:
    ierr = VecDot_SeqCUSP(xin,yyin[0],&z[0]);CHKERRQ(ierr);
    z    += 1;
    yyin  += 1;
    break;
  }
  /* remaining vectors, four at a time */
  for (j=j_rem; j<nv; j+=4) {
    ierr = VecCUSPGetArrayRead(yyin[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yyin[1],&yy1);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yyin[2],&yy2);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yyin[3],&yy3);CHKERRQ(ierr);
    try {
      result4 = thrust::transform_reduce(
                    thrust::make_zip_iterator(
                        thrust::make_tuple(
                                  xarray->begin(),
                                  yy0->begin(),
                                  yy1->begin(),
                                  yy2->begin(),
                                  yy3->begin())),
                    thrust::make_zip_iterator(
                        thrust::make_tuple(
                                  xarray->end(),
                                  yy0->end(),
                                  yy1->end(),
                                  yy2->end(),
                                  yy3->end())),
                     cuspmult4<thrust::tuple<PetscScalar,PetscScalar,PetscScalar,PetscScalar,PetscScalar>, thrust::tuple<PetscScalar,PetscScalar,PetscScalar,PetscScalar> >(),
                     thrust::make_tuple(zero,zero,zero,zero), /*init */
                     cuspadd4<thrust::tuple<PetscScalar,PetscScalar,PetscScalar,PetscScalar> >()); /* binary function */
      z[0] = thrust::get<0>(result4);
      z[1] = thrust::get<1>(result4);
      z[2] = thrust::get<2>(result4);
      z[3] = thrust::get<3>(result4);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    z    += 4;
    ierr = VecCUSPRestoreArrayRead(yyin[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yyin[1],&yy1);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yyin[2],&yy2);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yyin[3],&yy3);CHKERRQ(ierr);
    yyin  += 4;
  }
  /* pair the VecCUSPGetArrayRead() on xin made at the top of the function */
  ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));CHKERRQ(ierr);
  return(0);
}


/*
   VecSet_SeqCUSP - Sets every entry of a vector to the same value on the GPU.

   Input Parameters:
+  xin   - the vector
-  alpha - the value written to every entry

   Notes:
   All return codes are checked and propagated.
*/
PetscErrorCode VecSet_SeqCUSP(Vec xin,PetscScalar alpha)
{
  PetscErrorCode ierr;
  CUSPARRAY      *xarray;

  /* if there's a faster way to do the case alpha=0.0 on the GPU we should do that*/
  ierr = VecCUSPGetArrayWrite(xin,&xarray);CHKERRQ(ierr);
  try {
    cusp::blas::fill(*xarray,alpha);
  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = VecCUSPRestoreArrayWrite(xin,&xarray);CHKERRQ(ierr);
  return(0);
}

/*
   VecScale_SeqCUSP - Computes x = alpha*x on the GPU.

   Input Parameters:
+  xin   - the vector that is scaled
-  alpha - the scale factor

   Notes:
   alpha == 0 is delegated to VecSet_SeqCUSP() and alpha == 1 does nothing.
   n flops are logged only when the scaling is actually executed; previously they
   were logged for the alpha == 0 and alpha == 1 cases too.
*/
PetscErrorCode VecScale_SeqCUSP(Vec xin, PetscScalar alpha)
{
  PetscErrorCode ierr;
  CUSPARRAY      *xarray;

  if (alpha == 0.0) {
    ierr = VecSet_SeqCUSP(xin,alpha);CHKERRQ(ierr);
  } else if (alpha != 1.0) {
    ierr = VecCUSPGetArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
    try {
      cusp::blas::scal(*xarray,alpha);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = VecCUSPRestoreArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    ierr = PetscLogFlops(xin->map->n);CHKERRQ(ierr);
  }
  return(0);
}


/*
   VecTDot_SeqCUSP - Computes the dot product of two vectors on the GPU.

   Input Parameters:
+  xin - the first vector
-  yin - the second vector

   Output Parameter:
.  z - the result of cusp::blas::dot(x,y)

   Notes:
   Complex scalars are not supported. A complex build now returns PETSC_ERR_SUP
   instead of leaving *z unset and restoring arrays that were never obtained.
   2*n-1 flops are logged when n > 0.
*/
PetscErrorCode VecTDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
{
#if defined(PETSC_USE_COMPLEX)
  /*Not working for complex*/
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"VecTDot_SeqCUSP is not implemented for complex scalars");
#else
  PetscErrorCode ierr;
  CUSPARRAY      *xarray,*yarray;

  ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(yin,&yarray);CHKERRQ(ierr);
  try {
    *z = cusp::blas::dot(*xarray,*yarray);
  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  if (xin->map->n > 0) {
    ierr = PetscLogFlops(2.0*xin->map->n-1);CHKERRQ(ierr);
  }
  ierr = VecCUSPRestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  return(0);
#endif
}
/* Copies xin into yin with cusp::blas::copy on the GPU arrays of both vectors */
static PetscErrorCode VecCopy_SeqCUSP_OnGPU(Vec xin,Vec yin)
{
  PetscErrorCode ierr;
  CUSPARRAY      *xarray,*yarray;

  ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yin,&yarray);CHKERRQ(ierr);
  try {
    cusp::blas::copy(*xarray,*yarray);
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yin,&yarray);CHKERRQ(ierr);
  return(0);
}

/*
   VecCopy_SeqCUSP - Copies xin into yin, on the GPU or on the CPU depending on where the data is valid.

   Input Parameter:
.  xin - the source vector

   Output Parameter:
.  yin - the destination vector

   Notes:
   Nothing is done when xin and yin are the same vector. Otherwise:
     xin valid on the GPU only          -> copy on the GPU
     xin valid on the CPU / unallocated -> copy on the CPU
     xin valid in both places           -> copy on the GPU if yin is valid on the GPU
                                           or in both places, otherwise on the CPU
   All return codes are checked and propagated.
*/
PetscErrorCode VecCopy_SeqCUSP(Vec xin,Vec yin)
{
  PetscErrorCode ierr;

  if (xin != yin) {
    if (xin->valid_GPU_array == PETSC_CUSP_GPU) {
      ierr = VecCopy_SeqCUSP_OnGPU(xin,yin);CHKERRQ(ierr);
    } else if (xin->valid_GPU_array == PETSC_CUSP_CPU || xin->valid_GPU_array == PETSC_CUSP_UNALLOCATED) {
      /* copy in CPU if we are on the CPU*/
      ierr = VecCopy_Seq(xin,yin);CHKERRQ(ierr);
    } else if (xin->valid_GPU_array == PETSC_CUSP_BOTH) {
      /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
      if (yin->valid_GPU_array == PETSC_CUSP_GPU || yin->valid_GPU_array == PETSC_CUSP_BOTH) {
        /* when both vectors are valid in both places the GPU is an arbitrary choice */
        ierr = VecCopy_SeqCUSP_OnGPU(xin,yin);CHKERRQ(ierr);
      } else {
        ierr = VecCopy_Seq(xin,yin);CHKERRQ(ierr);
      }
    }
  }
  return(0);
}


/*
   VecSwap_SeqCUSP - Exchanges the entries of two sequential CUSP vectors using cuBLAS.

   Input/Output Parameters:
+  xin - first vector
-  yin - second vector (nothing is done when yin == xin)

   Notes:
   cublasSswap() is used when PETSC_USE_REAL_SINGLE is defined and cublasDswap() otherwise.
   The cuBLAS status and the synchronization status are now stored in ierr before being
   tested; previously CHKERRCUSP examined a variable that was never assigned.
*/
PetscErrorCode VecSwap_SeqCUSP(Vec xin,Vec yin)
{
  PetscErrorCode ierr;
  PetscBLASInt   one = 1,bn = PetscBLASIntCast(xin->map->n);
  CUSPARRAY      *xarray,*yarray;

  if (xin != yin) {
    ierr = VecCUSPGetArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
#if defined(PETSC_USE_REAL_SINGLE)
    cublasSswap(bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);
#else
    cublasDswap(bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);
#endif
    ierr = cublasGetError();CHKERRCUSP(ierr);
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    ierr = VecCUSPRestoreArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
  }
  return(0);
}

/* Functor for thrust::for_each() over a zipped 3-tuple: element 0 receives the
   product of elements 1 and 2 (used by VecAXPBY_SeqCUSP as y = a*x). */
struct VecCUSPAX
{
  template <typename ZipTuple>
  __host__ __device__
  void operator()(ZipTuple entry)
  {
    /* slot 0 is the destination, slots 1 and 2 are the two factors */
    thrust::get<0>(entry) = thrust::get<1>(entry)*thrust::get<2>(entry);
  }
};
/*
   VecAXPBY_SeqCUSP - Computes y = alpha*x + beta*y for sequential CUSP vectors.

   Input Parameters:
+  alpha - scalar multiplying xin
.  beta  - scalar multiplying yin
-  xin   - the vector x

   Input/Output Parameter:
.  yin - the vector y, overwritten with the result

   Notes:
   Special values are dispatched in this order (the first match wins):
.    alpha == 0 - VecScale_SeqCUSP(yin,beta)
.    beta  == 1 - VecAXPY_SeqCUSP(yin,alpha,xin)
.    alpha == 1 - VecAYPX_SeqCUSP(yin,beta,xin)
.    beta  == 0 - y = alpha*x through a thrust::for_each() with the VecCUSPAX functor
   Everything else goes through cusp::blas::axpby().
*/
PetscErrorCode VecAXPBY_SeqCUSP(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
{
  PetscErrorCode    ierr;
  PetscScalar       a = alpha,b = beta;
  CUSPARRAY         *xarray,*yarray;

  if (a == 0.0) {
    ierr = VecScale_SeqCUSP(yin,beta);CHKERRQ(ierr);
  } else if (b == 1.0) {
    ierr = VecAXPY_SeqCUSP(yin,alpha,xin);CHKERRQ(ierr);
  } else if (a == 1.0) {
    ierr = VecAYPX_SeqCUSP(yin,beta,xin);CHKERRQ(ierr);
  } else if (b == 0.0) {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
    try {
      /* zip (y, a, x) so that the functor can store a*x into y element by element */
      thrust::for_each(
        thrust::make_zip_iterator(
            thrust::make_tuple(
                yarray->begin(),
                thrust::make_constant_iterator(a),
                xarray->begin())),
        thrust::make_zip_iterator(
            thrust::make_tuple(
                yarray->end(),
                thrust::make_constant_iterator(a),
                xarray->end())),
        VecCUSPAX());
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = PetscLogFlops(xin->map->n);CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
  } else {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
    try {
      cusp::blas::axpby(*xarray,*yarray,*yarray,a,b);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    ierr = PetscLogFlops(3.0*xin->map->n);CHKERRQ(ierr);
  }
  return(0);
}

1141: /* structs below are for special cases of VecAXPBYPCZ_SeqCUSP */
/* Functor computing z = x + b*y + c*z.
   The zipped tuple is expected in the order (z, c, x, y, b), which is how
   VecAXPBYPCZ_SeqCUSP builds it for the alpha == 1 case. */
struct VecCUSPXPBYPCZ
{
  template <typename ZipTuple>
  __host__ __device__
  void operator()(ZipTuple entry)
  {
    /* slot 0 is both an input (old z) and the destination (new z) */
    thrust::get<0>(entry) = thrust::get<1>(entry)*thrust::get<0>(entry)+thrust::get<2>(entry)+thrust::get<4>(entry)*thrust::get<3>(entry);
  }
};
/* Functor computing z = a*x + b*y + z.
   The zipped tuple is expected in the order (z, x, a, y, b), which is how
   VecAXPBYPCZ_SeqCUSP builds it for the gamma == 1 case. */
struct VecCUSPAXPBYPZ
{
  template <typename ZipTuple>
  __host__ __device__
  void operator()(ZipTuple entry)
  {
    /* accumulate into slot 0, so the old z is kept with an implicit factor of one */
    thrust::get<0>(entry) += thrust::get<2>(entry)*thrust::get<1>(entry)+thrust::get<4>(entry)*thrust::get<3>(entry);
  }
};

/*
   VecAXPBYPCZ_SeqCUSP - Computes z = alpha*x + beta*y + gamma*z for sequential CUSP vectors.

   Input Parameters:
+  alpha - scalar multiplying xin
.  beta  - scalar multiplying yin
.  gamma - scalar multiplying zin
.  xin   - the vector x
-  yin   - the vector y

   Input/Output Parameter:
.  zin - the vector z, overwritten with the result

   Notes:
   alpha == 1 and gamma == 1 (tested in that order) use fused thrust::for_each() kernels
   that save one multiplication per entry; all other cases call cusp::blas::axpbypcz().

   The three device arrays are acquired before the branches, so they are now released
   after the branches as well.  Previously only the general branch restored them and
   the two fast paths returned with x, y and z still checked out.
*/
PetscErrorCode VecAXPBYPCZ_SeqCUSP(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
{
  PetscErrorCode     ierr;
  PetscInt           n = zin->map->n;
  CUSPARRAY          *xarray,*yarray,*zarray;

  ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayReadWrite(zin,&zarray);CHKERRQ(ierr);
  if (alpha == 1.0) {
    try {
      /* tuple layout (z, gamma, x, y, beta) matches what VecCUSPXPBYPCZ expects */
      thrust::for_each(
        thrust::make_zip_iterator(
            thrust::make_tuple(
                zarray->begin(),
                thrust::make_constant_iterator(gamma),
                xarray->begin(),
                yarray->begin(),
                thrust::make_constant_iterator(beta))),
        thrust::make_zip_iterator(
            thrust::make_tuple(
                zarray->end(),
                thrust::make_constant_iterator(gamma),
                xarray->end(),
                yarray->end(),
                thrust::make_constant_iterator(beta))),
        VecCUSPXPBYPCZ());
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = PetscLogFlops(4.0*n);CHKERRQ(ierr);
  } else if (gamma == 1.0) {
    try {
      /* tuple layout (z, x, alpha, y, beta) matches what VecCUSPAXPBYPZ expects */
      thrust::for_each(
        thrust::make_zip_iterator(
            thrust::make_tuple(
                zarray->begin(),
                xarray->begin(),
                thrust::make_constant_iterator(alpha),
                yarray->begin(),
                thrust::make_constant_iterator(beta))),
        thrust::make_zip_iterator(
            thrust::make_tuple(
                zarray->end(),
                xarray->end(),
                thrust::make_constant_iterator(alpha),
                yarray->end(),
                thrust::make_constant_iterator(beta))),
        VecCUSPAXPBYPZ());
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = PetscLogFlops(4.0*n);CHKERRQ(ierr);
  } else {
    try {
      cusp::blas::axpbypcz(*xarray,*yarray,*zarray,*zarray,alpha,beta,gamma);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = PetscLogFlops(5.0*n);CHKERRQ(ierr);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  /* release on every path, not only the general one */
  ierr = VecCUSPRestoreArrayReadWrite(zin,&zarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
  return(0);
}

/*
   VecPointwiseMult_SeqCUSP - Computes the entrywise product w = x*y with cusp::blas::xmy().

   Input Parameters:
+  xin - first factor
-  yin - second factor

   Output Parameter:
.  win - the vector receiving the products

   Notes:
   One flop per entry is logged.  Error codes from the array accessors and the status
   of the GPU synchronization are now checked instead of being dropped.
*/
PetscErrorCode VecPointwiseMult_SeqCUSP(Vec win,Vec xin,Vec yin)
{
  PetscErrorCode ierr;
  PetscInt       n = win->map->n;
  CUSPARRAY      *xarray,*yarray,*warray;

  ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayReadWrite(win,&warray);CHKERRQ(ierr);
  try {
    cusp::blas::xmy(*xarray,*yarray,*warray);
  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayReadWrite(win,&warray);CHKERRQ(ierr);
  ierr = PetscLogFlops(n);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  return(0);
}


1260: /* should do infinity norm in cusp */

/*
   VecNorm_SeqCUSP - Computes a norm of a sequential CUSP vector.

   Input Parameters:
+  xin  - the vector
-  type - NORM_1, NORM_2, NORM_FROBENIUS, NORM_INFINITY or NORM_1_AND_2

   Output Parameter:
.  z - the norm; for NORM_1_AND_2 the 1-norm is stored in z[0] and the 2-norm in z[1]

   Notes:
.    NORM_2 / NORM_FROBENIUS - cusp::blas::nrm2() on the device array
.    NORM_1                  - cublasSasum() or cublasDasum() on the device array
.    NORM_INFINITY           - plain loop over the array obtained with VecGetArrayRead();
                               a NaN entry ends the loop and becomes the result
   Any other type leaves z untouched.
*/
PetscErrorCode VecNorm_SeqCUSP(Vec xin,NormType type,PetscReal* z)
{
  const PetscScalar *xx;
  PetscErrorCode    ierr;
  PetscInt          n = xin->map->n;
  PetscBLASInt      one = 1, bn = PetscBLASIntCast(n);
  CUSPARRAY         *xarray;

  if (type == NORM_2 || type == NORM_FROBENIUS) {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    try {
      *z = cusp::blas::nrm2(*xarray);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = PetscLogFlops(PetscMax(2.0*n-1,0.0));CHKERRQ(ierr);
  } else if (type == NORM_INFINITY) {
    PetscInt     i;
    PetscReal    max = 0.0,tmp;

    ierr = VecGetArrayRead(xin,&xx);CHKERRQ(ierr);
    /* index rather than advance xx, so the pointer handed back below is the one we were given */
    for (i=0; i<n; i++) {
      if ((tmp = PetscAbsScalar(xx[i])) > max) max = tmp;
      /* check special case of tmp == NaN */
      if (tmp != tmp) {max = tmp; break;}
    }
    ierr = VecRestoreArrayRead(xin,&xx);CHKERRQ(ierr);
    *z   = max;
  } else if (type == NORM_1) {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
#if defined(PETSC_USE_REAL_SINGLE)
    *z = cublasSasum(bn,VecCUSPCastToRawPtr(*xarray),one);
#else
    *z = cublasDasum(bn,VecCUSPCastToRawPtr(*xarray),one);
#endif
    ierr = cublasGetError();CHKERRCUSP(ierr);
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    ierr = PetscLogFlops(PetscMax(n-1.0,0.0));CHKERRQ(ierr);
  } else if (type == NORM_1_AND_2) {
    ierr = VecNorm_SeqCUSP(xin,NORM_1,z);CHKERRQ(ierr);
    ierr = VecNorm_SeqCUSP(xin,NORM_2,z+1);CHKERRQ(ierr);
  }
  return(0);
}


1315: /* The following few functions should be modified to actually work with the GPU so they don't force unnecessary allocation of CPU memory. */

/*
   VecSetRandom_SeqCUSP - Fills a sequential CUSP vector with random values.

   Input Parameters:
+  xin - the vector to fill
-  r   - the random number context

   Notes:
   The values are generated on the host by VecSetRandom_Seq().  Afterwards the vector is
   flagged PETSC_CUSP_CPU, unless it has no GPU storage yet, in which case the flag is
   left at PETSC_CUSP_UNALLOCATED.
*/
PetscErrorCode VecSetRandom_SeqCUSP(Vec xin,PetscRandom r)
{
  PetscErrorCode ierr;

  ierr = VecSetRandom_Seq(xin,r);CHKERRQ(ierr);
  if (xin->valid_GPU_array != PETSC_CUSP_UNALLOCATED){
    xin->valid_GPU_array = PETSC_CUSP_CPU;
  }
  return(0);
}

/*
   VecResetArray_SeqCUSP - CUSP wrapper around VecResetArray_Seq().

   Input Parameter:
.  vin - the vector

   Notes:
   VecCUSPCopyFromGPU() is called first, then the host array pointer is reset.  The
   vector is then flagged PETSC_CUSP_CPU unless it is still PETSC_CUSP_UNALLOCATED.
   The error codes of both calls are now propagated; before, an unassigned ierr was tested.
*/
PetscErrorCode VecResetArray_SeqCUSP(Vec vin)
{
  PetscErrorCode ierr;

  ierr = VecCUSPCopyFromGPU(vin);CHKERRQ(ierr);
  ierr = VecResetArray_Seq(vin);CHKERRQ(ierr);
  if (vin->valid_GPU_array != PETSC_CUSP_UNALLOCATED){
    vin->valid_GPU_array = PETSC_CUSP_CPU;
  }
  return(0);
}

/*
   VecPlaceArray_SeqCUSP - CUSP wrapper around VecPlaceArray_Seq().

   Input Parameters:
+  vin - the vector
-  a   - the host array to place in the vector

   Notes:
   VecCUSPCopyFromGPU() is called first, then the array is placed.  The vector is then
   flagged PETSC_CUSP_CPU unless it is still PETSC_CUSP_UNALLOCATED.
   The error codes of both calls are now propagated; before, an unassigned ierr was tested.
*/
PetscErrorCode VecPlaceArray_SeqCUSP(Vec vin,const PetscScalar *a)
{
  PetscErrorCode ierr;

  ierr = VecCUSPCopyFromGPU(vin);CHKERRQ(ierr);
  ierr = VecPlaceArray_Seq(vin,a);CHKERRQ(ierr);
  if (vin->valid_GPU_array != PETSC_CUSP_UNALLOCATED){
    vin->valid_GPU_array = PETSC_CUSP_CPU;
  }
  return(0);
}


/*
   VecReplaceArray_SeqCUSP - CUSP wrapper around VecReplaceArray_Seq().

   Input Parameters:
+  vin - the vector
-  a   - the host array that replaces the vector's current array

   Notes:
   VecCUSPCopyFromGPU() is called first, then the array is replaced.  The vector is then
   flagged PETSC_CUSP_CPU unless it is still PETSC_CUSP_UNALLOCATED.
   The error codes of both calls are now propagated; before, an unassigned ierr was tested.
*/
PetscErrorCode VecReplaceArray_SeqCUSP(Vec vin,const PetscScalar *a)
{
  PetscErrorCode ierr;

  ierr = VecCUSPCopyFromGPU(vin);CHKERRQ(ierr);
  ierr = VecReplaceArray_Seq(vin,a);CHKERRQ(ierr);
  if (vin->valid_GPU_array != PETSC_CUSP_UNALLOCATED){
    vin->valid_GPU_array = PETSC_CUSP_CPU;
  }
  return(0);
}


/*@
   VecCreateSeqCUSP - Creates a standard, sequential array-style vector of type VECSEQCUSP.

   Collective on MPI_Comm

   Input Parameters:
+  comm - the communicator, should be PETSC_COMM_SELF
-  n - the vector length

   Output Parameter:
.  v - the vector

   Notes:
   Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
   same type as an existing vector.

   Level: intermediate

   Concepts: vectors^creating sequential

.seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
@*/
PetscErrorCode  VecCreateSeqCUSP(MPI_Comm comm,PetscInt n,Vec *v)
{
  PetscErrorCode ierr;

  ierr = VecCreate(comm,v);CHKERRQ(ierr);
  /* sequential vector: local and global sizes coincide */
  ierr = VecSetSizes(*v,n,n);CHKERRQ(ierr);
  ierr = VecSetType(*v,VECSEQCUSP);CHKERRQ(ierr);
  return(0);
}

1409: /*The following template functions are for VecDotNorm2_SeqCUSP.  Note that there is no complex support as currently written*/
/* Transform step of VecDotNorm2_SeqCUSP: turns a pair (s_i, t_i) into the pair
   (s_i*t_i, t_i*t_i), i.e. one term of the dot product and one term of the squared norm. */
template <typename T>
struct cuspdotnormcalculate : thrust::unary_function<T,T>
{
  __host__ __device__
  T operator()(T pair)
  {
    return thrust::make_tuple(thrust::get<0>(pair)*thrust::get<1>(pair),
                              thrust::get<1>(pair)*thrust::get<1>(pair));
  }
};

/* Reduction step of VecDotNorm2_SeqCUSP: adds two 2-tuples component by component. */
template <typename T>
struct cuspdotnormreduce : thrust::binary_function<T,T,T>
{
  __host__ __device__
  T operator()(T lhs,T rhs)
  {
    return thrust::make_tuple(thrust::get<0>(lhs)+thrust::get<0>(rhs),
                              thrust::get<1>(lhs)+thrust::get<1>(rhs));
  }
};

/*
   VecDotNorm2_SeqCUSP - Computes sum_i s_i*t_i and sum_i t_i*t_i in a single pass over
   the two device arrays.

   Input Parameters:
+  s - first vector
-  t - second vector

   Output Parameters:
+  dp - the sum of the products s_i*t_i
-  nm - the sum of the squares t_i*t_i (no square root is taken)

   Notes:
   Implemented as one thrust::transform_reduce() over the zipped arrays, using
   cuspdotnormcalculate as the transform and cuspdotnormreduce as the reduction.
   No conjugation is applied, so this is only meaningful for real scalars.

   The vector length is now held in a PetscInt (it was declared as a PetscScalar) and
   the error codes of the array accessors and of the synchronization are checked.
*/
PetscErrorCode VecDotNorm2_SeqCUSP(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
{
  PetscErrorCode                         ierr;
  PetscInt                               n = s->map->n;
  PetscScalar                            zero = 0.0;
  thrust::tuple<PetscScalar,PetscScalar> result;
  CUSPARRAY                              *sarray,*tarray;

  ierr = VecCUSPGetArrayRead(s,&sarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(t,&tarray);CHKERRQ(ierr);
  try {
    result = thrust::transform_reduce(
                 thrust::make_zip_iterator(
                     thrust::make_tuple(
                         sarray->begin(),
                         tarray->begin())),
                 thrust::make_zip_iterator(
                     thrust::make_tuple(
                         sarray->end(),
                         tarray->end())),
                  cuspdotnormcalculate<thrust::tuple<PetscScalar,PetscScalar> >(),
                  thrust::make_tuple(zero,zero), /* initial value of the reduction */
                  cuspdotnormreduce<thrust::tuple<PetscScalar, PetscScalar> >()); /* binary reduction */
    *dp = thrust::get<0>(result);
    *nm = thrust::get<1>(result);
  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(s,&sarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(t,&tarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(4.0*n);CHKERRQ(ierr);
  return(0);
}

/*
   VecDuplicate_SeqCUSP - Creates a new sequential CUSP vector with the same layout as win.

   Input Parameter:
.  win - the vector to duplicate

   Output Parameter:
.  V - the new vector

   Notes:
   Only the structure is duplicated: the new vector shares win's layout and receives
   copies of its attached object and function lists and of its ignorenegidx stash
   setting.  No vector entries are copied here.
*/
PetscErrorCode VecDuplicate_SeqCUSP(Vec win,Vec *V)
{
  PetscErrorCode ierr;

  ierr = VecCreateSeqCUSP(((PetscObject)win)->comm,win->map->n,V);CHKERRQ(ierr);
  ierr = PetscLayoutReference(win->map,&(*V)->map);CHKERRQ(ierr);
  ierr = PetscOListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);CHKERRQ(ierr);
  ierr = PetscFListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);CHKERRQ(ierr);
  (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
  return(0);
}

/*
   VecDestroy_SeqCUSP - Frees the GPU side of a sequential CUSP vector and then the
   host side through VecDestroy_Seq().

   Input Parameter:
.  v - the vector to destroy

   Notes:
   v->spptr is cleared after the Vec_CUSP structure is deleted so that no dangling
   pointer is left behind, and the error code of VecDestroy_Seq() is propagated.
*/
PetscErrorCode VecDestroy_SeqCUSP(Vec v)
{
  PetscErrorCode ierr;

  try {
    if (v->spptr) {
      delete ((Vec_CUSP *)v->spptr)->GPUarray;
      delete (Vec_CUSP *)v->spptr;
      v->spptr = 0;
    }
  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = VecDestroy_Seq(v);CHKERRQ(ierr);
  return(0);
}

/*
   VecCreate_SeqCUSP - Type constructor for VECSEQCUSP: turns V into a sequential
   vector whose operations are the CUSP implementations.

   Input Parameter:
.  V - the vector being given the VECSEQCUSP type

   Notes:
   Fails with PETSC_ERR_ARG_WRONG when V's communicator has more than one process.
   The vector is first built by VecCreate_Seq_Private(), then the entries of its
   operation table listed below are pointed at the *_SeqCUSP routines.  The vector
   starts out flagged PETSC_CUSP_UNALLOCATED.
*/
PetscErrorCode  VecCreate_SeqCUSP(Vec V)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  ierr = MPI_Comm_size(((PetscObject)V)->comm,&size);CHKERRQ(ierr);
  if  (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUSP on more than one process");
  ierr = VecCreate_Seq_Private(V,0);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)V,VECSEQCUSP);CHKERRQ(ierr);
  V->ops->dot             = VecDot_SeqCUSP;
  V->ops->norm            = VecNorm_SeqCUSP;
  V->ops->tdot            = VecTDot_SeqCUSP;
  V->ops->scale           = VecScale_SeqCUSP;
  V->ops->copy            = VecCopy_SeqCUSP;
  V->ops->set             = VecSet_SeqCUSP;
  V->ops->swap            = VecSwap_SeqCUSP;
  V->ops->axpy            = VecAXPY_SeqCUSP;
  V->ops->axpby           = VecAXPBY_SeqCUSP;
  V->ops->axpbypcz        = VecAXPBYPCZ_SeqCUSP;
  V->ops->pointwisemult   = VecPointwiseMult_SeqCUSP;
  V->ops->pointwisedivide = VecPointwiseDivide_SeqCUSP;
  V->ops->setrandom       = VecSetRandom_SeqCUSP;
  /* the *_local variants reuse the same routines as their non-local counterparts */
  V->ops->dot_local       = VecDot_SeqCUSP;
  V->ops->tdot_local      = VecTDot_SeqCUSP;
  V->ops->norm_local      = VecNorm_SeqCUSP;
  V->ops->mdot_local      = VecMDot_SeqCUSP;
  V->ops->maxpy           = VecMAXPY_SeqCUSP;
  V->ops->mdot            = VecMDot_SeqCUSP;
  V->ops->aypx            = VecAYPX_SeqCUSP;
  V->ops->waxpy           = VecWAXPY_SeqCUSP;
  V->ops->dotnorm2        = VecDotNorm2_SeqCUSP;
  V->ops->placearray      = VecPlaceArray_SeqCUSP;
  V->ops->replacearray    = VecReplaceArray_SeqCUSP;
  V->ops->resetarray      = VecResetArray_SeqCUSP;
  V->ops->destroy         = VecDestroy_SeqCUSP;
  V->ops->duplicate       = VecDuplicate_SeqCUSP;
  V->valid_GPU_array      = PETSC_CUSP_UNALLOCATED;
  return(0);
}