Actual source code: veccusp.cu
/*
   Implements the sequential cusp vectors.
*/

#include <petscconf.h>
#include <private/vecimpl.h> /*I "petscvec.h" I*/
#include <../src/vec/vec/impls/dvecimpl.h>
#include <../src/vec/vec/impls/seq/seqcusp/cuspvecimpl.h>

static PetscErrorCode VecCopy_Seq(Vec xin,Vec yin)
{
  PetscScalar       *ya;
  const PetscScalar *xa;
  PetscErrorCode    ierr;

  if (xin != yin) {
    ierr = VecGetArrayRead(xin,&xa);CHKERRQ(ierr);
    ierr = VecGetArray(yin,&ya);CHKERRQ(ierr);
    ierr = PetscMemcpy(ya,xa,xin->map->n*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = VecRestoreArrayRead(xin,&xa);CHKERRQ(ierr);
    ierr = VecRestoreArray(yin,&ya);CHKERRQ(ierr);
  }
  return(0);
}

static PetscErrorCode VecSetRandom_Seq(Vec xin,PetscRandom r)
{
  PetscErrorCode ierr;
  PetscInt       n = xin->map->n,i;
  PetscScalar    *xx;

  ierr = VecGetArray(xin,&xx);CHKERRQ(ierr);
  for (i=0; i<n; i++) {ierr = PetscRandomGetValue(r,&xx[i]);CHKERRQ(ierr);}
  ierr = VecRestoreArray(xin,&xx);CHKERRQ(ierr);
  return(0);
}

static PetscErrorCode VecDestroy_Seq(Vec v)
{
  Vec_Seq        *vs = (Vec_Seq*)v->data;
  PetscErrorCode ierr;

  ierr = PetscObjectDepublish(v);CHKERRQ(ierr);
#if defined(PETSC_USE_LOG)
  PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
#endif
  ierr = PetscFree(vs->array_allocated);CHKERRQ(ierr);
  ierr = PetscFree(vs);CHKERRQ(ierr);
  return(0);
}

static PetscErrorCode VecResetArray_Seq(Vec vin)
{
  Vec_Seq *v = (Vec_Seq*)vin->data;

  v->array         = v->unplacedarray;
  v->unplacedarray = 0;
  return(0);
}

/* The following three public versions are necessary because CUSP is used in the regular PETSc code and
   these routines must be callable from plain C code (a usage sketch follows these wrappers). */

PetscErrorCode VecCUSPAllocateCheck_Public(Vec v)
{
  PetscErrorCode ierr;

  ierr = VecCUSPAllocateCheck(v);CHKERRCUSP(ierr);
  return(0);
}

PetscErrorCode VecCUSPCopyToGPU_Public(Vec v)
{
  PetscErrorCode ierr;

  ierr = VecCUSPCopyToGPU(v);CHKERRCUSP(ierr);
  return(0);
}

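/* A minimal usage sketch from plain C (hypothetical caller; v is assumed to already be a
   VECSEQCUSP vector):

     PetscErrorCode ierr;
     ierr = VecCUSPAllocateCheck_Public(v);CHKERRQ(ierr);   ensure the GPU array exists
     ierr = VecCUSPCopyToGPU_Public(v);CHKERRQ(ierr);       mirror the CPU values on the GPU
*/
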
struct _p_PetscCUSPIndices {
  CUSPINTARRAYCPU indicesCPU;
  CUSPINTARRAYGPU indicesGPU;
};

/*
    PetscCUSPIndicesCreate - creates the data structure needed by VecCUSPCopyToGPUSome_Public()

    Input Parameters:
+    n - the number of indices
-    indices - integer list of indices

    Output Parameter:
.    ci - the CUSPIndices object suitable to pass to VecCUSPCopyToGPUSome_Public()

.seealso: PetscCUSPIndicesDestroy(), VecCUSPCopyToGPUSome_Public()
*/
PetscErrorCode PetscCUSPIndicesCreate(PetscInt n,const PetscInt *indices,PetscCUSPIndices *ci)
{
  PetscCUSPIndices cci;

  cci = new struct _p_PetscCUSPIndices;
  cci->indicesCPU.assign(indices,indices+n);
  cci->indicesGPU.assign(indices,indices+n);
  *ci = cci;
  return(0);
}

/*
    PetscCUSPIndicesDestroy - destroys the data structure needed by VecCUSPCopyToGPUSome_Public()

    Input Parameter:
.    ci - the CUSPIndices object suitable to pass to VecCUSPCopyToGPUSome_Public()

.seealso: PetscCUSPIndicesCreate(), VecCUSPCopyToGPUSome_Public()
*/
PetscErrorCode PetscCUSPIndicesDestroy(PetscCUSPIndices *ci)
{
  if (!ci) return(0);
  try {
    delete *ci;
  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  *ci = 0;
  return(0);
}

/*
    VecCUSPCopyToGPUSome_Public - Copies certain entries of a vector from the CPU down to the GPU

    Input Parameters:
+    v - the vector
-    ci - the requested indices; this should be created with PetscCUSPIndicesCreate()
*/
PetscErrorCode VecCUSPCopyToGPUSome_Public(Vec v, PetscCUSPIndices ci)
{
  PetscErrorCode ierr;

  ierr = VecCUSPCopyToGPUSome(v,&ci->indicesCPU,&ci->indicesGPU);CHKERRCUSP(ierr);
  return(0);
}

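/* A usage sketch for the index-based partial copy (hypothetical caller; v is assumed to be a
   VECSEQCUSP vector and only the entries listed in idx need to be current on the GPU):

     PetscCUSPIndices ci;
     PetscInt         idx[] = {0,2,5};
     PetscErrorCode   ierr;

     ierr = PetscCUSPIndicesCreate(3,idx,&ci);CHKERRQ(ierr);
     ierr = VecCUSPCopyToGPUSome_Public(v,ci);CHKERRQ(ierr);
     ... use the GPU data ...
     ierr = PetscCUSPIndicesDestroy(&ci);CHKERRQ(ierr);
*/
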
/*@C
    VecCUSPCopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
@*/
PetscErrorCode VecCUSPCopyFromGPU(Vec v)
{
  PetscErrorCode ierr;
  CUSPARRAY      *GPUvector;
  PetscScalar    *array;
  Vec_Seq        *s;
  PetscInt       n = v->map->n;

  s = (Vec_Seq*)v->data;
  if (s->array == 0) {
    ierr = PetscMalloc(n*sizeof(PetscScalar),&array);CHKERRQ(ierr);
    ierr = PetscLogObjectMemory(v,n*sizeof(PetscScalar));CHKERRQ(ierr);
    s->array           = array;
    s->array_allocated = array;
  }
  if (v->valid_GPU_array == PETSC_CUSP_GPU) {
    GPUvector = ((Vec_CUSP*)v->spptr)->GPUarray;
    ierr = PetscLogEventBegin(VEC_CUSPCopyFromGPU,v,0,0,0);CHKERRQ(ierr);
    try {
      thrust::copy(GPUvector->begin(),GPUvector->end(),*(PetscScalar**)v->data);
      ierr = WaitForGPU();CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = PetscLogEventEnd(VEC_CUSPCopyFromGPU,v,0,0,0);CHKERRQ(ierr);
    v->valid_GPU_array = PETSC_CUSP_BOTH;
  }
  return(0);
}

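/* A sketch of the intended call pattern (an assumption drawn from the coherence flags used above,
   not a documented contract): a CPU-side accessor checks the flag first, e.g.

     if (v->valid_GPU_array == PETSC_CUSP_GPU) {
       ierr = VecCUSPCopyFromGPU(v);CHKERRQ(ierr);
     }
     ... the CPU array in v->data is now up to date ...
*/
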
/* Note that this function only copies *some* of the values up from the GPU to the CPU,
   which means that we need to recombine the data at some point before using any of the standard functions.
   We could add another few flag-types to keep track of this, or treat it like VecGetArray()/VecRestoreArray(),
   which must always be called in pairs.
*/
PetscErrorCode VecCUSPCopyFromGPUSome(Vec v,CUSPINTARRAYCPU *indicesCPU,CUSPINTARRAYGPU *indicesGPU)
{
  Vec_Seq        *s;
  PetscInt       n = v->map->n;
  PetscScalar    *array;
  CUSPARRAY      *varray;
  PetscErrorCode ierr;

  ierr = VecCUSPAllocateCheck(v);CHKERRCUSP(ierr);
  s = (Vec_Seq*)v->data;
  if (s->array == 0) {
    ierr = PetscMalloc(n*sizeof(PetscScalar),&array);CHKERRQ(ierr);
    ierr = PetscLogObjectMemory(v,n*sizeof(PetscScalar));CHKERRQ(ierr);
    s->array           = array;
    s->array_allocated = array;
  }
  if (v->valid_GPU_array == PETSC_CUSP_GPU) {
    ierr = PetscLogEventBegin(VEC_CUSPCopyFromGPUSome,v,0,0,0);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(v,&varray);CHKERRQ(ierr);
    thrust::copy(
      thrust::make_permutation_iterator(varray->begin(),indicesGPU->begin()),
      thrust::make_permutation_iterator(varray->begin(),indicesGPU->end()),
      thrust::make_permutation_iterator(s->array,indicesCPU->begin()));
    ierr = VecCUSPRestoreArrayRead(v,&varray);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(VEC_CUSPCopyFromGPUSome,v,0,0,0);CHKERRQ(ierr);
  }
  /* v->valid_GPU_array = PETSC_CUSP_CPU; */
  return(0);
}

/*
    VecCUSPCopyFromGPUSome_Public - Copies certain entries of a vector from the GPU up to the CPU

    Input Parameters:
+    v - the vector
-    ci - the requested indices; this should be created with PetscCUSPIndicesCreate()
*/
PetscErrorCode VecCUSPCopyFromGPUSome_Public(Vec v, PetscCUSPIndices ci)
{
  PetscErrorCode ierr;

  ierr = VecCUSPCopyFromGPUSome(v,&ci->indicesCPU,&ci->indicesGPU);CHKERRCUSP(ierr);
  return(0);
}

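/* The two "Some" routines are meant to be used as a pair (a hedged sketch, with ci created as in
   the earlier example):

     ierr = VecCUSPCopyToGPUSome_Public(v,ci);CHKERRQ(ierr);     selected entries now on the GPU
     ... GPU work that updates those entries ...
     ierr = VecCUSPCopyFromGPUSome_Public(v,ci);CHKERRQ(ierr);   bring the same selection back
                                                                 so the CPU copy can be recombined
*/
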
/*MC
   VECSEQCUSP - VECSEQCUSP = "seqcusp" - The basic sequential vector, modified to use CUSP

   Options Database Keys:
.  -vec_type seqcusp - sets the vector type to VECSEQCUSP during a call to VecSetFromOptions()

   Level: beginner

.seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
M*/

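/* A short way to obtain this type from the options database (a minimal sketch, assuming a
   standard PETSc main program; run with -vec_type seqcusp):

     Vec            v;
     PetscErrorCode ierr;

     ierr = VecCreate(PETSC_COMM_SELF,&v);CHKERRQ(ierr);
     ierr = VecSetSizes(v,PETSC_DECIDE,100);CHKERRQ(ierr);
     ierr = VecSetFromOptions(v);CHKERRQ(ierr);
*/
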
/* for VecAYPX_SeqCUSP */
namespace cusp
{
  namespace blas
  {
    namespace detail
    {
      template <typename T>
      struct AYPX : public thrust::binary_function<T,T,T>
      {
        T alpha;

        AYPX(T _alpha) : alpha(_alpha) {}

        __host__ __device__
        T operator()(T x, T y)
        {
          return alpha * y + x;
        }
      };
    }

    template <typename ForwardIterator1, typename ForwardIterator2, typename ScalarType>
    void aypx(ForwardIterator1 first1,ForwardIterator1 last1,ForwardIterator2 first2,ScalarType alpha)
    {
      thrust::transform(first1,last1,first2,first2,detail::AYPX<ScalarType>(alpha));
    }

    template <typename Array1, typename Array2, typename ScalarType>
    void aypx(const Array1& x, Array2& y, ScalarType alpha)
    {
      detail::assert_same_dimensions(x,y);
      aypx(x.begin(),x.end(),y.begin(),alpha);
    }
  }
}

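/* The extension above fills a gap in cusp::blas: it computes y = alpha*y + x elementwise, which
   is exactly the VecAYPX() operation. The scalar equivalent, for reference:

     for (i=0; i<n; i++) y[i] = alpha*y[i] + x[i];
*/
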
PetscErrorCode VecAYPX_SeqCUSP(Vec yin, PetscScalar alpha, Vec xin)
{
  CUSPARRAY      *xarray,*yarray;
  PetscErrorCode ierr;

  if (alpha != 0.0) {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
    try {
      cusp::blas::aypx(*xarray,*yarray,alpha);
      ierr = WaitForGPU();CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
    ierr = PetscLogFlops(2.0*yin->map->n);CHKERRQ(ierr);
  }
  return(0);
}

PetscErrorCode VecAXPY_SeqCUSP(Vec yin,PetscScalar alpha,Vec xin)
{
  CUSPARRAY      *xarray,*yarray;
  PetscErrorCode ierr;

  if (alpha != 0.0) {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
    try {
      cusp::blas::axpy(*xarray,*yarray,alpha);
      ierr = WaitForGPU();CHKERRCUSP(ierr);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
    ierr = PetscLogFlops(2.0*yin->map->n);CHKERRQ(ierr);
  }
  return(0);
}

struct VecCUSPPointwiseDivide
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t) / thrust::get<2>(t);
  }
};

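/* Functors like the one above are applied through the zip-iterator pattern used throughout this
   file: the arrays are zipped into tuples and thrust::for_each runs the functor over the tuple
   range. For the divide functor the effect is the scalar loop

     for (i=0; i<n; i++) w[i] = x[i]/y[i];

   executed on the GPU. */
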
PetscErrorCode VecPointwiseDivide_SeqCUSP(Vec win, Vec xin, Vec yin)
{
  CUSPARRAY      *warray,*xarray,*yarray;
  PetscErrorCode ierr;

  ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(win,&warray);CHKERRQ(ierr);
  try {
    thrust::for_each(
      thrust::make_zip_iterator(
        thrust::make_tuple(
          warray->begin(),
          xarray->begin(),
          yarray->begin())),
      thrust::make_zip_iterator(
        thrust::make_tuple(
          warray->end(),
          xarray->end(),
          yarray->end())),
      VecCUSPPointwiseDivide());
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = PetscLogFlops(win->map->n);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(win,&warray);CHKERRQ(ierr);
  return(0);
}

struct VecCUSPWAXPY
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t)*thrust::get<3>(t);
  }
};

struct VecCUSPSum
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t) + thrust::get<2>(t);
  }
};

struct VecCUSPDiff
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t) - thrust::get<2>(t);
  }
};

PetscErrorCode VecWAXPY_SeqCUSP(Vec win,PetscScalar alpha,Vec xin, Vec yin)
{
  CUSPARRAY      *xarray,*yarray,*warray;
  PetscErrorCode ierr;

  if (alpha == 0.0) {
    ierr = VecCopy_SeqCUSP(yin,win);CHKERRQ(ierr);
  } else {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yin,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(win,&warray);CHKERRQ(ierr);
    if (alpha == 1.0) {
      try {
        thrust::for_each(
          thrust::make_zip_iterator(
            thrust::make_tuple(
              warray->begin(),
              yarray->begin(),
              xarray->begin())),
          thrust::make_zip_iterator(
            thrust::make_tuple(
              warray->end(),
              yarray->end(),
              xarray->end())),
          VecCUSPSum());
      } catch(char* ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
      }
      ierr = PetscLogFlops(win->map->n);CHKERRQ(ierr);
    } else if (alpha == -1.0) {
      try {
        thrust::for_each(
          thrust::make_zip_iterator(
            thrust::make_tuple(
              warray->begin(),
              yarray->begin(),
              xarray->begin())),
          thrust::make_zip_iterator(
            thrust::make_tuple(
              warray->end(),
              yarray->end(),
              xarray->end())),
          VecCUSPDiff());
      } catch(char* ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
      }
      ierr = PetscLogFlops(win->map->n);CHKERRQ(ierr);
    } else {
      try {
        thrust::for_each(
          thrust::make_zip_iterator(
            thrust::make_tuple(
              warray->begin(),
              yarray->begin(),
              thrust::make_constant_iterator(alpha),
              xarray->begin())),
          thrust::make_zip_iterator(
            thrust::make_tuple(
              warray->end(),
              yarray->end(),
              thrust::make_constant_iterator(alpha),
              xarray->end())),
          VecCUSPWAXPY());
      } catch(char* ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
      }
      ierr = PetscLogFlops(2*win->map->n);CHKERRQ(ierr);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(win,&warray);CHKERRQ(ierr);
  }
  return(0);
}

/* These functions are for the CUSP implementation of MAXPY with the loop unrolled on the CPU
   (see the note on the unrolling strategy after these structs) */
struct VecCUSPMAXPY4
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    /* y += a1*x1 + a2*x2 + a3*x3 + a4*x4 */
    thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t)+thrust::get<7>(t)*thrust::get<8>(t);
  }
};

struct VecCUSPMAXPY3
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    /* y += a1*x1 + a2*x2 + a3*x3 */
    thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t)+thrust::get<5>(t)*thrust::get<6>(t);
  }
};

struct VecCUSPMAXPY2
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    /* y += a1*x1 + a2*x2 */
    thrust::get<0>(t) += thrust::get<1>(t)*thrust::get<2>(t)+thrust::get<3>(t)*thrust::get<4>(t);
  }
};

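/* The unrolling strategy (an observation on the routine below, not a documented contract): the nv
   input vectors are consumed four at a time by VecCUSPMAXPY4, with the remainder nv mod 4 handled
   first via VecCUSPMAXPY3, VecCUSPMAXPY2, or a single VecAXPY_SeqCUSP(). Each fused functor reads
   x once and applies several AXPYs per pass instead of launching one kernel per vector; for the
   four-way case the scalar equivalent is

     for (i=0; i<n; i++) x[i] += a0*y0[i] + a1*y1[i] + a2*y2[i] + a3*y3[i];
*/
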
PetscErrorCode VecMAXPY_SeqCUSP(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
{
  PetscErrorCode ierr;
  CUSPARRAY      *xarray,*yy0,*yy1,*yy2,*yy3;
  PetscInt       n = xin->map->n,j,j_rem;
  PetscScalar    alpha0,alpha1,alpha2,alpha3;

  ierr = PetscLogFlops(nv*2.0*n);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
  switch (j_rem=nv&0x3) {
  case 3:
    alpha0 = alpha[0];
    alpha1 = alpha[1];
    alpha2 = alpha[2];
    alpha += 3;
    ierr = VecCUSPGetArrayRead(y[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(y[1],&yy1);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(y[2],&yy2);CHKERRQ(ierr);
    try {
      thrust::for_each(
        thrust::make_zip_iterator(
          thrust::make_tuple(
            xarray->begin(),
            thrust::make_constant_iterator(alpha0),
            yy0->begin(),
            thrust::make_constant_iterator(alpha1),
            yy1->begin(),
            thrust::make_constant_iterator(alpha2),
            yy2->begin())),
        thrust::make_zip_iterator(
          thrust::make_tuple(
            xarray->end(),
            thrust::make_constant_iterator(alpha0),
            yy0->end(),
            thrust::make_constant_iterator(alpha1),
            yy1->end(),
            thrust::make_constant_iterator(alpha2),
            yy2->end())),
        VecCUSPMAXPY3());
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = VecCUSPRestoreArrayRead(y[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(y[1],&yy1);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(y[2],&yy2);CHKERRQ(ierr);
    y += 3;
    break;
  case 2:
    alpha0 = alpha[0];
    alpha1 = alpha[1];
    alpha += 2;
    ierr = VecCUSPGetArrayRead(y[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(y[1],&yy1);CHKERRQ(ierr);
    try {
      thrust::for_each(
        thrust::make_zip_iterator(
          thrust::make_tuple(
            xarray->begin(),
            thrust::make_constant_iterator(alpha0),
            yy0->begin(),
            thrust::make_constant_iterator(alpha1),
            yy1->begin())),
        thrust::make_zip_iterator(
          thrust::make_tuple(
            xarray->end(),
            thrust::make_constant_iterator(alpha0),
            yy0->end(),
            thrust::make_constant_iterator(alpha1),
            yy1->end())),
        VecCUSPMAXPY2());
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = VecCUSPRestoreArrayRead(y[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(y[1],&yy1);CHKERRQ(ierr);
    y += 2;
    break;
  case 1:
    alpha0 = *alpha++;
    ierr = VecAXPY_SeqCUSP(xin,alpha0,y[0]);CHKERRQ(ierr);
    y += 1;
    break;
  }
  for (j=j_rem; j<nv; j+=4) {
    alpha0 = alpha[0];
    alpha1 = alpha[1];
    alpha2 = alpha[2];
    alpha3 = alpha[3];
    alpha += 4;
    ierr = VecCUSPGetArrayRead(y[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(y[1],&yy1);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(y[2],&yy2);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(y[3],&yy3);CHKERRQ(ierr);
    try {
      thrust::for_each(
        thrust::make_zip_iterator(
          thrust::make_tuple(
            xarray->begin(),
            thrust::make_constant_iterator(alpha0),
            yy0->begin(),
            thrust::make_constant_iterator(alpha1),
            yy1->begin(),
            thrust::make_constant_iterator(alpha2),
            yy2->begin(),
            thrust::make_constant_iterator(alpha3),
            yy3->begin())),
        thrust::make_zip_iterator(
          thrust::make_tuple(
            xarray->end(),
            thrust::make_constant_iterator(alpha0),
            yy0->end(),
            thrust::make_constant_iterator(alpha1),
            yy1->end(),
            thrust::make_constant_iterator(alpha2),
            yy2->end(),
            thrust::make_constant_iterator(alpha3),
            yy3->end())),
        VecCUSPMAXPY4());
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = VecCUSPRestoreArrayRead(y[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(y[1],&yy1);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(y[2],&yy2);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(y[3],&yy3);CHKERRQ(ierr);
    y += 4;
  }
  ierr = VecCUSPRestoreArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  return(0);
}

PetscErrorCode VecDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
{
#if defined(PETSC_USE_COMPLEX)
  PetscScalar    *ya,*xa;
#endif
  CUSPARRAY      *xarray,*yarray;
  PetscErrorCode ierr;

#if defined(PETSC_USE_COMPLEX)
  /* Not working for complex */
#else
  {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yin,&yarray);CHKERRQ(ierr);
    try {
      *z = cusp::blas::dot(*xarray,*yarray);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
  }
#endif
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  if (xin->map->n > 0) {
    ierr = PetscLogFlops(2.0*xin->map->n-1);CHKERRQ(ierr);
  }
  return(0);
}

/* The following few template functions are for VecMDot_SeqCUSP */

template <typename T1,typename T2>
struct cuspmult2 : thrust::unary_function<T1,T2>
{
  __host__ __device__
  T2 operator()(T1 x)
  {
    return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x),thrust::get<0>(x)*thrust::get<2>(x));
  }
};

template <typename T>
struct cuspadd2 : thrust::binary_function<T,T,T>
{
  __host__ __device__
  T operator()(T x,T y)
  {
    return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y),thrust::get<1>(x)+thrust::get<1>(y));
  }
};

template <typename T1,typename T2>
struct cuspmult3 : thrust::unary_function<T1,T2>
{
  __host__ __device__
  T2 operator()(T1 x)
  {
    return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x),thrust::get<0>(x)*thrust::get<2>(x),thrust::get<0>(x)*thrust::get<3>(x));
  }
};

template <typename T>
struct cuspadd3 : thrust::binary_function<T,T,T>
{
  __host__ __device__
  T operator()(T x,T y)
  {
    return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y),thrust::get<1>(x)+thrust::get<1>(y),thrust::get<2>(x)+thrust::get<2>(y));
  }
};

template <typename T1,typename T2>
struct cuspmult4 : thrust::unary_function<T1,T2>
{
  __host__ __device__
  T2 operator()(T1 x)
  {
    return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x),thrust::get<0>(x)*thrust::get<2>(x),thrust::get<0>(x)*thrust::get<3>(x),thrust::get<0>(x)*thrust::get<4>(x));
  }
};

template <typename T>
struct cuspadd4 : thrust::binary_function<T,T,T>
{
  __host__ __device__
  T operator()(T x,T y)
  {
    return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y),thrust::get<1>(x)+thrust::get<1>(y),thrust::get<2>(x)+thrust::get<2>(y),thrust::get<3>(x)+thrust::get<3>(y));
  }
};

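/* These functor pairs drive a single thrust::transform_reduce per group of vectors: the unary
   cuspmultN functor maps a zipped element (x_i, y1_i, ..., yN_i) to the tuple of partial products
   (x_i*y1_i, ..., x_i*yN_i), and the binary cuspaddN functor adds such tuples componentwise, so N
   dot products come out of one pass over x. In scalar form, for N = 2:

     for (i=0; i<n; i++) { z0 += x[i]*y0[i]; z1 += x[i]*y1[i]; }
*/
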
PetscErrorCode VecMDot_SeqCUSP(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
{
  PetscErrorCode ierr;
  PetscInt       n = xin->map->n,j,j_rem;
  CUSPARRAY      *xarray,*yy0,*yy1,*yy2,*yy3;
  PetscScalar    zero = 0.0;
  Vec            *yyin = (Vec*)yin;

  thrust::tuple<PetscScalar,PetscScalar>                         result2;
  thrust::tuple<PetscScalar,PetscScalar,PetscScalar>             result3;
  thrust::tuple<PetscScalar,PetscScalar,PetscScalar,PetscScalar> result4;

  ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  switch (j_rem=nv&0x3) {
  case 3:
    ierr = VecCUSPGetArrayRead(yyin[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yyin[1],&yy1);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yyin[2],&yy2);CHKERRQ(ierr);
    try {
      result3 = thrust::transform_reduce(
                  thrust::make_zip_iterator(
                    thrust::make_tuple(
                      xarray->begin(),
                      yy0->begin(),
                      yy1->begin(),
                      yy2->begin())),
                  thrust::make_zip_iterator(
                    thrust::make_tuple(
                      xarray->end(),
                      yy0->end(),
                      yy1->end(),
                      yy2->end())),
                  cuspmult3<thrust::tuple<PetscScalar,PetscScalar,PetscScalar,PetscScalar>, thrust::tuple<PetscScalar,PetscScalar,PetscScalar> >(),
                  thrust::make_tuple(zero,zero,zero),                               /* init */
                  cuspadd3<thrust::tuple<PetscScalar,PetscScalar,PetscScalar> >()); /* binary function */
      z[0] = thrust::get<0>(result3);
      z[1] = thrust::get<1>(result3);
      z[2] = thrust::get<2>(result3);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    z += 3;
    ierr = VecCUSPRestoreArrayRead(yyin[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yyin[1],&yy1);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yyin[2],&yy2);CHKERRQ(ierr);
    yyin += 3;
    break;
  case 2:
    ierr = VecCUSPGetArrayRead(yyin[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yyin[1],&yy1);CHKERRQ(ierr);
    try {
      result2 = thrust::transform_reduce(
                  thrust::make_zip_iterator(
                    thrust::make_tuple(
                      xarray->begin(),
                      yy0->begin(),
                      yy1->begin())),
                  thrust::make_zip_iterator(
                    thrust::make_tuple(
                      xarray->end(),
                      yy0->end(),
                      yy1->end())),
                  cuspmult2<thrust::tuple<PetscScalar,PetscScalar,PetscScalar>, thrust::tuple<PetscScalar,PetscScalar> >(),
                  thrust::make_tuple(zero,zero),                       /* init */
                  cuspadd2<thrust::tuple<PetscScalar,PetscScalar> >()); /* binary function */
      z[0] = thrust::get<0>(result2);
      z[1] = thrust::get<1>(result2);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    z += 2;
    ierr = VecCUSPRestoreArrayRead(yyin[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yyin[1],&yy1);CHKERRQ(ierr);
    yyin += 2;
    break;
  case 1:
    ierr = VecDot_SeqCUSP(xin,yyin[0],&z[0]);CHKERRQ(ierr);
    z    += 1;
    yyin += 1;
    break;
  }
  for (j=j_rem; j<nv; j+=4) {
    ierr = VecCUSPGetArrayRead(yyin[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yyin[1],&yy1);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yyin[2],&yy2);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yyin[3],&yy3);CHKERRQ(ierr);
    try {
      result4 = thrust::transform_reduce(
                  thrust::make_zip_iterator(
                    thrust::make_tuple(
                      xarray->begin(),
                      yy0->begin(),
                      yy1->begin(),
                      yy2->begin(),
                      yy3->begin())),
                  thrust::make_zip_iterator(
                    thrust::make_tuple(
                      xarray->end(),
                      yy0->end(),
                      yy1->end(),
                      yy2->end(),
                      yy3->end())),
                  cuspmult4<thrust::tuple<PetscScalar,PetscScalar,PetscScalar,PetscScalar,PetscScalar>, thrust::tuple<PetscScalar,PetscScalar,PetscScalar,PetscScalar> >(),
                  thrust::make_tuple(zero,zero,zero,zero),                                      /* init */
                  cuspadd4<thrust::tuple<PetscScalar,PetscScalar,PetscScalar,PetscScalar> >()); /* binary function */
      z[0] = thrust::get<0>(result4);
      z[1] = thrust::get<1>(result4);
      z[2] = thrust::get<2>(result4);
      z[3] = thrust::get<3>(result4);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    z += 4;
    ierr = VecCUSPRestoreArrayRead(yyin[0],&yy0);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yyin[1],&yy1);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yyin[2],&yy2);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yyin[3],&yy3);CHKERRQ(ierr);
    yyin += 4;
  }
  ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(PetscMax(nv*(2.0*n-1),0.0));CHKERRQ(ierr);
  return(0);
}

PetscErrorCode VecSet_SeqCUSP(Vec xin,PetscScalar alpha)
{
  CUSPARRAY      *xarray;
  PetscErrorCode ierr;

  /* if there's a faster way to do the case alpha=0.0 on the GPU we should do that */
  ierr = VecCUSPGetArrayWrite(xin,&xarray);CHKERRQ(ierr);
  try {
    cusp::blas::fill(*xarray,alpha);
  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = VecCUSPRestoreArrayWrite(xin,&xarray);CHKERRQ(ierr);
  return(0);
}

PetscErrorCode VecScale_SeqCUSP(Vec xin, PetscScalar alpha)
{
  CUSPARRAY      *xarray;
  PetscErrorCode ierr;

  if (alpha == 0.0) {
    ierr = VecSet_SeqCUSP(xin,alpha);CHKERRQ(ierr);
  } else if (alpha != 1.0) {
    ierr = VecCUSPGetArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
    try {
      cusp::blas::scal(*xarray,alpha);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = VecCUSPRestoreArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(xin->map->n);CHKERRQ(ierr);
  return(0);
}

PetscErrorCode VecTDot_SeqCUSP(Vec xin,Vec yin,PetscScalar *z)
{
#if defined(PETSC_USE_COMPLEX)
  PetscScalar    *ya,*xa;
#endif
  CUSPARRAY      *xarray,*yarray;
  PetscErrorCode ierr;

#if defined(PETSC_USE_COMPLEX)
  /* Not working for complex */
#else
  ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(yin,&yarray);CHKERRQ(ierr);
  try {
    *z = cusp::blas::dot(*xarray,*yarray);
  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
#endif
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  if (xin->map->n > 0) {
    ierr = PetscLogFlops(2.0*xin->map->n-1);CHKERRQ(ierr);
  }
  return(0);
}

PetscErrorCode VecCopy_SeqCUSP(Vec xin,Vec yin)
{
  CUSPARRAY      *xarray,*yarray;
  PetscErrorCode ierr;

  if (xin != yin) {
    if (xin->valid_GPU_array == PETSC_CUSP_GPU) {
      ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
      ierr = VecCUSPGetArrayWrite(yin,&yarray);CHKERRQ(ierr);
      try {
        cusp::blas::copy(*xarray,*yarray);
      } catch(char* ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
      }
      ierr = WaitForGPU();CHKERRCUSP(ierr);
      ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
      ierr = VecCUSPRestoreArrayWrite(yin,&yarray);CHKERRQ(ierr);
    } else if (xin->valid_GPU_array == PETSC_CUSP_CPU || xin->valid_GPU_array == PETSC_CUSP_UNALLOCATED) {
      /* copy on the CPU if we are on the CPU */
      ierr = VecCopy_Seq(xin,yin);CHKERRQ(ierr);
    } else if (xin->valid_GPU_array == PETSC_CUSP_BOTH) {
      /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to use it next) */
      if (yin->valid_GPU_array == PETSC_CUSP_CPU) {
        /* copy on the CPU */
        ierr = VecCopy_Seq(xin,yin);CHKERRQ(ierr);
      } else if (yin->valid_GPU_array == PETSC_CUSP_GPU) {
        /* copy on the GPU */
        ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
        ierr = VecCUSPGetArrayWrite(yin,&yarray);CHKERRQ(ierr);
        try {
          cusp::blas::copy(*xarray,*yarray);
          ierr = WaitForGPU();CHKERRCUSP(ierr);
        } catch(char* ex) {
          SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
        }
        ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
        ierr = VecCUSPRestoreArrayWrite(yin,&yarray);CHKERRQ(ierr);
      } else if (yin->valid_GPU_array == PETSC_CUSP_BOTH) {
        /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
           default to a copy on the GPU (this is an arbitrary choice) */
        ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
        ierr = VecCUSPGetArrayWrite(yin,&yarray);CHKERRQ(ierr);
        try {
          cusp::blas::copy(*xarray,*yarray);
          ierr = WaitForGPU();CHKERRCUSP(ierr);
        } catch(char* ex) {
          SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
        }
        ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
        ierr = VecCUSPRestoreArrayWrite(yin,&yarray);CHKERRQ(ierr);
      } else {
        ierr = VecCopy_Seq(xin,yin);CHKERRQ(ierr);
      }
    }
  }
  return(0);
}

PetscErrorCode VecSwap_SeqCUSP(Vec xin,Vec yin)
{
  PetscErrorCode ierr;
  PetscBLASInt   one = 1,bn = PetscBLASIntCast(xin->map->n);
  CUSPARRAY      *xarray,*yarray;

  if (xin != yin) {
    ierr = VecCUSPGetArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
#if defined(PETSC_USE_REAL_SINGLE)
    cublasSswap(bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);
#else
    cublasDswap(bn,VecCUSPCastToRawPtr(*xarray),one,VecCUSPCastToRawPtr(*yarray),one);
#endif
    ierr = cublasGetError();CHKERRCUSP(ierr);
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    ierr = VecCUSPRestoreArrayReadWrite(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
  }
  return(0);
}

struct VecCUSPAX
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<2>(t);
  }
};

PetscErrorCode VecAXPBY_SeqCUSP(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
{
  PetscErrorCode ierr;
  PetscScalar    a = alpha,b = beta;
  CUSPARRAY      *xarray,*yarray;

  if (a == 0.0) {
    ierr = VecScale_SeqCUSP(yin,beta);CHKERRQ(ierr);
  } else if (b == 1.0) {
    ierr = VecAXPY_SeqCUSP(yin,alpha,xin);CHKERRQ(ierr);
  } else if (a == 1.0) {
    ierr = VecAYPX_SeqCUSP(yin,beta,xin);CHKERRQ(ierr);
  } else if (b == 0.0) {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
    try {
      thrust::for_each(
        thrust::make_zip_iterator(
          thrust::make_tuple(
            yarray->begin(),
            thrust::make_constant_iterator(a),
            xarray->begin())),
        thrust::make_zip_iterator(
          thrust::make_tuple(
            yarray->end(),
            thrust::make_constant_iterator(a),
            xarray->end())),
        VecCUSPAX());
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = PetscLogFlops(xin->map->n);CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
  } else {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
    try {
      cusp::blas::axpby(*xarray,*yarray,*yarray,a,b);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayReadWrite(yin,&yarray);CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    ierr = PetscLogFlops(3.0*xin->map->n);CHKERRQ(ierr);
  }
  return(0);
}

/* structs below are for special cases of VecAXPBYPCZ_SeqCUSP */
struct VecCUSPXPBYPCZ
{
  /* z = x + b*y + c*z */
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t)*thrust::get<0>(t)+thrust::get<2>(t)+thrust::get<4>(t)*thrust::get<3>(t);
  }
};

struct VecCUSPAXPBYPZ
{
  /* z = a*x + b*y + z */
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) += thrust::get<2>(t)*thrust::get<1>(t)+thrust::get<4>(t)*thrust::get<3>(t);
  }
};

PetscErrorCode VecAXPBYPCZ_SeqCUSP(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
{
  PetscErrorCode ierr;
  PetscInt       n = zin->map->n;
  CUSPARRAY      *xarray,*yarray,*zarray;

  ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayReadWrite(zin,&zarray);CHKERRQ(ierr);
  if (alpha == 1.0) {
    try {
      thrust::for_each(
        thrust::make_zip_iterator(
          thrust::make_tuple(
            zarray->begin(),
            thrust::make_constant_iterator(gamma),
            xarray->begin(),
            yarray->begin(),
            thrust::make_constant_iterator(beta))),
        thrust::make_zip_iterator(
          thrust::make_tuple(
            zarray->end(),
            thrust::make_constant_iterator(gamma),
            xarray->end(),
            yarray->end(),
            thrust::make_constant_iterator(beta))),
        VecCUSPXPBYPCZ());
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = PetscLogFlops(4.0*n);CHKERRQ(ierr);
  } else if (gamma == 1.0) {
    try {
      thrust::for_each(
        thrust::make_zip_iterator(
          thrust::make_tuple(
            zarray->begin(),
            xarray->begin(),
            thrust::make_constant_iterator(alpha),
            yarray->begin(),
            thrust::make_constant_iterator(beta))),
        thrust::make_zip_iterator(
          thrust::make_tuple(
            zarray->end(),
            xarray->end(),
            thrust::make_constant_iterator(alpha),
            yarray->end(),
            thrust::make_constant_iterator(beta))),
        VecCUSPAXPBYPZ());
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = PetscLogFlops(4.0*n);CHKERRQ(ierr);
  } else {
    try {
      cusp::blas::axpbypcz(*xarray,*yarray,*zarray,*zarray,alpha,beta,gamma);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = PetscLogFlops(5.0*n);CHKERRQ(ierr);
  }
  ierr = VecCUSPRestoreArrayReadWrite(zin,&zarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  return(0);
}

PetscErrorCode VecPointwiseMult_SeqCUSP(Vec win,Vec xin,Vec yin)
{
  PetscErrorCode ierr;
  PetscInt       n = win->map->n;
  CUSPARRAY      *xarray,*yarray,*warray;

  ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayReadWrite(win,&warray);CHKERRQ(ierr);
  try {
    cusp::blas::xmy(*xarray,*yarray,*warray);
  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(yin,&yarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayReadWrite(win,&warray);CHKERRQ(ierr);
  ierr = PetscLogFlops(n);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  return(0);
}

/* should do the infinity norm in cusp as well; a sketch follows this function */

PetscErrorCode VecNorm_SeqCUSP(Vec xin,NormType type,PetscReal* z)
{
  const PetscScalar *xx;
  PetscErrorCode    ierr;
  PetscInt          n = xin->map->n;
  PetscBLASInt      one = 1, bn = PetscBLASIntCast(n);
  CUSPARRAY         *xarray;

  if (type == NORM_2 || type == NORM_FROBENIUS) {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
    try {
      *z = cusp::blas::nrm2(*xarray);
    } catch(char* ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = PetscLogFlops(PetscMax(2.0*n-1,0.0));CHKERRQ(ierr);
  } else if (type == NORM_INFINITY) {
    PetscInt  i;
    PetscReal max = 0.0,tmp;

    ierr = VecGetArrayRead(xin,&xx);CHKERRQ(ierr);
    for (i=0; i<n; i++) {
      if ((tmp = PetscAbsScalar(*xx)) > max) max = tmp;
      /* check special case of tmp == NaN */
      if (tmp != tmp) {max = tmp; break;}
      xx++;
    }
    ierr = VecRestoreArrayRead(xin,&xx);CHKERRQ(ierr);
    *z   = max;
  } else if (type == NORM_1) {
    ierr = VecCUSPGetArrayRead(xin,&xarray);CHKERRQ(ierr);
#if defined(PETSC_USE_REAL_SINGLE)
    *z = cublasSasum(bn,VecCUSPCastToRawPtr(*xarray),one);
#else
    *z = cublasDasum(bn,VecCUSPCastToRawPtr(*xarray),one);
#endif
    ierr = cublasGetError();CHKERRCUSP(ierr);
    ierr = VecCUSPRestoreArrayRead(xin,&xarray);CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUSP(ierr);
    ierr = PetscLogFlops(PetscMax(n-1.0,0.0));CHKERRQ(ierr);
  } else if (type == NORM_1_AND_2) {
    ierr = VecNorm_SeqCUSP(xin,NORM_1,z);CHKERRQ(ierr);
    ierr = VecNorm_SeqCUSP(xin,NORM_2,z+1);CHKERRQ(ierr);
  }
  return(0);
}

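/* A possible GPU infinity norm for the TODO above (a hedged sketch, not used here; it relies on
   thrust::transform_reduce with thrust::maximum, and AbsoluteValue is a hypothetical unary
   functor returning |x| that would have to be written):

     *z = thrust::transform_reduce(xarray->begin(),xarray->end(),
                                   AbsoluteValue(),
                                   (PetscReal)0.0,
                                   thrust::maximum<PetscReal>());
*/
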
/* the following few functions should be modified to actually work with the GPU so they don't force unnecessary allocation of CPU memory */

PetscErrorCode VecSetRandom_SeqCUSP(Vec xin,PetscRandom r)
{
  PetscErrorCode ierr;

  ierr = VecSetRandom_Seq(xin,r);CHKERRQ(ierr);
  if (xin->valid_GPU_array != PETSC_CUSP_UNALLOCATED) {
    xin->valid_GPU_array = PETSC_CUSP_CPU;
  }
  return(0);
}

PetscErrorCode VecResetArray_SeqCUSP(Vec vin)
{
  PetscErrorCode ierr;

  ierr = VecCUSPCopyFromGPU(vin);CHKERRCUSP(ierr);
  ierr = VecResetArray_Seq(vin);CHKERRQ(ierr);
  if (vin->valid_GPU_array != PETSC_CUSP_UNALLOCATED) {
    vin->valid_GPU_array = PETSC_CUSP_CPU;
  }
  return(0);
}

PetscErrorCode VecPlaceArray_SeqCUSP(Vec vin,const PetscScalar *a)
{
  PetscErrorCode ierr;

  ierr = VecCUSPCopyFromGPU(vin);CHKERRCUSP(ierr);
  ierr = VecPlaceArray_Seq(vin,a);CHKERRQ(ierr);
  if (vin->valid_GPU_array != PETSC_CUSP_UNALLOCATED) {
    vin->valid_GPU_array = PETSC_CUSP_CPU;
  }
  return(0);
}

PetscErrorCode VecReplaceArray_SeqCUSP(Vec vin,const PetscScalar *a)
{
  PetscErrorCode ierr;

  ierr = VecCUSPCopyFromGPU(vin);CHKERRCUSP(ierr);
  ierr = VecReplaceArray_Seq(vin,a);CHKERRQ(ierr);
  if (vin->valid_GPU_array != PETSC_CUSP_UNALLOCATED) {
    vin->valid_GPU_array = PETSC_CUSP_CPU;
  }
  return(0);
}

/*@
   VecCreateSeqCUSP - Creates a standard, sequential, array-style vector.

   Collective on MPI_Comm

   Input Parameters:
+  comm - the communicator, should be PETSC_COMM_SELF
-  n - the vector length

   Output Parameter:
.  V - the vector

   Notes:
   Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
   same type as an existing vector.

   Level: intermediate

   Concepts: vectors^creating sequential

.seealso: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
@*/
PetscErrorCode VecCreateSeqCUSP(MPI_Comm comm,PetscInt n,Vec *v)
{
  PetscErrorCode ierr;

  ierr = VecCreate(comm,v);CHKERRQ(ierr);
  ierr = VecSetSizes(*v,n,n);CHKERRQ(ierr);
  ierr = VecSetType(*v,VECSEQCUSP);CHKERRQ(ierr);
  return(0);
}

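/* For example (a minimal sketch; PETSC_COMM_SELF is required since the type is sequential, and
   the GPU array is allocated lazily on first use via the allocate-check path above):

     Vec            v;
     PetscErrorCode ierr;

     ierr = VecCreateSeqCUSP(PETSC_COMM_SELF,100,&v);CHKERRQ(ierr);
     ... fill and use v ...
     ierr = VecDestroy(&v);CHKERRQ(ierr);
*/
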
/* The following template functions are for VecDotNorm2_SeqCUSP. Note that there is no complex support as currently written */
template <typename T>
struct cuspdotnormcalculate : thrust::unary_function<T,T>
{
  __host__ __device__
  T operator()(T x)
  {
    return thrust::make_tuple(thrust::get<0>(x)*thrust::get<1>(x),thrust::get<1>(x)*thrust::get<1>(x));
  }
};

template <typename T>
struct cuspdotnormreduce : thrust::binary_function<T,T,T>
{
  __host__ __device__
  T operator()(T x,T y)
  {
    return thrust::make_tuple(thrust::get<0>(x)+thrust::get<0>(y),thrust::get<1>(x)+thrust::get<1>(y));
  }
};

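/* One transform_reduce pass accumulates the pair (s_i*t_i, t_i*t_i), so VecDotNorm2 returns
   dp = sum_i s_i*t_i and nm = sum_i t_i*t_i (the dot product and the squared norm of t) without
   reading the arrays twice. The scalar equivalent:

     for (i=0; i<n; i++) { dp += s[i]*t[i]; nm += t[i]*t[i]; }
*/
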
PetscErrorCode VecDotNorm2_SeqCUSP(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
{
  PetscErrorCode ierr;
  PetscScalar    zero = 0.0;
  PetscInt       n = s->map->n;
  thrust::tuple<PetscScalar,PetscScalar> result;
  CUSPARRAY      *sarray,*tarray;

  /* ierr = VecCUSPCopyToGPU(s);CHKERRCUSP(ierr);
     ierr = VecCUSPCopyToGPU(t);CHKERRCUSP(ierr); */
  ierr = VecCUSPGetArrayRead(s,&sarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(t,&tarray);CHKERRQ(ierr);
  try {
    result = thrust::transform_reduce(
               thrust::make_zip_iterator(
                 thrust::make_tuple(
                   sarray->begin(),
                   tarray->begin())),
               thrust::make_zip_iterator(
                 thrust::make_tuple(
                   sarray->end(),
                   tarray->end())),
               cuspdotnormcalculate<thrust::tuple<PetscScalar,PetscScalar> >(),
               thrust::make_tuple(zero,zero),                            /* init */
               cuspdotnormreduce<thrust::tuple<PetscScalar,PetscScalar> >()); /* binary function */
    *dp = thrust::get<0>(result);
    *nm = thrust::get<1>(result);
  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = VecCUSPRestoreArrayRead(s,&sarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayRead(t,&tarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(4.0*n);CHKERRQ(ierr);
  return(0);
}

PetscErrorCode VecDuplicate_SeqCUSP(Vec win,Vec *V)
{
  PetscErrorCode ierr;

  ierr = VecCreateSeqCUSP(((PetscObject)win)->comm,win->map->n,V);CHKERRQ(ierr);
  ierr = PetscLayoutReference(win->map,&(*V)->map);CHKERRQ(ierr);
  ierr = PetscOListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);CHKERRQ(ierr);
  ierr = PetscFListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);CHKERRQ(ierr);
  (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
  return(0);
}

PetscErrorCode VecDestroy_SeqCUSP(Vec v)
{
  PetscErrorCode ierr;

  try {
    if (v->spptr) {
      delete ((Vec_CUSP*)v->spptr)->GPUarray;
      delete (Vec_CUSP*)v->spptr;
    }
  } catch(char* ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
  }
  ierr = VecDestroy_Seq(v);CHKERRQ(ierr);
  return(0);
}

PetscErrorCode VecCreate_SeqCUSP(Vec V)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  ierr = MPI_Comm_size(((PetscObject)V)->comm,&size);CHKERRQ(ierr);
  if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQCUSP on more than one process");
  ierr = VecCreate_Seq_Private(V,0);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)V,VECSEQCUSP);CHKERRQ(ierr);
  V->ops->dot             = VecDot_SeqCUSP;
  V->ops->norm            = VecNorm_SeqCUSP;
  V->ops->tdot            = VecTDot_SeqCUSP;
  V->ops->scale           = VecScale_SeqCUSP;
  V->ops->copy            = VecCopy_SeqCUSP;
  V->ops->set             = VecSet_SeqCUSP;
  V->ops->swap            = VecSwap_SeqCUSP;
  V->ops->axpy            = VecAXPY_SeqCUSP;
  V->ops->axpby           = VecAXPBY_SeqCUSP;
  V->ops->axpbypcz        = VecAXPBYPCZ_SeqCUSP;
  V->ops->pointwisemult   = VecPointwiseMult_SeqCUSP;
  V->ops->pointwisedivide = VecPointwiseDivide_SeqCUSP;
  V->ops->setrandom       = VecSetRandom_SeqCUSP;
  V->ops->dot_local       = VecDot_SeqCUSP;
  V->ops->tdot_local      = VecTDot_SeqCUSP;
  V->ops->norm_local      = VecNorm_SeqCUSP;
  V->ops->mdot_local      = VecMDot_SeqCUSP;
  V->ops->maxpy           = VecMAXPY_SeqCUSP;
  V->ops->mdot            = VecMDot_SeqCUSP;
  V->ops->aypx            = VecAYPX_SeqCUSP;
  V->ops->waxpy           = VecWAXPY_SeqCUSP;
  V->ops->dotnorm2        = VecDotNorm2_SeqCUSP;
  V->ops->placearray      = VecPlaceArray_SeqCUSP;
  V->ops->replacearray    = VecReplaceArray_SeqCUSP;
  V->ops->resetarray      = VecResetArray_SeqCUSP;
  V->ops->destroy         = VecDestroy_SeqCUSP;
  V->ops->duplicate       = VecDuplicate_SeqCUSP;
  V->valid_GPU_array      = PETSC_CUSP_UNALLOCATED;
  return(0);
}