Actual source code: sveccuda.cu

slepc-3.12.2 2020-01-13
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-2019, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   BV implemented as a single Vec (CUDA version)
*/

#include <slepc/private/bvimpl.h>
#include "../src/sys/classes/bv/impls/svec/svec.h"
#include <petsccublas.h>

#if defined(PETSC_USE_COMPLEX)
#include <thrust/device_ptr.h>
#include <thrust/transform.h>   /* thrust::transform is used by ConjugateCudaArray below */
#endif

#define BLOCKSIZE 64
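/* initial row-panel height for the blocked update in BVMultInPlace_Svec_CUDA;
   it is recomputed from the amount of free device memory when the whole
   work matrix does not fit */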

/*
    B := alpha*A + beta*B

    A,B are nxk (ld=n)
 */
static PetscErrorCode BVAXPY_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscScalar beta,PetscScalar *d_B)
{
  PetscErrorCode ierr;
  PetscBLASInt   m,one=1;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(n_*k_,&m);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (beta!=(PetscScalar)1.0) {
    /* BLAS has no fused "B := alpha*A + beta*B", so scale B first ... */
    cberr = cublasXscal(cublasv2handle,m,&beta,d_B,one);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuFlops(1.0*m);CHKERRQ(ierr);
  }
  /* ... then accumulate alpha*A with axpy */
  cberr = cublasXaxpy(cublasv2handle,m,&alpha,d_A,one,d_B,one);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*m);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    C := alpha*A*B + beta*C
*/
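/* implementation notes: the coefficient matrix Q lives in host memory, so it
   is copied to a temporary device buffer before a single cuBLAS gemm on the
   active columns; when Q is NULL the operation degenerates to
   Y := alpha*X + beta*Y and is handled by BVAXPY_BLAS_CUDA instead */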
PetscErrorCode BVMult_Svec_CUDA(BV Y,PetscScalar alpha,PetscScalar beta,BV X,Mat Q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *y = (BV_SVEC*)Y->data,*x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*q,*d_q,*d_B,*d_C;
  PetscInt          ldq,mq;
  PetscBLASInt      m,n,k,ldq_;
  cublasStatus_t    cberr;
  cudaError_t       err;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  if (!Y->n) PetscFunctionReturn(0);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDAGetArrayWrite(y->v,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDAGetArray(y->v,&d_py);CHKERRQ(ierr);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_C = d_py+(Y->nc+Y->l)*Y->n;
  if (Q) {
    ierr = PetscBLASIntCast(Y->n,&m);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(Y->k-Y->l,&n);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(X->k-X->l,&k);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = MatGetSize(Q,&ldq,&mq);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(ldq,&ldq_);CHKERRQ(ierr);
    ierr = MatDenseGetArray(Q,&q);CHKERRQ(ierr);
    err = cudaMalloc((void**)&d_q,ldq*mq*sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(d_q,q,ldq*mq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscLogCpuToGpu(ldq*mq*sizeof(PetscScalar));CHKERRQ(ierr);
    d_B = d_q+Y->l*ldq+X->l;
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,m,d_B,ldq_,&beta,d_C,m);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = MatDenseRestoreArray(Q,&q);CHKERRQ(ierr);
    err = cudaFree(d_q);CHKERRCUDA(err);
    ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  } else {
    ierr = BVAXPY_BLAS_CUDA(Y,Y->n,Y->k-Y->l,alpha,d_A,beta,d_C);CHKERRQ(ierr);
  }
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(y->v,&d_py);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    y := alpha*A*x + beta*y
*/
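/* the coefficients q either come from the internal buffer vector (q==NULL),
   which already resides on the device, or are copied host-to-device into a
   temporary buffer that is freed afterwards; the product itself is a single
   cuBLAS gemv on the active columns */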
PetscErrorCode BVMultVec_Svec_CUDA(BV X,PetscScalar alpha,PetscScalar beta,Vec y,PetscScalar *q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*d_q,*d_x,*d_y;
  PetscBLASInt      n,k,one=1;
  cublasStatus_t    cberr;
  cudaError_t       err;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(X->n,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(X->k-X->l,&k);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDAGetArrayWrite(y,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDAGetArray(y,&d_py);CHKERRQ(ierr);
  }
  if (!q) {
    ierr = VecCUDAGetArray(X->buffer,&d_q);CHKERRQ(ierr);
  } else {
    err = cudaMalloc((void**)&d_q,k*sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(d_q,q,k*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscLogCpuToGpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_q;
  d_y = d_py;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,n,d_x,one,&beta,d_y,one);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDARestoreArrayWrite(y,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDARestoreArray(y,&d_py);CHKERRQ(ierr);
  }
  if (!q) {
    ierr = VecCUDARestoreArray(X->buffer,&d_q);CHKERRQ(ierr);
  } else {
    err = cudaFree(d_q);CHKERRCUDA(err);
  }
  ierr = PetscLogGpuFlops(2.0*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    A(:,s:e-1) := A*B(:,s:e-1)
*/
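/* the product cannot be formed in place by gemm, so a device workspace is
   used: the whole m-by-n result if it fits in free memory, otherwise panels
   of bs rows at a time (remainder rows first, then full panels), copying
   each piece back into V afterwards */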
PetscErrorCode BVMultInPlace_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)V->data;
  PetscScalar    *d_pv,*q,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  PetscInt       j,ldq,nq;
  PetscBLASInt   m,n,k,l,ldq_,bs=BLOCKSIZE;
  cublasStatus_t cberr;
  cudaError_t    err;
  size_t         freemem,totmem;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  if (!V->n) PetscFunctionReturn(0);
  ierr = PetscBLASIntCast(V->n,&m);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(e-s,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(V->k-V->l,&k);CHKERRQ(ierr);
  ierr = MatGetSize(Q,&ldq,&nq);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(ldq,&ldq_);CHKERRQ(ierr);
  ierr = VecCUDAGetArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = MatDenseGetArray(Q,&q);CHKERRQ(ierr);
  err = cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  ierr = PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* try to allocate the whole matrix */
  err = cudaMemGetInfo(&freemem,&totmem);CHKERRCUDA(err);
  if (freemem>=m*n*sizeof(PetscScalar)) {
    err = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(err);
    d_A = d_pv+(V->nc+V->l)*m;
    d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m);CHKERRCUBLAS(cberr);
    for (j=0;j<n;j++) {
      err = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
    }
  } else {
    bs = freemem/(m*sizeof(PetscScalar));
    err = cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar));CHKERRCUDA(err);
    l = m % bs;   /* leftover rows that do not fill a whole panel */
    if (l) {
      d_A = d_pv+(V->nc+V->l)*m;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,l,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,l);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        err = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*l),l*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
      }
    }
    for (;l<m;l+=bs) {
      d_A = d_pv+(V->nc+V->l)*m+l;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,bs,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,bs);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        err = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*bs),bs*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
      }
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = MatDenseRestoreArray(Q,&q);CHKERRQ(ierr);
  err = cudaFree(d_q);CHKERRCUDA(err);
  err = cudaFree(d_work);CHKERRCUDA(err);
  ierr = VecCUDARestoreArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    A(:,s:e-1) := A*B(s:e-1,:)'
*/
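/* same strategy as BVMultInPlace_Svec_CUDA, but multiplying by the conjugate
   transpose of the coefficient matrix (CUBLAS_OP_C) and always allocating
   the whole workspace at once */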
PetscErrorCode BVMultInPlaceTranspose_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)V->data;
  PetscScalar    *d_pv,*q,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  PetscInt       j,ldq,nq;
  PetscBLASInt   m,n,k,ldq_;
  cublasStatus_t cberr;
  cudaError_t    err;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  if (!V->n) PetscFunctionReturn(0);
  ierr = PetscBLASIntCast(V->n,&m);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(e-s,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(V->k-V->l,&k);CHKERRQ(ierr);
  ierr = MatGetSize(Q,&ldq,&nq);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(ldq,&ldq_);CHKERRQ(ierr);
  ierr = VecCUDAGetArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = MatDenseGetArray(Q,&q);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  err = cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  ierr = PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  err = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(err);
  d_A = d_pv+(V->nc+V->l)*m;
  d_B = d_q+V->l*ldq+s;
  cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_C,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m);CHKERRCUBLAS(cberr);
  for (j=0;j<n;j++) {
    err = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = MatDenseRestoreArray(Q,&q);CHKERRQ(ierr);
  err = cudaFree(d_q);CHKERRCUDA(err);
  err = cudaFree(d_work);CHKERRCUDA(err);
  ierr = VecCUDARestoreArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    C := A'*B
*/
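/* the m-by-n block of inner products is computed on the GPU and copied back
   to the host matrix M; in the parallel case each process contributes its
   local product and the result is combined with MPI_Allreduce, writing
   directly into M when its leading dimension equals m and through an extra
   host buffer otherwise */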
PetscErrorCode BVDot_Svec_CUDA(BV X,BV Y,Mat M)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data,*y = (BV_SVEC*)Y->data;
  const PetscScalar *d_px,*d_py,*d_A,*d_B;
  PetscScalar       *pm,*d_work,sone=1.0,szero=0.0,*C,*CC;
  PetscInt          j,ldm;
  PetscBLASInt      m,n,k,ldm_;
  PetscMPIInt       len;
  cublasStatus_t    cberr;
  cudaError_t       err;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(Y->k-Y->l,&m);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(X->k-X->l,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(X->n,&k);CHKERRQ(ierr);
  ierr = MatGetSize(M,&ldm,NULL);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(ldm,&ldm_);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(y->v,&d_py);CHKERRQ(ierr);
  ierr = MatDenseGetArray(M,&pm);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  err = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(err);
  d_A = d_py+(Y->nc+Y->l)*Y->n;
  d_B = d_px+(X->nc+X->l)*X->n;
  C = pm+X->l*ldm+Y->l;
  if (x->mpi) {
    if (ldm==m) {
      ierr = BVAllocateWork_Private(X,m*n);CHKERRQ(ierr);
      if (k) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,ldm_);CHKERRCUBLAS(cberr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
        err = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
        ierr = PetscLogGpuToCpu(m*n*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        ierr = PetscArrayzero(X->work,m*n);CHKERRQ(ierr);
      }
      ierr = PetscMPIIntCast(m*n,&len);CHKERRQ(ierr);
      ierr = MPI_Allreduce(X->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRQ(ierr);
    } else {
      ierr = BVAllocateWork_Private(X,2*m*n);CHKERRQ(ierr);
      CC = X->work+m*n;
      if (k) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
        err = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
        ierr = PetscLogGpuToCpu(m*n*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        ierr = PetscArrayzero(X->work,m*n);CHKERRQ(ierr);
      }
      ierr = PetscMPIIntCast(m*n,&len);CHKERRQ(ierr);
      ierr = MPI_Allreduce(X->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRQ(ierr);
      for (j=0;j<n;j++) {
        ierr = PetscArraycpy(C+j*ldm,CC+j*m,m);CHKERRQ(ierr);
      }
    }
  } else {
    if (k) {
      ierr = BVAllocateWork_Private(X,m*n);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      err = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      ierr = PetscLogGpuToCpu(m*n*sizeof(PetscScalar));CHKERRQ(ierr);
      for (j=0;j<n;j++) {
        ierr = PetscArraycpy(C+j*ldm,X->work+j*m,m);CHKERRQ(ierr);
      }
    }
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  err = cudaFree(d_work);CHKERRCUDA(err);
  ierr = MatDenseRestoreArray(M,&pm);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(y->v,&d_py);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#if defined(PETSC_USE_COMPLEX)
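/* with CUBLAS_OP_C the 1-row gemm used below computes x^H*A, whose entries
   are the conjugates of the desired y = A^H*x, so in complex arithmetic the
   result is conjugated in place afterwards using a thrust transform */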
struct conjugate
{
  __host__ __device__
    PetscScalar operator()(PetscScalar x)
    {
      return PetscConj(x);
    }
};

PetscErrorCode ConjugateCudaArray(PetscScalar *a, PetscInt n)
{
  PetscErrorCode                  ierr;
  thrust::device_ptr<PetscScalar> ptr;

  PetscFunctionBegin;
  try {
    ptr = thrust::device_pointer_cast(a);
    thrust::transform(ptr,ptr+n,ptr,conjugate());
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscFunctionReturn(0);
}
#endif

/*
    y := A'*x computed as y' := x'*A
*/
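/* all k dot products are obtained with a single 1-row gemm; in the parallel
   case the local result is copied to the host and reduced across processes
   with MPI_Allreduce, landing either in the caller's array q or in the
   internal buffer vector when q is NULL */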
PetscErrorCode BVDotVec_Svec_CUDA(BV X,Vec y,PetscScalar *q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_work,szero=0.0,sone=1.0,*qq=q;
  PetscBLASInt      n,k,one=1;
  PetscMPIInt       len;
  Vec               z = y;
  cublasStatus_t    cberr;
  cudaError_t       err;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(X->n,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(X->k-X->l,&k);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  if (X->matrix) {
    ierr = BV_IPMatMult(X,y);CHKERRQ(ierr);
    z = X->Bx;
  }
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(z,&d_py);CHKERRQ(ierr);
  if (!q) {
    ierr = VecCUDAGetArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
  } else {
    err = cudaMalloc((void**)&d_work,k*sizeof(PetscScalar));CHKERRCUDA(err);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
  if (x->mpi) {
    ierr = BVAllocateWork_Private(X,k);CHKERRQ(ierr);
    if (n) {
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
      ierr = ConjugateCudaArray(d_work,k);CHKERRQ(ierr);
#else
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
#endif
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      err = cudaMemcpy(X->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      ierr = PetscLogGpuToCpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      ierr = PetscArrayzero(X->work,k);CHKERRQ(ierr);
    }
    if (!q) {
      ierr = VecCUDARestoreArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
      ierr = VecGetArray(X->buffer,&qq);CHKERRQ(ierr);
    } else {
      err = cudaFree(d_work);CHKERRCUDA(err);
    }
    ierr = PetscMPIIntCast(k,&len);CHKERRQ(ierr);
    ierr = MPI_Allreduce(X->work,qq,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRQ(ierr);
    if (!q) { ierr = VecRestoreArray(X->buffer,&qq);CHKERRQ(ierr); }
  } else {
    if (n) {
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
      ierr = ConjugateCudaArray(d_work,k);CHKERRQ(ierr);
#else
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
#endif
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
    if (!q) {
      ierr = VecCUDARestoreArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
    } else {
      err = cudaMemcpy(q,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      ierr = PetscLogGpuToCpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
      err = cudaFree(d_work);CHKERRCUDA(err);
    }
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = VecCUDARestoreArrayRead(z,&d_py);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    y := A'*x computed as y' := x'*A
*/
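/* same as BVDotVec_Svec_CUDA but without the parallel reduction: only the
   local part of the dot products is returned in m */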
PetscErrorCode BVDotVec_Local_Svec_CUDA(BV X,Vec y,PetscScalar *m)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_y,szero=0.0,sone=1.0;
  PetscBLASInt      n,k,one=1;
  Vec               z = y;
  cublasStatus_t    cberr;
  cudaError_t       err;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  ierr = PetscBLASIntCast(X->n,&n);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(X->k-X->l,&k);CHKERRQ(ierr);
  if (X->matrix) {
    ierr = BV_IPMatMult(X,y);CHKERRQ(ierr);
    z = X->Bx;
  }
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(z,&d_py);CHKERRQ(ierr);
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
  if (n) {
    err = cudaMalloc((void**)&d_y,k*sizeof(PetscScalar));CHKERRCUDA(err);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
    ierr = ConjugateCudaArray(d_y,k);CHKERRQ(ierr);
#else
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
#endif
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    err = cudaMemcpy(m,d_y,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    ierr = PetscLogGpuToCpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
    err = cudaFree(d_y);CHKERRCUDA(err);
  }
  ierr = VecCUDARestoreArrayRead(z,&d_py);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    Scale n scalars
*/
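/* j<0 means scale all active columns at once; alpha==0 is handled with
   cudaMemset, alpha==1 is a no-op, and the general case uses cublasXscal */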
PetscErrorCode BVScale_Svec_CUDA(BV bv,PetscInt j,PetscScalar alpha)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_array,*d_A;
  PetscBLASInt   n,one=1;
  cudaError_t    err;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  ierr = VecCUDAGetArray(ctx->v,&d_array);CHKERRQ(ierr);
  if (j<0) {
    d_A = d_array+(bv->nc+bv->l)*bv->n;
    ierr = PetscBLASIntCast((bv->k-bv->l)*bv->n,&n);CHKERRQ(ierr);
  } else {
    d_A = d_array+(bv->nc+j)*bv->n;
    ierr = PetscBLASIntCast(bv->n,&n);CHKERRQ(ierr);
  }
  if (alpha == (PetscScalar)0.0) {
    err = cudaMemset(d_A,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
  } else if (alpha != (PetscScalar)1.0) {
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXscal(cublasv2handle,n,&alpha,d_A,one);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = PetscLogGpuFlops(1.0*n);CHKERRQ(ierr);
  }
  ierr = VecCUDARestoreArray(ctx->v,&d_array);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

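/*
    W := A*V, column by column: the work vectors V->cv[1] and W->cv[1] are
    made to point into the corresponding column of the contiguous storage
    with VecCUDAPlaceArray, so each MatMult operates directly on the BV data
    without extra copies
*/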
PetscErrorCode BVMatMult_Svec_CUDA(BV V,Mat A,BV W)
{
  PetscErrorCode    ierr;
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pw;
  PetscInt          j;

  PetscFunctionBegin;
  ierr = VecCUDAGetArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  for (j=0;j<V->k-V->l;j++) {
    ierr = VecCUDAPlaceArray(V->cv[1],(PetscScalar*)d_pv+(V->nc+V->l+j)*V->n);CHKERRQ(ierr);
    ierr = VecCUDAPlaceArray(W->cv[1],d_pw+(W->nc+W->l+j)*W->n);CHKERRQ(ierr);
    ierr = MatMult(A,V->cv[1],W->cv[1]);CHKERRQ(ierr);
    ierr = VecCUDAResetArray(V->cv[1]);CHKERRQ(ierr);
    ierr = VecCUDAResetArray(W->cv[1]);CHKERRQ(ierr);
  }
  ierr = VecCUDARestoreArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode BVCopy_Svec_CUDA(BV V,BV W)
{
  PetscErrorCode    ierr;
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv,*d_pvc;
  PetscScalar       *d_pw,*d_pwc;
  cudaError_t       err;

  PetscFunctionBegin;
  ierr = VecCUDAGetArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  d_pvc = d_pv+(V->nc+V->l)*V->n;
  d_pwc = d_pw+(W->nc+W->l)*W->n;
  err = cudaMemcpy(d_pwc,d_pvc,(V->k-V->l)*V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
  ierr = VecCUDARestoreArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode BVCopyColumn_Svec_CUDA(BV V,PetscInt j,PetscInt i)
{
  PetscErrorCode ierr;
  BV_SVEC        *v = (BV_SVEC*)V->data;
  PetscScalar    *d_pv;
  cudaError_t    err;

  PetscFunctionBegin;
  ierr = VecCUDAGetArray(v->v,&d_pv);CHKERRQ(ierr);
  err = cudaMemcpy(d_pv+(V->nc+i)*V->n,d_pv+(V->nc+j)*V->n,V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
  ierr = VecCUDARestoreArray(v->v,&d_pv);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

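/*
    For the right half of a split BV (issplit==2) only the placed device
    pointer is updated to the new offset inside the parent storage;
    presumably the left half needs no action because it starts at the
    beginning of the parent array. For a regular BV a new vector of the
    requested size is created and, if copy is true, the old contents are
    moved with a device-to-device memcpy.
*/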
PetscErrorCode BVResize_Svec_CUDA(BV bv,PetscInt m,PetscBool copy)
{
  PetscErrorCode    ierr;
  BV_SVEC           *ctx = (BV_SVEC*)bv->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pnew,*d_ptr;
  PetscInt          bs,lsplit;
  Vec               vnew,vpar;
  char              str[50];
  cudaError_t       err;
  BV                parent;

  PetscFunctionBegin;
  if (bv->issplit==2) {
    parent = bv->splitparent;
    lsplit = parent->lsplit;
    vpar = ((BV_SVEC*)parent->data)->v;
    ierr = VecCUDAResetArray(ctx->v);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(vpar,&d_ptr);CHKERRQ(ierr);
    ierr = VecCUDAPlaceArray(ctx->v,d_ptr+lsplit*bv->n);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(vpar,&d_ptr);CHKERRQ(ierr);
  } else if (!bv->issplit) {
    ierr = VecGetBlockSize(bv->t,&bs);CHKERRQ(ierr);
    ierr = VecCreate(PetscObjectComm((PetscObject)bv->t),&vnew);CHKERRQ(ierr);
    ierr = VecSetType(vnew,((PetscObject)bv->t)->type_name);CHKERRQ(ierr);
    ierr = VecSetSizes(vnew,m*bv->n,PETSC_DECIDE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(vnew,bs);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)bv,(PetscObject)vnew);CHKERRQ(ierr);
    if (((PetscObject)bv)->name) {
      ierr = PetscSNPrintf(str,50,"%s_0",((PetscObject)bv)->name);CHKERRQ(ierr);
      ierr = PetscObjectSetName((PetscObject)vnew,str);CHKERRQ(ierr);
    }
    if (copy) {
      ierr = VecCUDAGetArrayRead(ctx->v,&d_pv);CHKERRQ(ierr);
      ierr = VecCUDAGetArrayWrite(vnew,&d_pnew);CHKERRQ(ierr);
      err = cudaMemcpy(d_pnew,d_pv,PetscMin(m,bv->m)*bv->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
      ierr = VecCUDARestoreArrayRead(ctx->v,&d_pv);CHKERRQ(ierr);
      ierr = VecCUDARestoreArrayWrite(vnew,&d_pnew);CHKERRQ(ierr);
    }
    ierr = VecDestroy(&ctx->v);CHKERRQ(ierr);
    ctx->v = vnew;
  }
  PetscFunctionReturn(0);
}

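/*
    The requested column is exposed through one of the two internal work
    vectors bv->cv[], whose device array is placed at the column's offset in
    the contiguous storage; BVRestoreColumn_Svec_CUDA undoes the placement
*/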
PetscErrorCode BVGetColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_pv;
  PetscInt       l;

  PetscFunctionBegin;
  l = BVAvailableVec;
  ierr = VecCUDAGetArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDAPlaceArray(bv->cv[l],d_pv+(bv->nc+j)*bv->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode BVRestoreColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscInt       l;

  PetscFunctionBegin;
  l = (j==bv->ci[0])? 0: 1;
  ierr = VecCUDAResetArray(bv->cv[l]);CHKERRQ(ierr);
  ierr = VecCUDARestoreArray(ctx->v,NULL);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

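/*
    If either half of the split BV was modified, the device arrays of the
    child and then of the parent are acquired and immediately released;
    this touches no data but updates PETSc's offload flags so that the GPU
    copy is treated as up to date (see the "force sync flag" comment below)
*/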
PetscErrorCode BVRestoreSplit_Svec_CUDA(BV bv,BV *L,BV *R)
{
  PetscErrorCode    ierr;
  Vec               v;
  const PetscScalar *d_pv;
  PetscObjectState  lstate,rstate;
  PetscBool         change=PETSC_FALSE;

  PetscFunctionBegin;
  /* force sync flag to PETSC_CUDA_BOTH */
  if (L) {
    ierr = PetscObjectStateGet((PetscObject)*L,&lstate);CHKERRQ(ierr);
    if (lstate != bv->lstate) {
      v = ((BV_SVEC*)bv->L->data)->v;
      ierr = VecCUDAGetArrayRead(v,&d_pv);CHKERRQ(ierr);
      ierr = VecCUDARestoreArrayRead(v,&d_pv);CHKERRQ(ierr);
      change = PETSC_TRUE;
    }
  }
  if (R) {
    ierr = PetscObjectStateGet((PetscObject)*R,&rstate);CHKERRQ(ierr);
    if (rstate != bv->rstate) {
      v = ((BV_SVEC*)bv->R->data)->v;
      ierr = VecCUDAGetArrayRead(v,&d_pv);CHKERRQ(ierr);
      ierr = VecCUDARestoreArrayRead(v,&d_pv);CHKERRQ(ierr);
      change = PETSC_TRUE;
    }
  }
  if (change) {
    v = ((BV_SVEC*)bv->data)->v;
    ierr = VecCUDAGetArray(v,(PetscScalar**)&d_pv);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(v,(PetscScalar**)&d_pv);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
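
/*
   Usage sketch (illustrative, not part of the library): these routines are
   reached through the generic BV interface when the BV is of type BVSVEC
   and the underlying Vec is a CUDA vector (e.g. with -vec_type cuda in a
   CUDA-enabled PETSc/SLEPc build). A typical call sequence might be:

     BV  X,Y;
     Mat Q;
     BVCreate(PETSC_COMM_WORLD,&X);
     BVSetType(X,BVSVEC);
     BVSetSizes(X,PETSC_DECIDE,N,k);
     ...
     BVMult(Y,alpha,beta,X,Q);    dispatches to BVMult_Svec_CUDA
     BVScale(X,2.0);              dispatches to BVScale_Svec_CUDA
*/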