Actual source code: aijcusp.cu


  3: /*
  4:     Defines the basic matrix operations for the AIJ (compressed row)
  5:   matrix storage format.
  6: */

  8: #include "petscconf.h"
 10:  #include ../src/mat/impls/aij/seq/aij.h
 11:  #include petscbt.h
 12:  #include ../src/vec/vec/impls/dvecimpl.h
 13:  #include private/vecimpl.h
 15: #undef VecType
 16:  #include ../src/mat/impls/aij/seq/seqcusp/cuspmatimpl.h


 19: #ifdef PETSC_HAVE_TXPETSCGPU

 21: #include "csr_matrix_data.h"
 22: #include "csr_matrix_data_gpu.h"
 23: #include "csr_tri_solve_gpu.h"
 24: #include "csr_tri_solve_gpu_level_scheduler.h"
 25: #include "csr_spmv_inode.h"
 26: #include <algorithm>
 27: #include <vector>
 28: #include <string>
 29: #include <thrust/sort.h>
 30: #include <thrust/fill.h>

 /* Shorthand for the host-side and device-side CSR containers, instantiated on PETSc's index and scalar types. */
 32: #define CSRMATRIXCPU csr_matrix_data<PetscInt,PetscScalar>
 33: #define CSRMATRIXGPU csr_matrix_data_gpu<PetscInt,PetscScalar>

 /*
    Name of the GPU triangular-solve algorithm. The code in this file tests for two values:
      "none"           - MatSolve falls back to the CPU MatSolve_SeqAIJ and no factors are copied to the GPU
      "levelScheduler" - factors are unravelled with the level-scheduler layout
    Any other value selects the hybrid (blocked) unravel in MatLUFactorNumeric_SeqAIJCUSP.
    NOTE(review): nothing in the visible part of the file assigns a value other than "none" - confirm where it is set.
 */
 35: static std::string GPU_TRI_SOLVE_ALGORITHM="none";

 37: struct Mat_SeqAIJCUSPTriFactors {
 /* Pair of opaque handles to the two triangular factors; stored in Mat::spptr of a factored matrix.
    The pointers are cast to Mat_SeqAIJCUSPTriFactorHybrid* or Mat_SeqAIJCUSPTriFactorLevelScheduler*
    depending on which unravel routine / solve algorithm is in use. */
 38:   void *loTriFactorPtr; /* pointer for lower triangular (factored matrix) on GPU */
 39:   void *upTriFactorPtr; /* pointer for upper triangular (factored matrix) on GPU */
 40: };

 42: struct Mat_SeqAIJCUSPInode {
 /* GPU-side data for the inode-based matrix-vector product (see MatMult_SeqAIJCUSP_Inode). */
 43:   CSRMATRIXGPU*       mat; /* pointer to the matrix on the GPU */
 44:   CUSPARRAY*        tempvec; /* pointer to a work vector into which the relevant entries of the vector being multiplied can be copied */
 45:   CUSPINTARRAYGPU*  inodes; /* pointer to an array containing the inode data structure (used when inodes are enabled) */
 46:   PetscInt nnzPerRowMax; /* maximum number of nonzeros in a row ... for shared memory vector size */
 47:   PetscInt nodeMax; /* NOTE(review): original comment was a copy of the nnzPerRowMax one; presumably the maximum inode size - confirm */
 48: };

 50: struct Mat_SeqAIJCUSPTriFactorHybrid {
 /* One triangular factor for the hybrid (blocked) triangular solve; filled in by
    MatCUSPUnravelOrderingAndCopyToGPU, which keeps both a host and a device copy of the factor. */
 51:   CSRMATRIXCPU*    cpuMat; /* pointer to the matrix on the CPU */
 52:   CSRMATRIXGPU*    gpuMat; /* pointer to the matrix on the GPU */
 53:   CUSPARRAY*       tempvecGPU; /* pointer to a work vector (length n) for storing temporary results on the GPU */
 54:   PetscInt *       nnzPerRowInDiagBlock; /* pointer to a CPU array (length n): per row, number of nonzeros that fall inside that row's diagonal block */
 55:   PetscScalar*     tempvecCPU1; /* pointer to a pinned work vector (length n) for storing temporary results on the CPU */
 56:   PetscScalar*     tempvecCPU2; /* pointer to a second pinned work vector (length n) for storing temporary results on the CPU */
 57:   PetscInt   nnz; /* Number of nonzeros in the triangular factor */
 58:   PetscInt   block_size; /* block size (rows per diagonal block) */
 59: };

 61: struct Mat_SeqAIJCUSPTriFactorLevelScheduler {
 /* One triangular factor for the level-scheduled triangular solve; filled in by
    MatCUSPUnravelOrderingToLevelSchedulerAndCopyToGPU. */
 62:   CSRMATRIXGPU*    gpuMat;     /* pointer to the matrix on the GPU */
 63:   CUSPARRAY*       tempvecGPU; /* GPU work vector; for the upper factor it is filled with the per-row diagonal values taken from the factor */
 64:   CUSPINTARRAYGPU*  perms;     /* pointer to an array containing the permutation array (row indices sorted by level) */
 65:   CUSPINTARRAYGPU*  levels;    /* pointer to an array containing the levels data (number of unknowns in each level) */
 66:   CUSPINTARRAYGPU*  ordIndicesGPU; /* For Lower triangular, this is the row permutation. For Upper triangular, this the column permutation */
 67:   PetscInt * levelsCPU;        /* pointer to a CPU copy of the levels data */
 68:   PetscInt   nLevels; /* number of levels */
 69:   PetscInt   levelSum; /* sum of the per-level unknown counts (expected to equal the number of unknowns) */
 70:   PetscInt   maxNumUnknownsAtSameLevel; /* maximum number of unknowns that can be computed simultaneously */
 71: };


 /* Forward declarations of the CUSP-specific factorization, solve and multiply routines;
    they are installed into Mat ops tables by the functions defined further down in this file. */
 75: PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSP(Mat,Mat,IS,IS,const MatFactorInfo*);
 76: PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSP(Mat,Mat,IS,IS,const MatFactorInfo*);
 77: PetscErrorCode MatLUFactorNumeric_SeqAIJCUSP(Mat,Mat,const MatFactorInfo *);
 78: PetscErrorCode MatSolve_SeqAIJCUSP(Mat,Vec,Vec);
 79: PetscErrorCode MatSolve_SeqAIJCUSP_NaturalOrdering(Mat,Vec,Vec);
 80: PetscErrorCode MatMult_SeqAIJCUSP_Inode(Mat,Vec,Vec);


 88: PetscErrorCode MatGetFactor_seqaij_petsccusp(Mat A,MatFactorType ftype,Mat *B)
 89: {
 90:   PetscErrorCode     ierr;

 93:   MatGetFactor_seqaij_petsc(A,ftype,B);

 95:   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT){
 96:     MatSetType(*B,MATSEQAIJCUSP);
 97:     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSP;
 98:     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSP;
 99:   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSP Matrix Types");
100:   (*B)->factortype = ftype;
101:   return(0);
102: }


108: PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSP(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
109: {
110:   PetscErrorCode     ierr;

113:   MatILUFactorSymbolic_SeqAIJ(fact,A,isrow,iscol,info);
114:   (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSP;
115:   return(0);
116: }

120: PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSP(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
121: {
122:   PetscErrorCode     ierr;

125:   MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
126:   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSP;
127:   return(0);
128: }

132: PetscErrorCode MatCUSPUnravelOrderingAndCopyToGPU(Mat A)
133: {
134:   Mat_SeqAIJCUSPTriFactors *cuspTriFactors  = (Mat_SeqAIJCUSPTriFactors*)A->spptr;
135:   Mat_SeqAIJCUSPTriFactorHybrid *cuspstructLo  = (Mat_SeqAIJCUSPTriFactorHybrid*)cuspTriFactors->loTriFactorPtr;
136:   Mat_SeqAIJCUSPTriFactorHybrid *cuspstructUp  = (Mat_SeqAIJCUSPTriFactorHybrid*)cuspTriFactors->upTriFactorPtr;

138:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
139:   PetscInt          n = A->rmap->n;
140:   const PetscInt    *ai = a->i,*aj = a->j,*adiag = a->diag,*vi;
141:   const MatScalar   *aa = a->a,*v;
142:   PetscInt *AiLo, *AjLo, *AiUp, *AjUp;
143:   PetscScalar *AALo, *AAUp;
144:   PetscInt          i,nz, nzLower, nzUpper, offset, rowOffset, j, block_size_counter, nnzBlockLower=0, nnzBlockUpper=0;
145:   PetscErrorCode    ierr;
146:   bool success;


150:   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
151:     // Get the block size from the command line
152:     PetscInt int_value=0;
153:     PetscBool found;
154:     PetscInt block_size=1;
155:     PetscOptionsGetInt(PETSC_NULL, "-gpu_LU_block_size", &int_value, &found);
156:     if(found == PETSC_TRUE) {
157:       if(int_value > 0)
158:         block_size = int_value;
159:       else
160:         printf("Bad argument to -gpu_LU_block_size.  Must be positive.\n");
161:     }
162:     else {
163:       printf("-gpu_LU_block_size positive_int not found.  Use internal formula.\n");
164:       block_size = 1000; //get_gpu_LU_block_size(); // something like that for now
165:     }
166: 
167:     /*************************************************************************/
168:     /* To Unravel the factored matrix into 2 CSR matrices, do the following  */
169:     /* - Calculate the number of nonzeros in the lower triangular sparse     */
170:     /*   including 1's on the diagonal.                                      */
171:     /* - Calculate the number of nonzeros in the upper triangular sparse     */
172:     /*   including arbitrary values on the diagonal.                         */
173:     /* - Fill the Lower triangular portion from the matrix A                 */
174:     /* - Fill the Upper triangular portion from the matrix A                 */
175:     /* - Assign each to a separate cusp data structure                       */
176:     /*************************************************************************/

178:     /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
179:     nzLower=n+ai[n]-ai[1];
180:     /* next, figure out the number of nonzeros in the upper triangular matrix. */
181:     nzUpper = adiag[0]-adiag[n];

183:     cudaError_t err;
184:     /* Allocate Space for the lower triangular matrix */
185:     PetscMalloc(n*sizeof(PetscInt), &(cuspstructLo->nnzPerRowInDiagBlock));
186:     err = cudaMallocHost((void **) &AiLo, (n+1)*sizeof(PetscInt)); CHKERRCUSP(err);
187:     err = cudaMallocHost((void **) &AjLo, nzLower*sizeof(PetscInt)); CHKERRCUSP(err);
188:     err = cudaMallocHost((void **) &AALo, nzLower*sizeof(PetscScalar)); CHKERRCUSP(err);

190:     /* set the number of nonzeros */
191:     cuspstructLo->nnz=nzLower;
192:     cuspstructLo->block_size=block_size;

194:     /* Fill the lower triangular matrix */
195:     AiLo[0]=(PetscInt) 0;
196:     AiLo[n]=nzLower;
197:     AjLo[0]=(PetscInt) 0;
198:     AALo[0]=(MatScalar) 1.0;
199:     v    = aa;
200:     vi   = aj;
201:     offset=1;
202:     rowOffset=1;
203:     cuspstructLo->nnzPerRowInDiagBlock[0]=1;
204:     nnzBlockLower+=1;
205:     block_size_counter=0;
206:     for (i=1; i<n; i++) {
207:       nz  = ai[i+1] - ai[i];
208:       // additional 1 for the term on the diagonal
209:       AiLo[i]=rowOffset;
210:       rowOffset+=nz+1;
211: 
212:       memcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));
213:       memcpy(&(AALo[offset]), v, nz*sizeof(MatScalar));
214: 
215:       offset+=nz;
216:       AjLo[offset]=(PetscInt) i;
217:       AALo[offset]=(MatScalar) 1.0;
218:       offset+=1;

220:       // Count the number of nnz per row in the diagonal blocks.
221:       offset-=nz+1;
222:       if (i%block_size==0 && i>0)
223:         block_size_counter++;
224:       j=0;
225:       while (AjLo[offset+j]<block_size_counter*block_size) j++;
226:       cuspstructLo->nnzPerRowInDiagBlock[i]=nz+1-j;
227:       nnzBlockLower+=cuspstructLo->nnzPerRowInDiagBlock[i];
228:       offset+=nz+1;

230:       v  += nz;
231:       vi += nz;
232:     }

234:     /* set the number of nonzeros */
235:     cuspstructUp->nnz=nzUpper;
236:     cuspstructUp->block_size=block_size;

238:     /* Allocate Space for the upper triangular matrix */
239:     PetscMalloc(n*sizeof(PetscInt), &(cuspstructUp->nnzPerRowInDiagBlock));
240:     err = cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt)); CHKERRCUSP(err);
241:     err = cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt)); CHKERRCUSP(err);
242:     err = cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar)); CHKERRCUSP(err);
243: 
244:     /* Fill the upper triangular matrix */
245:     AiUp[0]=(PetscInt) 0;
246:     AiUp[n]=nzUpper;
247:     offset = nzUpper;
248:     block_size_counter=-1;
249:     for (i=n-1; i>=0; i--){
250:       v   = aa + adiag[i+1] + 1;
251:       vi  = aj + adiag[i+1] + 1;
252: 
253:       // number of elements NOT on the diagonal
254:       nz = adiag[i] - adiag[i+1]-1;
255: 
256:       // decrement the offset
257:       offset -= (nz+1);
258: 
259:       // first, set the diagonal elements
260:       AjUp[offset] = (PetscInt) i;
261:       AAUp[offset] = 1./v[nz];
262:       AiUp[i] = AiUp[i+1] - (nz+1);
263: 
264:       // copy the off diagonal elements
265:       memcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));
266:       memcpy(&(AAUp[offset+1]), v, nz*sizeof(MatScalar));
267:     }

269:     // Count the number of nnz per row in the diagonal blocks.
270:     // need to do this by working from the top of the matrix
271:     block_size_counter=0;
272:     for (i=0; i<n; i++){
273:       if (i%block_size==0 && i>0)
274:         block_size_counter++;
275:       nz = AiUp[i+1]-AiUp[i];
276:       j=0;
277:       while (AjUp[AiUp[i]+j]<(block_size_counter+1)*block_size && j<nz) j++;
278:       cuspstructUp->nnzPerRowInDiagBlock[i]=j;
279:       nnzBlockUpper+=cuspstructUp->nnzPerRowInDiagBlock[i];
280:     }

282:     try {
283:       /* The Lower triangular matrix */
284:       cuspstructLo->cpuMat = new CSRMATRIXCPU(n,n,nzLower,AiLo,AjLo,AALo);
285:       cuspstructLo->gpuMat = new CSRMATRIXGPU;
286:       success = (cuspstructLo->gpuMat)->copy_from_host(*(cuspstructLo->cpuMat));
287:       if (!success) {
288:         printf("Failed in cuspstructLo->gpuMat->copy_from_host\n");
289:         CHKERRCUSP(1);
290:       }
291:       // allocate temporary vectors using pinned memory
292:       err = cudaMallocHost((void **) &(cuspstructLo->tempvecCPU1),
293:                            (size_t) n*sizeof(PetscScalar)); CHKERRCUSP(err);
294:       err = cudaMallocHost((void **) &(cuspstructLo->tempvecCPU2),
295:                            (size_t) n*sizeof(PetscScalar)); CHKERRCUSP(err);

297:       cuspstructLo->tempvecGPU = new CUSPARRAY;
298:       (cuspstructLo->tempvecGPU)->resize(n);
299: 

301:       /* The Upper triangular matrix */
302:       cuspstructUp->cpuMat = new CSRMATRIXCPU(n,n,nzUpper,AiUp,AjUp,AAUp);
303:       cuspstructUp->gpuMat = new CSRMATRIXGPU;
304:       success = (cuspstructUp->gpuMat)->copy_from_host(*(cuspstructUp->cpuMat));
305:       if (!success) {
306:         printf("Failed in cuspstructUp->gpuMat->copy_from_host\n");
307:         CHKERRCUSP(1);
308:       }
309:       // allocate temporary vectors using pinned memory
310:       err = cudaMallocHost((void **) &(cuspstructUp->tempvecCPU1),
311:                            (size_t) n*sizeof(PetscScalar)); CHKERRCUSP(err);
312:       err = cudaMallocHost((void **) &(cuspstructUp->tempvecCPU2),
313:                            (size_t) n*sizeof(PetscScalar)); CHKERRCUSP(err);

315:       cuspstructUp->tempvecGPU = new CUSPARRAY;
316:       (cuspstructUp->tempvecGPU)->resize(n);
317: 
318:     } catch(char* ex) {
319:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
320:     }
321:     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
322:   }
323:   return(0);
324: }


329: PetscErrorCode MatCUSPUnravelOrderingToLevelSchedulerAndCopyToGPU(Mat A)
330: {
331:   PetscErrorCode     ierr;
332:   Mat_SeqAIJCUSPTriFactors *cuspTriFactors  = (Mat_SeqAIJCUSPTriFactors*)A->spptr;
333:   Mat_SeqAIJCUSPTriFactorLevelScheduler *cuspstructLo  = (Mat_SeqAIJCUSPTriFactorLevelScheduler*)cuspTriFactors->loTriFactorPtr;
334:   Mat_SeqAIJCUSPTriFactorLevelScheduler *cuspstructUp  = (Mat_SeqAIJCUSPTriFactorLevelScheduler*)cuspTriFactors->upTriFactorPtr;
335:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
336:   PetscInt          n = A->rmap->n;
337:   const PetscInt    *ai = a->i,*aj = a->j,*adiag = a->diag,*vi;
338:   const MatScalar   *aa = a->a,*v;
339:   PetscInt i, j, max, nz, nzLower, nzUpper, offset;
340:   PetscInt *AiLo, *AjLo, *AiUp, *AjUp, *levelsCPULo, *levelsCPUUp;
341:   PetscScalar *AALo, *AAUp, *AADiag;
342:   bool success;
343:   std::vector<PetscInt> lLo(n);
344:   std::vector<PetscInt> lUp(n);
345:   std::vector<PetscInt> qLo(n);
346:   std::vector<PetscInt> qUp(n);
347:   std::vector<PetscInt> lLoBin(0);
348:   std::vector<PetscInt> lUpBin(0);

351:   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){

353:     PetscBool diagFlag = PETSC_FALSE;
354:     PetscBool diagFlagFull = PETSC_FALSE;
355:     PetscOptionsGetBool(PETSC_NULL, "-level_scheduler_diagnostics_view", &diagFlag, PETSC_NULL);
356:     PetscOptionsGetBool(PETSC_NULL, "-level_scheduler_diagnostics_view_full", &diagFlagFull, PETSC_NULL);

358:     /* initialize to lLo/lUp to 0 ... qLo/qUp to the index */
359:     for (i=0; i<n; i++) { lLo[i] = 0; qLo[i] = i; lUp[i] = 0; qUp[i] = i; }

361:     /* Compute the lower triangular levels */
362:     lLo[0] = 1;
363:     lLoBin.push_back(1);
364:     for (i=1; i<n; i++) {
365:       nz  = ai[i+1] - ai[i];
366:       lLo[i] = 1; max = 1;

368:       for (j=0; j<nz; j++)
369:         max = lLo[ aj[ ai[i]+j ] ]>max ? lLo[ aj[ ai[i]+j ] ] : max;

371:       lLo[i] = 1+max;

373:       if (max>= static_cast<PetscInt>(lLoBin.size()))
374:         lLoBin.push_back(1);
375:       else
376:         lLoBin[max]++;
377:     }
378: 
379:     /* compute the number of levels */
380:     cuspstructLo->nLevels=lLoBin.size();

382:     /* set the maximum number of unknowns at the same level */
383:     cuspstructLo->maxNumUnknownsAtSameLevel=*(std::max_element(&lLoBin[0], &lLoBin[0]+cuspstructLo->nLevels));

385:     /* compute the sum of all the levels */
386:     cuspstructLo->levelSum=0;
387:     for (i=0; i< static_cast<PetscInt>(lLoBin.size()); i++)
388:       cuspstructLo->levelSum+=lLoBin[i];

390:     /* Determine the permutation array through a keyed sort ... easy to do in thrust */
391:     thrust::sort_by_key(&lLo[0], &lLo[0]+n, &qLo[0]);

393:     /* print out level scheduler diagnostics for the lower triangular matrix */
394:     if (diagFlag || diagFlagFull) {
395:       std::cout << std::endl;
396:       std::cout << "nlevels in lower triangular factor="<<cuspstructLo->nLevels<<std::endl;
397:       std::cout << "maxNumUnknownsAtSameLevel in lower triangular factor="<<cuspstructLo->maxNumUnknownsAtSameLevel<<std::endl;
398:       std::cout << "levelSum (should be equal to number of unknowns)="<<cuspstructLo->levelSum<<std::endl;
399:       std::cout << "number of unknowns="<<n<<std::endl;

401:       if (diagFlagFull) {
402:         std::cout << "Ordering of unknowns in the lower triangular matrix"<<std::endl;
403:         std::cout << "==================================================="<<std::endl;
404:         int index = 0;
405:         std::cout << "Level # : number of unknowns at this level :  (level #,  unknown index)" << std::endl;
406:         for (i=0; i<static_cast<PetscInt>(lLoBin.size()); i++) {
407:           std::cout << "Level " << i+1 << " : " << lLoBin[i] << " : ";
408:           for (j=0; j<lLoBin[i]; j++)
409:          std::cout << "  (" << lLo[index + j] << "," << qLo[index + j] <<")";
410:           std::cout << std::endl;
411:           index+=lLoBin[i];
412:         }
413:       }
414:       std::cout << std::endl;
415:     }

417:     /* Compute the upper triangular levels */
418:     lUp[n-1] = 1;
419:     lUpBin.push_back(1);

421:     // set the pointers
422:     v    = aa+ai[n]-ai[1];
423:     vi   = aj+ai[n]-ai[1];

425:     for (i=n-2; i>=0; i--){
426:       // set the pointers
427:       v   = aa + adiag[i+1] + 1;
428:       vi  = aj + adiag[i+1] + 1;
429: 
430:       // number of elements NOT on the diagonal
431:       nz = adiag[i] - adiag[i+1] -1;

433:       lUp[i] = 1; max = 0;

435:       for (j=0; j<nz; j++)
436:         max = lUp[ vi[ j ] ]>max ? lUp[ vi[ j ] ] : max;

438:       lUp[i] = 1+max;
439:       if (max>= static_cast<PetscInt>(lUpBin.size()))
440:         lUpBin.push_back(1);
441:       else
442:         lUpBin[max]++;
443:     }

445:     /* compute the number of levels */
446:     cuspstructUp->nLevels=lUpBin.size();

448:     /* set the maximum number of unknowns at the same level */
449:     cuspstructUp->maxNumUnknownsAtSameLevel=*(std::max_element(&lUpBin[0], &lUpBin[0]+cuspstructUp->nLevels));

451:     /* compute the sum of all the levels */
452:     cuspstructUp->levelSum=0;
453:     for (i=0; i< static_cast<PetscInt>(lUpBin.size()); i++)
454:       cuspstructUp->levelSum+=lUpBin[i];

456:     /* Determine the permutation array through a keyed sort ... easy to do in thrust */
457:     thrust::sort_by_key(&lUp[0], &lUp[0]+n, &qUp[0]);

459:     /* print out level scheduler diagnostics for the upper triangular matrix */
460:     if (diagFlag || diagFlagFull) {
461:       std::cout << std::endl;
462:       std::cout << "nlevels in upper triangular factor="<<cuspstructUp->nLevels<<std::endl;
463:       std::cout << "maxNumUnknownsAtSameLevel in upper triangular factor="<<cuspstructUp->maxNumUnknownsAtSameLevel<<std::endl;
464:       std::cout << "levelSum (should be equal to number of unknowns)="<<cuspstructUp->levelSum<<std::endl;
465:       std::cout << "number of unknowns="<<n<<std::endl;

467:       if (diagFlagFull) {
468:         std::cout << "Ordering of unknowns in the upper triangular matrix"<<std::endl;
469:         std::cout << "==================================================="<<std::endl;
470:         int index = 0;
471:         std::cout << "Level # : number of unknowns at this level :  (level #,  unknown index)" << std::endl;
472:         for (i=0; i<static_cast<PetscInt>(lUpBin.size()); i++) {
473:           std::cout << "Level " << i+1 << " : " << lUpBin[i] << " : ";
474:           for (j=0; j<lUpBin[i]; j++)
475:             std::cout << "  (" << lUp[index + j] << "," << qUp[index + j] <<")";
476:           std::cout << std::endl;
477:           index+=lUpBin[i];
478:         }
479:       }
480:       std::cout << std::endl;
481:     }

483:     PetscMalloc(lLoBin.size()*sizeof(PetscInt), &levelsCPULo);
484:     PetscMalloc(lUpBin.size()*sizeof(PetscInt), &levelsCPUUp);

486:     memcpy(&levelsCPULo[0], &lLoBin[0], lLoBin.size()*sizeof(PetscInt));
487:     memcpy(&levelsCPUUp[0], &lUpBin[0], lUpBin.size()*sizeof(PetscInt));

489:     /*************************************************************************/
490:     /* To Unravel the factored matrix into 2 CSR matrices, do the following  */
491:     /* - Calculate the number of nonzeros in the lower triangular sparse     */
492:     /*   including 1's on the diagonal.                                      */
493:     /* - Calculate the number of nonzeros in the upper triangular sparse     */
494:     /*   including arbitrary values on the diagonal.                         */
495:     /* - Fill the Lower triangular portion from the matrix A                 */
496:     /* - Fill the Upper triangular portion from the matrix A                 */
497:     /* - Assign each to a separate cusp data structure                       */
498:     /*************************************************************************/

500:     /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
501:     nzLower=ai[n]-ai[1];
502:     /* next, figure out the number of nonzeros in the upper triangular matrix ... excluding the diagonal. */
503:     nzUpper = adiag[0]-adiag[n]-n;

505:     /* Set pointers for lower triangular matrices */
506:     AiLo = const_cast<PetscInt *>(ai);
507:     AjLo = const_cast<PetscInt *>(aj);
508:     AALo = const_cast<PetscScalar *>(aa);

510:     /* Allocate Space for the upper triangular matrix */
511:     cudaError_t err = cudaMallocHost((void **) &AiUp, (n+1)*sizeof(PetscInt)); CHKERRCUSP(err);
512:     err = cudaMallocHost((void **) &AjUp, nzUpper*sizeof(PetscInt)); CHKERRCUSP(err);
513:     err = cudaMallocHost((void **) &AAUp, nzUpper*sizeof(PetscScalar)); CHKERRCUSP(err);
514:     err = cudaMallocHost((void **) &AADiag, n*sizeof(PetscScalar)); CHKERRCUSP(err);
515: 
516:     /* Fill the upper triangular matrix */
517:     AiUp[0]=(PetscInt) 0;
518:     AiUp[n]=nzUpper;
519:     offset = nzUpper;
520:     for (i=n-1; i>=0; i--){
521:       v   = aa + adiag[i+1] + 1;
522:       vi  = aj + adiag[i+1] + 1;
523: 
524:       // number of elements NOT on the diagonal
525:       nz = adiag[i] - adiag[i+1]-1;
526: 
527:       // decrement the offset
528:       offset -= nz;
529: 
530:       // first, set the diagonal elements
531:       // this is actually the inverse of the diagonal.
532:       AADiag[i] = v[nz];
533:       AiUp[i] = AiUp[i+1] - nz;
534: 
535:       // copy the off diagonal elements
536:       memcpy(&(AjUp[offset]), vi, nz*sizeof(PetscInt));
537:       memcpy(&(AAUp[offset]), v, nz*sizeof(MatScalar));
538:       // scale the rest of the matrix by the inverse of the diagonal
539:       for (j=0; j<nz; j++) AAUp[offset+j]*=v[nz];
540:     }

542:     try {
543:       Mat_SeqAIJ *b=(Mat_SeqAIJ *)A->data;
544:       IS               isrow = b->row,iscol = b->icol;
545:       PetscBool        row_identity,col_identity;
546:       const PetscInt   *r,*c;

548:       ISGetIndices(isrow,&r);
549:       ISGetIndices(iscol,&c);
550:       ISIdentity(isrow,&row_identity);
551:       ISIdentity(iscol,&col_identity);

553:       cuspstructLo->ordIndicesGPU = new CUSPINTARRAYGPU;
554:       (cuspstructLo->ordIndicesGPU)->assign(&r[0], &r[0]+A->rmap->n);
555: 
556:       cuspstructUp->ordIndicesGPU = new CUSPINTARRAYGPU;
557:       (cuspstructUp->ordIndicesGPU)->assign(&c[0], &c[0]+A->rmap->n);
558: 
559:       /* The Lower triangular matrix */
560:       CSRMATRIXCPU * cpuMat = new CSRMATRIXCPU(n,n,nzLower,AiLo,AjLo,AALo);
561:       cuspstructLo->gpuMat = new CSRMATRIXGPU;
562:       success = (cuspstructLo->gpuMat)->copy_from_host(*cpuMat);
563:       if (!success) { printf("Failed in cuspstructLo->gpuMat->copy_from_host\n"); CHKERRCUSP(1); }
564:       delete cpuMat;

566:       cuspstructLo->tempvecGPU = new CUSPARRAY;
567:       (cuspstructLo->tempvecGPU)->resize(n);
568:       thrust::fill(cuspstructLo->tempvecGPU->begin(), cuspstructLo->tempvecGPU->end(), (PetscScalar) 1.0);
569: 
570:       cuspstructLo->levels = new CUSPINTARRAYGPU;
571:       (cuspstructLo->levels)->assign(&lLoBin[0], &lLoBin[0]+cuspstructLo->nLevels);

573:       cuspstructLo->levelsCPU = levelsCPULo;

575:       cuspstructLo->perms = new CUSPINTARRAYGPU;
576:       (cuspstructLo->perms)->assign(&qLo[0], &qLo[0]+n);

578:       /* The Upper triangular matrix */
579:       cpuMat = new CSRMATRIXCPU(n,n,nzUpper,AiUp,AjUp,AAUp);
580:       cuspstructUp->gpuMat = new CSRMATRIXGPU;
581:       success = (cuspstructUp->gpuMat)->copy_from_host(*cpuMat);
582:       if (!success) { printf("Failed in cuspstructUp->gpuMat->copy_from_host\n"); CHKERRCUSP(1); }
583:       delete cpuMat;

585:       // will use this vector to contain the inverse of the diagonal
586:       cuspstructUp->tempvecGPU = new CUSPARRAY;
587:       (cuspstructUp->tempvecGPU)->assign(&AADiag[0], &AADiag[0]+n);

589:       cuspstructUp->levels = new CUSPINTARRAYGPU;
590:       (cuspstructUp->levels)->assign(&lUpBin[0], &lUpBin[0]+cuspstructUp->nLevels);

592:       cuspstructUp->levelsCPU = levelsCPUUp;

594:       cuspstructUp->perms = new CUSPINTARRAYGPU;
595:       (cuspstructUp->perms)->assign(&qUp[0], &qUp[0]+n);
596: 
597:     } catch(char* ex) {
598:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
599:     }

601:     // Free CPU space
602:     err = cudaFreeHost(AiUp); CHKERRCUSP(err);
603:     err = cudaFreeHost(AjUp); CHKERRCUSP(err);
604:     err = cudaFreeHost(AAUp); CHKERRCUSP(err);
605:     err = cudaFreeHost(AADiag); CHKERRCUSP(err);

607:     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
608:   }
609:   return(0);
610: }

614: PetscErrorCode MatLUFactorNumeric_SeqAIJCUSP(Mat B,Mat A,const MatFactorInfo *info)
615: {
616:   PetscErrorCode   ierr;
617:   Mat_SeqAIJ       *b=(Mat_SeqAIJ *)B->data;
618:   IS               isrow = b->row,iscol = b->col;
619:   PetscBool        row_identity,col_identity;

622: 
623:   MatLUFactorNumeric_SeqAIJ(B,A,info);
624: 
625:   // determine which version of MatSolve needs to be used.
626:   ISIdentity(isrow,&row_identity);
627:   ISIdentity(iscol,&col_identity);
628:   if (row_identity && col_identity) B->ops->solve = MatSolve_SeqAIJCUSP_NaturalOrdering;
629:   else                              B->ops->solve = MatSolve_SeqAIJCUSP;

631:   // get the triangular factors
632:   if (GPU_TRI_SOLVE_ALGORITHM!="none") {
633:     if (GPU_TRI_SOLVE_ALGORITHM=="levelScheduler") {
634:       MatCUSPUnravelOrderingToLevelSchedulerAndCopyToGPU(B);
635:     } else {
636:       MatCUSPUnravelOrderingAndCopyToGPU(B);
637:     }
638:   }

640:   return(0);
641: }



647: PetscErrorCode MatSolve_SeqAIJCUSP(Mat A,Vec bb,Vec xx)
648: {
649:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
651:   PetscBool      usecprow    = a->compressedrow.use;
652:   CUSPARRAY      *xGPU, *bGPU;


656:   if (GPU_TRI_SOLVE_ALGORITHM!="none") {
657:     // Get the GPU pointers
658:     VecCUSPGetArrayWrite(xx,&xGPU);
659:     VecCUSPGetArrayRead(bb,&bGPU);
660:     if (usecprow){ /* use compressed row format */
661:       try {
662:         ;
663:         // Have no idea what to do here!
664: 
665:       } catch (char* ex) {
666:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
667:       }
668:     } else { /* do not use compressed row format */
669:       try {
670: 
671:         if (GPU_TRI_SOLVE_ALGORITHM=="levelScheduler") {
672:           Mat_SeqAIJCUSPTriFactors *cuspTriFactors  = (Mat_SeqAIJCUSPTriFactors*)A->spptr;
673:           Mat_SeqAIJCUSPTriFactorLevelScheduler *cuspstructLo  = (Mat_SeqAIJCUSPTriFactorLevelScheduler*)cuspTriFactors->loTriFactorPtr;
674:           Mat_SeqAIJCUSPTriFactorLevelScheduler *cuspstructUp  = (Mat_SeqAIJCUSPTriFactorLevelScheduler*)cuspTriFactors->upTriFactorPtr;
675: 
676:           // Copy the right hand side vector, bGPU, into xGPU with the row permutation
677:           thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), (cuspstructLo->ordIndicesGPU)->begin()),
678:                        thrust::make_permutation_iterator(bGPU->end(),   (cuspstructLo->ordIndicesGPU)->end()),
679:                        xGPU->begin());
680: 
681:           // Lower solve
682:           csr_tri_solve_level_scheduler<PetscInt, PetscScalar>(cuspstructLo->gpuMat,
683:                                                                       cuspstructLo->nLevels,
684:                                                                       cuspstructLo->maxNumUnknownsAtSameLevel,
685:                                                                       cuspstructLo->levelSum,
686:                                                                       thrust::raw_pointer_cast((cuspstructLo->levels)->data()),
687:                                                                       cuspstructLo->levelsCPU,
688:                                                                       thrust::raw_pointer_cast((cuspstructLo->perms)->data()),
689:                                                                       thrust::raw_pointer_cast(xGPU->data())); CHKERRCUSP(ierr);
690: 
691:           // Scale the result of the lower solve by diagonal vector stored in cuspstructUp->tempvecGPU.
692:           // ALL off-diagonal terms in the upper triangular matrix are already normalized by the diagonal factor
693:           thrust::transform((cuspstructUp->tempvecGPU)->begin(), (cuspstructUp->tempvecGPU)->end(),
694:                             xGPU->begin(), xGPU->begin(), thrust::multiplies<PetscScalar>());

696:           // Upper solve
697:           csr_tri_solve_level_scheduler<PetscInt, PetscScalar>(cuspstructUp->gpuMat,
698:                                                                       cuspstructUp->nLevels,
699:                                                                       cuspstructUp->maxNumUnknownsAtSameLevel,
700:                                                                       cuspstructUp->levelSum,
701:                                                                       thrust::raw_pointer_cast((cuspstructUp->levels)->data()),
702:                                                                       cuspstructUp->levelsCPU,
703:                                                                       thrust::raw_pointer_cast((cuspstructUp->perms)->data()),
704:                                                                       thrust::raw_pointer_cast(xGPU->data())); CHKERRCUSP(ierr);
705: 
706: 
707:           // Copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place.
708:           thrust::copy(thrust::make_permutation_iterator(xGPU->begin(),   (cuspstructUp->ordIndicesGPU)->begin()),
709:                        thrust::make_permutation_iterator(xGPU->end(), (cuspstructUp->ordIndicesGPU)->end()),
710:                        (cuspstructLo->tempvecGPU)->begin());
711: 
712:           // Copy the temporary to the full solution.
713:           thrust::copy((cuspstructLo->tempvecGPU)->begin(), (cuspstructLo->tempvecGPU)->end(), xGPU->begin());
714: 
715:         }
716:         else {
717:           std::cout << "Error in MatSolve_SeqAIJCUSP : Currently, only levelScheduler is supported for GPU tri-solve when using matrix reordering." << std::endl;
718:           CHKERRCUSP(1);
719:         }
720:       } catch(char* ex) {
721:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
722:       }
723:     }
724:     VecCUSPRestoreArrayRead(bb,&bGPU);
725:     VecCUSPRestoreArrayWrite(xx,&xGPU);
726:     WaitForGPU();CHKERRCUSP(ierr);
727:     PetscLogFlops(2.0*a->nz - A->cmap->n);
728:   } else {

730:     // Revert to the CPU solve if a GPU algorithm is not found!
731:     MatSolve_SeqAIJ(A,bb,xx);
732:   }
733:   return(0);
734: }




741: PetscErrorCode MatSolve_SeqAIJCUSP_NaturalOrdering(Mat A,Vec bb,Vec xx)
742: {
743:   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
744:   PetscErrorCode    ierr;
745:   PetscBool         usecprow    = a->compressedrow.use;
746:   PetscScalar       *x;
747:   const PetscScalar *b;
748:   CUSPARRAY         *xGPU, *bGPU;

751:   if (GPU_TRI_SOLVE_ALGORITHM!="none") {
752:     // Get the GPU pointers
753:     VecCUSPGetArrayWrite(xx,&xGPU);
754:     VecCUSPGetArrayRead(bb,&bGPU);
755:     if (usecprow){ /* use compressed row format */
756:       try {
757:         ;
758:         // Have no idea what to do here!
759: 
760:       } catch (char* ex) {
761:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
762:       }
763:     } else { /* do not use compressed row format */
764:       try {
765: 
766:         if (GPU_TRI_SOLVE_ALGORITHM=="levelScheduler") {
767:           Mat_SeqAIJCUSPTriFactors *cuspTriFactors  = (Mat_SeqAIJCUSPTriFactors*)A->spptr;
768:           Mat_SeqAIJCUSPTriFactorLevelScheduler *cuspstructLo  = (Mat_SeqAIJCUSPTriFactorLevelScheduler*)cuspTriFactors->loTriFactorPtr;
769:           Mat_SeqAIJCUSPTriFactorLevelScheduler *cuspstructUp  = (Mat_SeqAIJCUSPTriFactorLevelScheduler*)cuspTriFactors->upTriFactorPtr;
770: 
771:           // Copy bGPU to another temporary on the GPU so that the temporary can be overwritten.
772:           // This should be wrapped in a VecCUSPCopyGpuToGpu function with safety mechanisms.
773:           thrust::copy(bGPU->begin(),bGPU->end(), xGPU->begin());
774: 
775:           // Lower solve
776:           csr_tri_solve_level_scheduler<PetscInt, PetscScalar>(cuspstructLo->gpuMat,
777:                                                                       cuspstructLo->nLevels,
778:                                                                       cuspstructLo->maxNumUnknownsAtSameLevel,
779:                                                                       cuspstructLo->levelSum,
780:                                                                       thrust::raw_pointer_cast((cuspstructLo->levels)->data()),
781:                                                                       cuspstructLo->levelsCPU,
782:                                                                       thrust::raw_pointer_cast((cuspstructLo->perms)->data()),
783:                                                                       thrust::raw_pointer_cast(xGPU->data())); CHKERRCUSP(ierr);
784: 
785: 
786:           // Scale the result of the lower solve by diagonal vector stored in
787:           // the remainder off diagonal terms in the upper triangular matrix are already normalized
788:           thrust::transform((cuspstructUp->tempvecGPU)->begin(), (cuspstructUp->tempvecGPU)->end(),
789:                             xGPU->begin(), xGPU->begin(), thrust::multiplies<PetscScalar>());
790: 
791:           // Upper solve
792:           csr_tri_solve_level_scheduler<PetscInt, PetscScalar>(cuspstructUp->gpuMat,
793:                                                                       cuspstructUp->nLevels,
794:                                                                       cuspstructUp->maxNumUnknownsAtSameLevel,
795:                                                                       cuspstructUp->levelSum,
796:                                                                       thrust::raw_pointer_cast((cuspstructUp->levels)->data()),
797:                                                                       cuspstructUp->levelsCPU,
798:                                                                       thrust::raw_pointer_cast((cuspstructUp->perms)->data()),
799:                                                                       thrust::raw_pointer_cast(xGPU->data())); CHKERRCUSP(ierr);
800: 
801: 
802:         } else {
803:           // Get the CPU pointers
804:           VecGetArrayRead(bb,&b);
805:           VecGetArray(xx,&x);
806: 
807:           Mat_SeqAIJCUSPTriFactors *cuspTriFactors  = (Mat_SeqAIJCUSPTriFactors*)A->spptr;
808:           Mat_SeqAIJCUSPTriFactorHybrid *cuspstructLo = (Mat_SeqAIJCUSPTriFactorHybrid *)cuspTriFactors->loTriFactorPtr;
809:           Mat_SeqAIJCUSPTriFactorHybrid *cuspstructUp = (Mat_SeqAIJCUSPTriFactorHybrid *)cuspTriFactors->upTriFactorPtr;
810: 
811:           thrust::copy(bGPU->begin(),bGPU->end(),(cuspstructUp->tempvecGPU)->begin());
812:           memcpy(&(cuspstructLo->tempvecCPU2[0]), b, A->rmap->n*sizeof(PetscScalar));
813: 
814:           // Lower solve
815:           csr_tri_solve_gpu_hybrid<PetscInt, PetscScalar>(cuspstructLo->cpuMat,
816:                                                                  cuspstructLo->nnzPerRowInDiagBlock,
817:                                                                  uplo_lo_only, cuspstructLo->tempvecCPU2, cuspstructLo->tempvecCPU1,
818:                                                                  cuspstructLo->gpuMat,
819:                                                                  thrust::raw_pointer_cast((cuspstructUp->tempvecGPU)->data()),
820:                                                                  thrust::raw_pointer_cast((cuspstructLo->tempvecGPU)->data()),
821:                                                                  cuspstructLo->block_size, 1, 0); CHKERRCUSP(ierr);
822: 
823:           // Upper solve
824:           csr_tri_solve_gpu_hybrid<PetscInt, PetscScalar> (cuspstructUp->cpuMat,
825:                                                                   cuspstructUp->nnzPerRowInDiagBlock,
826:                                                                   uplo_up_only, cuspstructLo->tempvecCPU1, x,
827:                                                                   cuspstructUp->gpuMat,
828:                                                                   thrust::raw_pointer_cast((cuspstructLo->tempvecGPU)->data()),
829:                                                                   thrust::raw_pointer_cast(xGPU->data()),
830:                                                                   cuspstructUp->block_size, 1 , 0); CHKERRCUSP(ierr);
831:         }
832:       } catch(char* ex) {
833:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
834:       }
835:     }
836:     VecCUSPRestoreArrayRead(bb,&bGPU);
837:     VecCUSPRestoreArrayWrite(xx,&xGPU);
838:     WaitForGPU();CHKERRCUSP(ierr);
839:     PetscLogFlops(2.0*a->nz - A->cmap->n);
840:   } else {

842:     // Revert to the CPU solve if a GPU algorithm is not found!
843:     MatSolve_SeqAIJ_NaturalOrdering(A,bb,xx);
844:   }
845: 
846:   return(0);
847: }


850: #endif // PETSC_HAVE_TXPETSCGPU

854: PetscErrorCode MatCUSPCopyToGPU(Mat A)
855: {
856:   Mat_SeqAIJCUSP *cuspstruct  = (Mat_SeqAIJCUSP*)A->spptr;
857:   Mat_SeqAIJ      *a          = (Mat_SeqAIJ*)A->data;
858:   PetscInt        m           = A->rmap->n,*ii,*ridx;
859:   PetscErrorCode  ierr;

862:   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
863:     PetscLogEventBegin(MAT_CUSPCopyToGPU,A,0,0,0);
864:     if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED){
865:       try {
866:         cuspstruct->mat = new CUSPMATRIX;
867:         if (a->compressedrow.use) {
868:           m    = a->compressedrow.nrows;
869:           ii   = a->compressedrow.i;
870:           ridx = a->compressedrow.rindex;
871:           cuspstruct->mat->resize(m,A->cmap->n,a->nz);
872:           cuspstruct->mat->row_offsets.assign(ii,ii+m+1);
873:           cuspstruct->mat->column_indices.assign(a->j,a->j+a->nz);
874:           cuspstruct->mat->values.assign(a->a,a->a+a->nz);
875:           cuspstruct->indices = new CUSPINTARRAYGPU;
876:           cuspstruct->indices->assign(ridx,ridx+m);
877:         } else {
878:           cuspstruct->mat->resize(m,A->cmap->n,a->nz);
879:           cuspstruct->mat->row_offsets.assign(a->i,a->i+m+1);
880:           cuspstruct->mat->column_indices.assign(a->j,a->j+a->nz);
881:           cuspstruct->mat->values.assign(a->a,a->a+a->nz);
882:         }
883:         cuspstruct->tempvec = new CUSPARRAY;
884:         cuspstruct->tempvec->resize(m);
885:       } catch(char* ex) {
886:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
887:       }
888:     } else if (A->valid_GPU_matrix == PETSC_CUSP_CPU) {
889:       /*
890:        It may be possible to reuse nonzero structure with new matrix values but 
891:        for simplicity and insured correctness we delete and build a new matrix on
892:        the GPU. Likely a very small performance hit.
893:        */
894:       if (cuspstruct->mat){
895:         try {
896:           delete (cuspstruct->mat);
897:           if (cuspstruct->tempvec) {
898:             delete (cuspstruct->tempvec);
899:           }
900:           if (cuspstruct->indices) {
901:             delete (cuspstruct->indices);
902:           }
903:         } catch(char* ex) {
904:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
905:         }
906:       }
907:       try {
908:         cuspstruct->mat = new CUSPMATRIX;
909:         if (a->compressedrow.use) {
910:           m    = a->compressedrow.nrows;
911:           ii   = a->compressedrow.i;
912:           ridx = a->compressedrow.rindex;
913:           cuspstruct->mat->resize(m,A->cmap->n,a->nz);
914:           cuspstruct->mat->row_offsets.assign(ii,ii+m+1);
915:           cuspstruct->mat->column_indices.assign(a->j,a->j+a->nz);
916:           cuspstruct->mat->values.assign(a->a,a->a+a->nz);
917:           cuspstruct->indices = new CUSPINTARRAYGPU;
918:           cuspstruct->indices->assign(ridx,ridx+m);
919:         } else {
920:           cuspstruct->mat->resize(m,A->cmap->n,a->nz);
921:           cuspstruct->mat->row_offsets.assign(a->i,a->i+m+1);
922:           cuspstruct->mat->column_indices.assign(a->j,a->j+a->nz);
923:           cuspstruct->mat->values.assign(a->a,a->a+a->nz);
924:         }
925:         cuspstruct->tempvec = new CUSPARRAY;
926:         cuspstruct->tempvec->resize(m);
927:       } catch(char* ex) {
928:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
929:       }
930:     }
931:     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
932:     PetscLogEventEnd(MAT_CUSPCopyToGPU,A,0,0,0);
933:   }
934:   return(0);
935: }

939: PetscErrorCode MatCUSPCopyFromGPU(Mat A, CUSPMATRIX *Agpu)
940: {
941:   Mat_SeqAIJCUSP *cuspstruct = (Mat_SeqAIJCUSP *) A->spptr;
942:   Mat_SeqAIJ     *a          = (Mat_SeqAIJ *) A->data;
943:   PetscInt        m          = A->rmap->n;
944:   PetscErrorCode  ierr;

947:   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
948:     if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED) {
949:       try {
950:         cuspstruct->mat = Agpu;
951:         if (a->compressedrow.use) {
952:           //PetscInt *ii, *ridx;
953:           SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Cannot handle row compression for GPU matrices");
954:         } else {
955:           PetscInt i;

957:           if (m+1 != (PetscInt) cuspstruct->mat->row_offsets.size()) {SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", cuspstruct->mat->row_offsets.size()-1, m);}
958:           a->nz    = cuspstruct->mat->values.size();
959:           a->maxnz = a->nz; /* Since we allocate exactly the right amount */
960:           A->preallocated = PETSC_TRUE;
961:           // Copy ai, aj, aa
962:           if (a->singlemalloc) {
963:             if (a->a) {PetscFree3(a->a,a->j,a->i);}
964:           } else {
965:             if (a->i) {PetscFree(a->i);}
966:             if (a->j) {PetscFree(a->j);}
967:             if (a->a) {PetscFree(a->a);}
968:           }
969:           PetscMalloc3(a->nz,PetscScalar,&a->a,a->nz,PetscInt,&a->j,m+1,PetscInt,&a->i);
970:           PetscLogObjectMemory(A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));
971:           a->singlemalloc = PETSC_TRUE;
972:           thrust::copy(cuspstruct->mat->row_offsets.begin(), cuspstruct->mat->row_offsets.end(), a->i);
973:           thrust::copy(cuspstruct->mat->column_indices.begin(), cuspstruct->mat->column_indices.end(), a->j);
974:           thrust::copy(cuspstruct->mat->values.begin(), cuspstruct->mat->values.end(), a->a);
975:           // Setup row lengths
976:           if (a->imax) {PetscFree2(a->imax,a->ilen);}
977:           PetscMalloc2(m,PetscInt,&a->imax,m,PetscInt,&a->ilen);
978:           PetscLogObjectMemory(A, 2*m*sizeof(PetscInt));
979:           for(i = 0; i < m; ++i) {
980:             a->imax[i] = a->ilen[i] = a->i[i+1] - a->i[i];
981:           }
982:           // a->diag?
983:         }
984:         cuspstruct->tempvec = new CUSPARRAY;
985:         cuspstruct->tempvec->resize(m);
986:       } catch(char *ex) {
987:         SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSP error: %s", ex);
988:       }
989:     }
990:     // This assembly prevents resetting the flag to PETSC_CUSP_CPU and recopying
991:     MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
992:     MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
993:     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
994:   } else {
995:     SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Only valid for unallocated GPU matrices");
996:   }
997:   return(0);
998: }

1002: PetscErrorCode MatGetVecs_SeqAIJCUSP(Mat mat, Vec *right, Vec *left)
1003: {


1008:   if (right) {
1009:     VecCreate(((PetscObject)mat)->comm,right);
1010:     VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);
1011:     VecSetBlockSize(*right,mat->rmap->bs);
1012:     VecSetType(*right,VECSEQCUSP);
1013:     PetscLayoutReference(mat->cmap,&(*right)->map);
1014:   }
1015:   if (left) {
1016:     VecCreate(((PetscObject)mat)->comm,left);
1017:     VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);
1018:     VecSetBlockSize(*left,mat->rmap->bs);
1019:     VecSetType(*left,VECSEQCUSP);
1020:     PetscLayoutReference(mat->rmap,&(*left)->map);
1021:   }
1022:   return(0);
1023: }

1027: PetscErrorCode MatMult_SeqAIJCUSP(Mat A,Vec xx,Vec yy)
1028: {
1029:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
1031:   PetscInt       nonzerorow=0;
1032:   PetscBool      usecprow    = a->compressedrow.use;
1033:   Mat_SeqAIJCUSP *cuspstruct = (Mat_SeqAIJCUSP *)A->spptr;
1034:   CUSPARRAY      *xarray,*yarray;

1037:   MatCUSPCopyToGPU(A);
1038:   VecCUSPGetArrayRead(xx,&xarray);
1039:   VecCUSPGetArrayWrite(yy,&yarray);
1040:   if (usecprow){ /* use compressed row format */
1041:     try {
1042:       cusp::multiply(*cuspstruct->mat,*xarray,*cuspstruct->tempvec);
1043:       VecSet_SeqCUSP(yy,0.0);
1044:       thrust::copy(cuspstruct->tempvec->begin(),cuspstruct->tempvec->end(),thrust::make_permutation_iterator(yarray->begin(),cuspstruct->indices->begin()));
1045:     } catch (char* ex) {
1046:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1047:     }
1048:   } else { /* do not use compressed row format */
1049:     try {
1050:       cusp::multiply(*cuspstruct->mat,*xarray,*yarray);
1051:     } catch(char* ex) {
1052:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1053:     }
1054:   }
1055:   VecCUSPRestoreArrayRead(xx,&xarray);
1056:   VecCUSPRestoreArrayWrite(yy,&yarray);
1057:   WaitForGPU();CHKERRCUSP(ierr);
1058:   PetscLogFlops(2.0*a->nz - nonzerorow);
1059:   return(0);
1060: }


1063: #ifdef PETSC_HAVE_TXPETSCGPU

1067: PetscErrorCode MatInodeCUSPCopyToGPU(Mat A)
1068: {
1069:   Mat_SeqAIJCUSPInode *cuspstruct  = (Mat_SeqAIJCUSPInode*)A->spptr;
1070:   Mat_SeqAIJ          *a          = (Mat_SeqAIJ*)A->data;
1071:   PetscErrorCode      ierr;
1072:   bool                success=0;

1075:   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU){
1076:     WaitForGPU();CHKERRCUSP(ierr);
1077:     PetscLogEventBegin(MAT_CUSPCopyToGPU,A,0,0,0);
1078:     if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED){
1079:       try {
1080:         // Construct the GPU csr matrix
1081:         CSRMATRIXCPU * cpuMat = new CSRMATRIXCPU(A->rmap->n, A->cmap->n, a->nz, a->i, a->j, a->a);
1082:         cuspstruct->mat = new CSRMATRIXGPU;
1083:         success = (cuspstruct->mat)->copy_from_host(*cpuMat);
1084:         if (!success) {
1085:           printf("Failed in cuspstructLo->gpuMat->copy_from_host\n");
1086:           CHKERRCUSP(1);
1087:         }
1088:         delete cpuMat;
1089: 
1090:         cuspstruct->tempvec = new CUSPARRAY;
1091:         cuspstruct->tempvec->resize(A->rmap->n);

1093:         // Determine the inode data structure for the GPU
1094:         PetscInt * temp;
1095:         PetscMalloc((a->inode.node_count+1)*sizeof(PetscInt), &temp);
1096:         temp[0]=0;
1097:         cuspstruct->nodeMax = 0;
1098:         for (int i = 0; i<a->inode.node_count; i++) {
1099:           temp[i+1]= a->inode.size[i]+temp[i];
1100:           if (a->inode.size[i] > cuspstruct->nodeMax)
1101:             cuspstruct->nodeMax = a->inode.size[i];
1102:         }
1103:         cuspstruct->inodes = new CUSPINTARRAYGPU;
1104:         cuspstruct->inodes->assign(temp, temp+a->inode.node_count+1);
1105:         PetscFree(temp);

1107:         // Determine the maximum number of nonzeros in a row.
1108:         cuspstruct->nnzPerRowMax=0;
1109:         for (int j = 0; j<A->rmap->n; j++) {
1110:           if (a->i[j+1]-a->i[j] > cuspstruct->nnzPerRowMax) {
1111:             cuspstruct->nnzPerRowMax = a->i[j+1]-a->i[j];
1112:           }
1113:         }

1115:       } catch(char* ex) {
1116:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1117:       }
1118:     } else if (A->valid_GPU_matrix == PETSC_CUSP_CPU) {
1119:       /*
1120:        It may be possible to reuse nonzero structure with new matrix values but 
1121:        for simplicity and insured correctness we delete and build a new matrix on
1122:        the GPU. Likely a very small performance hit.
1123:        */
1124:       if (cuspstruct->mat){
1125:         try {
1126:           delete (cuspstruct->mat);
1127:           if (cuspstruct->tempvec) {
1128:             delete (cuspstruct->tempvec);
1129:           }
1130:           if (cuspstruct->inodes) {
1131:             delete (cuspstruct->inodes);
1132:           }
1133:         } catch(char* ex) {
1134:           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1135:         }
1136:       }
1137:       try {
1138:         // Construct the GPU csr matrix
1139:         CSRMATRIXCPU * cpuMat = new CSRMATRIXCPU(A->rmap->n, A->cmap->n, a->nz, a->i, a->j, a->a);
1140:         cuspstruct->mat = new CSRMATRIXGPU;
1141:         success = (cuspstruct->mat)->copy_from_host(*cpuMat);
1142:         if (!success) {
1143:           printf("Failed in cuspstructLo->gpuMat->copy_from_host\n");
1144:           CHKERRCUSP(1);
1145:         }
1146:         delete cpuMat;

1148:         cuspstruct->tempvec = new CUSPARRAY;
1149:         cuspstruct->tempvec->resize(A->rmap->n);

1151:         // Determine the inode data structure for the GPU
1152:         PetscInt * temp;
1153:         PetscMalloc((a->inode.node_count+1)*sizeof(PetscInt), &temp);
1154:         temp[0]=0;
1155:         cuspstruct->nodeMax = 0;
1156:         for (int i = 0; i<a->inode.node_count; i++) {
1157:           temp[i+1]= a->inode.size[i]+temp[i];
1158:           if (a->inode.size[i] > cuspstruct->nodeMax)
1159:             cuspstruct->nodeMax = a->inode.size[i];
1160:         }
1161:         cuspstruct->inodes = new CUSPINTARRAYGPU;
1162:         cuspstruct->inodes->assign(temp, temp+a->inode.node_count+1);
1163:         PetscFree(temp);

1165:         // Determine the maximum number of nonzeros in a row.
1166:         cuspstruct->nnzPerRowMax=0;
1167:         for (int j = 0; j<A->rmap->n+1; j++)
1168:           if (a->i[j+1]-a->i[j] > cuspstruct->nnzPerRowMax)
1169:             cuspstruct->nnzPerRowMax = a->i[j+1]-a->i[j];

1171:       } catch(char* ex) {
1172:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1173:       }
1174:     }
1175:     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
1176:     WaitForGPU();CHKERRCUSP(ierr);
1177:     PetscLogEventEnd(MAT_CUSPCopyToGPU,A,0,0,0);
1178:   }
1179:   return(0);
1180: }


1185: PetscErrorCode MatMult_SeqAIJCUSP_Inode(Mat A,Vec xx,Vec yy)
1186: {
1187:   Mat_SeqAIJ                *a = (Mat_SeqAIJ*)A->data;
1188:   PetscErrorCode            ierr;
1189:   PetscInt                  nonzerorow=0;
1190:   PetscBool                 usecprow    = a->compressedrow.use;
1191:   const Mat_SeqAIJCUSPInode *cuspstruct = (Mat_SeqAIJCUSPInode *)A->spptr;
1192:   CUSPARRAY                 *xarray, *yarray;

1195:   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");

1197:   MatInodeCUSPCopyToGPU(A);
1198:   VecCUSPCopyToGPU(xx);
1199:   VecCUSPGetArrayRead(xx,&xarray);
1200:   VecCUSPGetArrayWrite(yy,&yarray);
1201:   if (usecprow){ /* use compressed row format */
1202:     try {
1203:       // not sure what to do here
1204:       ;
1205:     } catch(char* ex) {
1206:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1207:     }
1208:   } else { /* do not use compressed row format */
1209:     try {
1210:       csr_spmv_inode<PetscInt, PetscScalar>(cuspstruct->mat,
1211:                                                    a->inode.node_count, cuspstruct->nodeMax, cuspstruct->nnzPerRowMax,
1212:                                                          thrust::raw_pointer_cast((cuspstruct->inodes)->data()),
1213:                                                          thrust::raw_pointer_cast(xarray->data()),
1214:                                                          thrust::raw_pointer_cast(yarray->data())); CHKERRCUSP(ierr);

1216: 
1217:     } catch(char* ex) {
1218:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1219:     }
1220:   }
1221:   VecCUSPRestoreArrayRead(xx,&xarray);
1222:   VecCUSPRestoreArrayWrite(yy,&yarray);
1223:   WaitForGPU();CHKERRCUSP(ierr);
1224:   PetscLogFlops(2.0*a->nz - nonzerorow);
1225:   return(0);
1226: }


1229: #endif // PETSC_HAVE_TXPETSCGPU

1231: struct VecCUSPPlusEquals
1232: {
1233:   template <typename Tuple>
1234:   __host__ __device__
1235:   void operator()(Tuple t)
1236:   {
1237:     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1238:   }
1239: };

1243: PetscErrorCode MatMultAdd_SeqAIJCUSP(Mat A,Vec xx,Vec yy,Vec zz)
1244: {
1245:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
1247:   PetscBool      usecprow=a->compressedrow.use;
1248:   Mat_SeqAIJCUSP *cuspstruct = (Mat_SeqAIJCUSP *)A->spptr;
1249:   CUSPARRAY      *xarray,*yarray,*zarray;

1252:   MatCUSPCopyToGPU(A);
1253:   if (usecprow) {
1254:     try {
1255:       VecCopy_SeqCUSP(yy,zz);
1256:       VecCUSPGetArrayRead(xx,&xarray);
1257:       VecCUSPGetArrayRead(yy,&yarray);
1258:       VecCUSPGetArrayWrite(zz,&zarray);
1259:       if (a->compressedrow.nrows) {
1260:         cusp::multiply(*cuspstruct->mat,*xarray, *cuspstruct->tempvec);
1261:         thrust::for_each(
1262:            thrust::make_zip_iterator(
1263:                  thrust::make_tuple(
1264:                                     cuspstruct->tempvec->begin(),
1265:                                     thrust::make_permutation_iterator(zarray->begin(), cuspstruct->indices->begin()))),
1266:            thrust::make_zip_iterator(
1267:                  thrust::make_tuple(
1268:                                     cuspstruct->tempvec->begin(),
1269:                                     thrust::make_permutation_iterator(zarray->begin(),cuspstruct->indices->begin()))) + cuspstruct->tempvec->size(),
1270:            VecCUSPPlusEquals());
1271:       }
1272:     } catch(char* ex) {
1273:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1274:     }
1275:     VecCUSPRestoreArrayRead(xx,&xarray);
1276:     VecCUSPRestoreArrayRead(yy,&yarray);
1277:     VecCUSPRestoreArrayWrite(zz,&zarray);
1278:   } else {
1279:     try {
1280:       VecCopy_SeqCUSP(yy,zz);
1281:       VecCUSPGetArrayRead(xx,&xarray);
1282:       VecCUSPGetArrayRead(yy,&yarray);
1283:       VecCUSPGetArrayWrite(zz,&zarray);
1284:       cusp::multiply(*cuspstruct->mat,*xarray,*cuspstruct->tempvec);
1285:       thrust::for_each(
1286:          thrust::make_zip_iterator(
1287:                  thrust::make_tuple(
1288:                                     cuspstruct->tempvec->begin(),
1289:                                     zarray->begin())),
1290:          thrust::make_zip_iterator(
1291:                  thrust::make_tuple(
1292:                                     cuspstruct->tempvec->end(),
1293:                                    zarray->end())),
1294:          VecCUSPPlusEquals());
1295:     } catch(char* ex) {
1296:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1297:     }
1298:     VecCUSPRestoreArrayRead(xx,&xarray);
1299:     VecCUSPRestoreArrayRead(yy,&yarray);
1300:     VecCUSPRestoreArrayWrite(zz,&zarray);
1301:   }
1302:   PetscLogFlops(2.0*a->nz);
1303:   WaitForGPU();CHKERRCUSP(ierr);
1304:   return(0);
1305: }


1310: PetscErrorCode MatAssemblyEnd_SeqAIJCUSP(Mat A,MatAssemblyType mode)
1311: {
1312:   PetscErrorCode  ierr;
1313: #ifdef PETSC_HAVE_TXPETSCGPU
1314:   Mat_SeqAIJ      *aij = (Mat_SeqAIJ*)A->data;
1315: #endif // PETSC_HAVE_TXPETSCGPU
1316: 
1318:   MatAssemblyEnd_SeqAIJ(A,mode);
1319:   if (mode == MAT_FLUSH_ASSEMBLY) return(0);
1320:   if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){
1321:     A->valid_GPU_matrix = PETSC_CUSP_CPU;
1322:   }

1324: #ifdef PETSC_HAVE_TXPETSCGPU
1325:   if (aij->inode.use)  A->ops->mult    = MatMult_SeqAIJCUSP_Inode;
1326: #endif // PETSC_HAVE_TXPETSCGPU

1328:   return(0);
1329: }

1331: /* --------------------------------------------------------------------------------*/
1334: /*@C
1335:    MatCreateSeqAIJCUSP - Creates a sparse matrix in AIJ (compressed row) format
1336:    (the default parallel PETSc format).  For good matrix assembly performance
1337:    the user should preallocate the matrix storage by setting the parameter nz
1338:    (or the array nnz).  By setting these parameters accurately, performance
1339:    during matrix assembly can be increased by more than a factor of 50.

1341:    Collective on MPI_Comm

1343:    Input Parameters:
1344: +  comm - MPI communicator, set to PETSC_COMM_SELF
1345: .  m - number of rows
1346: .  n - number of columns
1347: .  nz - number of nonzeros per row (same for all rows)
1348: -  nnz - array containing the number of nonzeros in the various rows 
1349:          (possibly different for each row) or PETSC_NULL

1351:    Output Parameter:
1352: .  A - the matrix 

1354:    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1355:    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1356:    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

1358:    Notes:
1359:    If nnz is given then nz is ignored

1361:    The AIJ format (also called the Yale sparse matrix format or
1362:    compressed row storage), is fully compatible with standard Fortran 77
1363:    storage.  That is, the stored row and column indices can begin at
1364:    either one (as in Fortran) or zero.  See the users' manual for details.

1366:    Specify the preallocated storage with either nz or nnz (not both).
1367:    Set nz=PETSC_DEFAULT and nnz=PETSC_NULL for PETSc to control dynamic memory 
1368:    allocation.  For large problems you MUST preallocate memory or you 
1369:    will get TERRIBLE performance, see the users' manual chapter on matrices.

1371:    By default, this format uses inodes (identical nodes) when possible, to 
1372:    improve numerical efficiency of matrix-vector products and solves. We 
1373:    search for consecutive rows with the same nonzero structure, thereby
1374:    reusing matrix information to achieve increased efficiency.

1376:    Level: intermediate

1378: .seealso: MatCreate(), MatCreateMPIAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateMPIAIJ()

1380: @*/
1381: PetscErrorCode  MatCreateSeqAIJCUSP(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1382: {

1386:   MatCreate(comm,A);
1387:   MatSetSizes(*A,m,n,m,n);
1388:   MatSetType(*A,MATSEQAIJCUSP);
1389:   MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
1390:   return(0);
1391: }

1393: #ifdef PETSC_HAVE_TXPETSCGPU

1397: PetscErrorCode MatDestroy_SeqAIJCUSP(Mat A)
1398: {
1399:   PetscErrorCode      ierr;
1400:   Mat_SeqAIJ          *a          = (Mat_SeqAIJ*)A->data;
1401:   Mat_SeqAIJCUSP      *cuspstruct = (Mat_SeqAIJCUSP*)A->spptr;
1402:   Mat_SeqAIJCUSPInode *cuspstructInode = (Mat_SeqAIJCUSPInode*)A->spptr;
1403:   cudaError_t         err;

1406:   if (A->factortype==MAT_FACTOR_NONE) {
1407:     // The regular matrices
1408:     try {
1409:       if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){
1410:         if (!a->inode.use)
1411:           delete (CUSPMATRIX *)(cuspstruct->mat);
1412:         else
1413:           delete (CSRMATRIXGPU *)(cuspstructInode->mat);
1414:       }
1415:       if (!a->inode.use) {
1416:         if (cuspstruct->tempvec!=0)
1417:           delete cuspstruct->tempvec;
1418:         if (cuspstruct->indices!=0)
1419:           delete cuspstruct->indices;
1420:         delete cuspstruct;
1421:       } else {
1422:         if (cuspstructInode->tempvec!=0)
1423:           delete cuspstructInode->tempvec;
1424:         if (cuspstructInode->inodes!=0)
1425:           delete cuspstructInode->inodes;
1426:         delete cuspstructInode;
1427:       }
1428:       A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
1429:     } catch(char* ex) {
1430:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1431:     }
1432:   } else {
1433:     // The triangular factors
1434:     if (GPU_TRI_SOLVE_ALGORITHM!="none") {
1435:       try {
1436:         if (GPU_TRI_SOLVE_ALGORITHM=="levelScheduler") {
1437: 
1438:           Mat_SeqAIJCUSPTriFactors *cuspTriFactors  = (Mat_SeqAIJCUSPTriFactors*)A->spptr;
1439:           Mat_SeqAIJCUSPTriFactorLevelScheduler *cuspstructLo  = (Mat_SeqAIJCUSPTriFactorLevelScheduler*)cuspTriFactors->loTriFactorPtr;
1440:           Mat_SeqAIJCUSPTriFactorLevelScheduler *cuspstructUp  = (Mat_SeqAIJCUSPTriFactorLevelScheduler*)cuspTriFactors->upTriFactorPtr;
1441: 
1442:           // the Lower factor
1443:           if (cuspstructLo->gpuMat!=0)
1444:             delete (CSRMATRIXGPU *)(cuspstructLo->gpuMat);
1445:           if (cuspstructLo->tempvecGPU!=0)
1446:             delete cuspstructLo->tempvecGPU;
1447:           if (cuspstructLo->levels!=0)
1448:             delete cuspstructLo->levels;
1449:           if (cuspstructLo->levelsCPU!=0) {
1450:             PetscFree(cuspstructLo->levelsCPU);  }
1451:           if (cuspstructLo->perms!=0)
1452:             delete cuspstructLo->perms;
1453:           if (cuspstructLo->ordIndicesGPU!=0)
1454:             delete cuspstructLo->ordIndicesGPU;
1455:           delete cuspstructLo;
1456: 
1457:           // the Upper factor
1458:           if (cuspstructUp->gpuMat!=0)
1459:             delete (CSRMATRIXGPU *)(cuspstructUp->gpuMat);
1460:           if (cuspstructUp->tempvecGPU!=0)
1461:             delete cuspstructUp->tempvecGPU;
1462:           if (cuspstructUp->levels!=0)
1463:             delete cuspstructUp->levels;
1464:           if (cuspstructUp->levelsCPU!=0) {
1465:             PetscFree(cuspstructUp->levelsCPU);  }
1466:           if (cuspstructUp->perms!=0)
1467:             delete cuspstructUp->perms;
1468:           if (cuspstructUp->ordIndicesGPU!=0)
1469:             delete cuspstructUp->ordIndicesGPU;
1470: 
1471:           delete cuspstructUp;
1472: 
1473:           /* Set the pointers to 0 */
1474:           cuspTriFactors->loTriFactorPtr = 0;
1475:           cuspTriFactors->upTriFactorPtr = 0;
1476: 
1477:         } else {
1478: 
1479:           Mat_SeqAIJCUSPTriFactors      *cuspTriFactors  = (Mat_SeqAIJCUSPTriFactors*)A->spptr;
1480:           Mat_SeqAIJCUSPTriFactorHybrid *cuspstructLo = (Mat_SeqAIJCUSPTriFactorHybrid *)cuspTriFactors->loTriFactorPtr;
1481:           Mat_SeqAIJCUSPTriFactorHybrid *cuspstructUp = (Mat_SeqAIJCUSPTriFactorHybrid *)cuspTriFactors->upTriFactorPtr;
1482: 
1483:           // the Lower factor
1484:           if (cuspstructLo->cpuMat) {
1485:             PetscFree(cuspstructLo->nnzPerRowInDiagBlock);
1486:             err = cudaFreeHost(cuspstructLo->tempvecCPU1); CHKERRCUSP(err);
1487:             err = cudaFreeHost(cuspstructLo->tempvecCPU2); CHKERRCUSP(err);
1488:             err = cudaFreeHost(cuspstructLo->cpuMat->row_offsets); CHKERRCUSP(err);
1489:             err = cudaFreeHost(cuspstructLo->cpuMat->column_indices); CHKERRCUSP(err);
1490:             err = cudaFreeHost(cuspstructLo->cpuMat->values); CHKERRCUSP(err);
1491:             delete (CSRMATRIXCPU *)(cuspstructLo->cpuMat);
1492:           }
1493:           if (cuspstructLo->gpuMat)
1494:             delete (CSRMATRIXGPU *)(cuspstructLo->gpuMat);
1495:           if (cuspstructLo->tempvecGPU)
1496:             delete cuspstructLo->tempvecGPU;
1497:           delete cuspstructLo;
1498: 
1499:           // the Upper factor
1500:           if (cuspstructUp->cpuMat) {
1501:             PetscFree(cuspstructUp->nnzPerRowInDiagBlock);
1502:             err = cudaFreeHost(cuspstructUp->tempvecCPU1); CHKERRCUSP(err);
1503:             err = cudaFreeHost(cuspstructUp->tempvecCPU2); CHKERRCUSP(err);
1504:             err = cudaFreeHost(cuspstructUp->cpuMat->row_offsets); CHKERRCUSP(err);
1505:             err = cudaFreeHost(cuspstructUp->cpuMat->column_indices); CHKERRCUSP(err);
1506:             err = cudaFreeHost(cuspstructUp->cpuMat->values); CHKERRCUSP(err);
1507:             delete (CSRMATRIXCPU *)(cuspstructUp->cpuMat);
1508:           }
1509:           if (cuspstructUp->gpuMat)
1510:             delete (CSRMATRIXGPU *)(cuspstructUp->gpuMat);
1511:           if (cuspstructUp->tempvecGPU)
1512:             delete cuspstructUp->tempvecGPU;
1513:           delete cuspstructUp;
1514: 
1515:           /* Set the pointers to 0 */
1516:           cuspTriFactors->loTriFactorPtr = 0;
1517:           cuspTriFactors->upTriFactorPtr = 0;
1518:         }
1519:       } catch(char* ex) {
1520:         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1521:       }
1522:     }
1523:   }
1524:   /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
1525:   A->spptr = 0;

1527:   MatDestroy_SeqAIJ(A);
1528:   return(0);
1529: }

1531: #else // if PETSC_HAVE_TXPETSCGPU is 0

1535: PetscErrorCode MatDestroy_SeqAIJCUSP(Mat A)
1536: {
1538:   Mat_SeqAIJCUSP *cuspcontainer = (Mat_SeqAIJCUSP*)A->spptr;

1541:   try {
1542:     if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED){
1543:       delete (CUSPMATRIX *)(cuspcontainer->mat);
1544:     }
1545:     delete cuspcontainer;
1546:     A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
1547:   } catch(char* ex) {
1548:       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSP error: %s", ex);
1549:   }
1550:   /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
1551:   A->spptr = 0;
1552:   MatDestroy_SeqAIJ(A);
1553:   return(0);
1554: }

1556: #endif // PETSC_HAVE_TXPETSCGPU

/* Forward declaration: MatCreate_SeqAIJCUSP installs this routine as B->ops->setvaluesbatch.
   The definition is not visible in this part of the file. */
1558: PetscErrorCode MatSetValuesBatch_SeqAIJCUSP(Mat J, PetscInt Ne, PetscInt Nl, PetscInt *elemRows, const PetscScalar *elemMats);

1560: #ifdef PETSC_HAVE_TXPETSCGPU

1565: PetscErrorCode  MatCreate_SeqAIJCUSP(Mat B)
1566: {
1567:   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;

1571:   MatCreate_SeqAIJ(B);
1572:   b = (Mat_SeqAIJ*)B->data;
1573:   B->ops->mult    = MatMult_SeqAIJCUSP;
1574:   B->ops->multadd = MatMultAdd_SeqAIJCUSP;

1576:   if (B->factortype==MAT_FACTOR_NONE) {
1577:     /* you cannot check the inode.use flag here since the matrix was just created.*/
1578:     if (!b->inode.use) {
1579:       B->spptr        = new Mat_SeqAIJCUSP;
1580:       ((Mat_SeqAIJCUSP *)B->spptr)->mat = 0;
1581:       ((Mat_SeqAIJCUSP *)B->spptr)->tempvec = 0;
1582:       ((Mat_SeqAIJCUSP *)B->spptr)->indices = 0;
1583:     } else {
1584:       B->spptr        = new Mat_SeqAIJCUSPInode;
1585:       ((Mat_SeqAIJCUSPInode *)B->spptr)->mat = 0;
1586:       ((Mat_SeqAIJCUSPInode *)B->spptr)->tempvec = 0;
1587:       ((Mat_SeqAIJCUSPInode *)B->spptr)->inodes = 0;
1588:       ((Mat_SeqAIJCUSPInode *)B->spptr)->nnzPerRowMax = 0;
1589:       ((Mat_SeqAIJCUSPInode *)B->spptr)->nodeMax = 0;
1590:     }
1591:   } else {
1592:     // Get the tri solve algorithm
1593:     PetscBool found;
1594:     char      input[20] = "hybrid";

1596:     PetscOptionsGetString(PETSC_NULL, "-gpu_tri_solve_algorithm", input, 20, &found);
1597:     GPU_TRI_SOLVE_ALGORITHM.assign(input);
1598:     if(GPU_TRI_SOLVE_ALGORITHM!="levelScheduler" && GPU_TRI_SOLVE_ALGORITHM!="hybrid") SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Bad argument to -gpu_tri_solve_algorithm. Must be either 'hybrid' or 'levelScheduler'\n");

1600: 
1601:     if (GPU_TRI_SOLVE_ALGORITHM!="none") {
1602:       Mat_SeqAIJCUSPTriFactors *cuspTriFactors  = (Mat_SeqAIJCUSPTriFactors*)B->spptr;
1603:       /* NEXT, set the pointers to the triangular factors */
1604:       B->spptr = new Mat_SeqAIJCUSPTriFactors;
1605:       cuspTriFactors->loTriFactorPtr = 0;
1606:       cuspTriFactors->upTriFactorPtr = 0;
1607: 
1608:       if (GPU_TRI_SOLVE_ALGORITHM=="levelScheduler") {
1609:         cuspTriFactors->loTriFactorPtr        = new Mat_SeqAIJCUSPTriFactorLevelScheduler;
1610:         ((Mat_SeqAIJCUSPTriFactorLevelScheduler *)cuspTriFactors->loTriFactorPtr)->gpuMat = 0;
1611:         ((Mat_SeqAIJCUSPTriFactorLevelScheduler *)cuspTriFactors->loTriFactorPtr)->tempvecGPU = 0;
1612:         ((Mat_SeqAIJCUSPTriFactorLevelScheduler *)cuspTriFactors->loTriFactorPtr)->levels = 0;
1613:         ((Mat_SeqAIJCUSPTriFactorLevelScheduler *)cuspTriFactors->loTriFactorPtr)->ordIndicesGPU = 0;
1614:         ((Mat_SeqAIJCUSPTriFactorLevelScheduler *)cuspTriFactors->loTriFactorPtr)->levelsCPU = 0;
1615:         ((Mat_SeqAIJCUSPTriFactorLevelScheduler *)cuspTriFactors->loTriFactorPtr)->perms = 0;
1616:         ((Mat_SeqAIJCUSPTriFactorLevelScheduler *)cuspTriFactors->loTriFactorPtr)->nLevels = 0;
1617:         ((Mat_SeqAIJCUSPTriFactorLevelScheduler *)cuspTriFactors->loTriFactorPtr)->maxNumUnknownsAtSameLevel = 0;
1618:         ((Mat_SeqAIJCUSPTriFactorLevelScheduler *)cuspTriFactors->loTriFactorPtr)->levelSum = 0;
1619: 
1620:         cuspTriFactors->upTriFactorPtr        = new Mat_SeqAIJCUSPTriFactorLevelScheduler;
1621:         ((Mat_SeqAIJCUSPTriFactorLevelScheduler *)cuspTriFactors->upTriFactorPtr)->gpuMat = 0;
1622:         ((Mat_SeqAIJCUSPTriFactorLevelScheduler *)cuspTriFactors->upTriFactorPtr)->tempvecGPU = 0;
1623:         ((Mat_SeqAIJCUSPTriFactorLevelScheduler *)cuspTriFactors->upTriFactorPtr)->levels = 0;
1624:         ((Mat_SeqAIJCUSPTriFactorLevelScheduler *)cuspTriFactors->upTriFactorPtr)->ordIndicesGPU = 0;
1625:         ((Mat_SeqAIJCUSPTriFactorLevelScheduler *)cuspTriFactors->upTriFactorPtr)->levelsCPU = 0;
1626:         ((Mat_SeqAIJCUSPTriFactorLevelScheduler *)cuspTriFactors->upTriFactorPtr)->perms = 0;
1627:         ((Mat_SeqAIJCUSPTriFactorLevelScheduler *)cuspTriFactors->upTriFactorPtr)->nLevels = 0;
1628:         ((Mat_SeqAIJCUSPTriFactorLevelScheduler *)cuspTriFactors->upTriFactorPtr)->maxNumUnknownsAtSameLevel = 0;
1629:         ((Mat_SeqAIJCUSPTriFactorLevelScheduler *)cuspTriFactors->upTriFactorPtr)->levelSum = 0;
1630:       } else {
1631:         cuspTriFactors->loTriFactorPtr        = new Mat_SeqAIJCUSPTriFactorHybrid;
1632:         ((Mat_SeqAIJCUSPTriFactorHybrid *)cuspTriFactors->loTriFactorPtr)->cpuMat = 0;
1633:         ((Mat_SeqAIJCUSPTriFactorHybrid *)cuspTriFactors->loTriFactorPtr)->gpuMat = 0;
1634:         ((Mat_SeqAIJCUSPTriFactorHybrid *)cuspTriFactors->loTriFactorPtr)->nnzPerRowInDiagBlock = 0;
1635:         ((Mat_SeqAIJCUSPTriFactorHybrid *)cuspTriFactors->loTriFactorPtr)->tempvecGPU = 0;
1636:         ((Mat_SeqAIJCUSPTriFactorHybrid *)cuspTriFactors->loTriFactorPtr)->tempvecCPU1 = 0;
1637:         ((Mat_SeqAIJCUSPTriFactorHybrid *)cuspTriFactors->loTriFactorPtr)->tempvecCPU2 = 0;
1638:         ((Mat_SeqAIJCUSPTriFactorHybrid *)cuspTriFactors->loTriFactorPtr)->nnz = 0;
1639:         ((Mat_SeqAIJCUSPTriFactorHybrid *)cuspTriFactors->loTriFactorPtr)->block_size = 0;
1640: 
1641:         cuspTriFactors->upTriFactorPtr        = new Mat_SeqAIJCUSPTriFactorHybrid;
1642:         ((Mat_SeqAIJCUSPTriFactorHybrid *)cuspTriFactors->upTriFactorPtr)->cpuMat = 0;
1643:         ((Mat_SeqAIJCUSPTriFactorHybrid *)cuspTriFactors->upTriFactorPtr)->gpuMat = 0;
1644:         ((Mat_SeqAIJCUSPTriFactorHybrid *)cuspTriFactors->upTriFactorPtr)->nnzPerRowInDiagBlock = 0;
1645:         ((Mat_SeqAIJCUSPTriFactorHybrid *)cuspTriFactors->upTriFactorPtr)->tempvecGPU = 0;
1646:         ((Mat_SeqAIJCUSPTriFactorHybrid *)cuspTriFactors->upTriFactorPtr)->tempvecCPU1 = 0;
1647:         ((Mat_SeqAIJCUSPTriFactorHybrid *)cuspTriFactors->upTriFactorPtr)->tempvecCPU2 = 0;
1648:         ((Mat_SeqAIJCUSPTriFactorHybrid *)cuspTriFactors->upTriFactorPtr)->nnz = 0;
1649:         ((Mat_SeqAIJCUSPTriFactorHybrid *)cuspTriFactors->upTriFactorPtr)->block_size = 0;
1650:       }
1651:     }
1652:   }

1654:   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSP;
1655:   B->ops->destroy        = MatDestroy_SeqAIJCUSP;
1656:   B->ops->getvecs        = MatGetVecs_SeqAIJCUSP;
1657:   B->ops->setvaluesbatch = MatSetValuesBatch_SeqAIJCUSP;
1658:   PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSP);
1659:   B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

1661:   PetscObjectComposeFunctionDynamic((PetscObject)B,"MatGetFactor_petsc_C","MatGetFactor_seqaij_petsccusp",MatGetFactor_seqaij_petsccusp);
1662:   return(0);
1663: }

1666: #else // if PETSC_HAVE_TXPETSCGPU is 0

1671: PetscErrorCode  MatCreate_SeqAIJCUSP(Mat B)
1672: {
1674:   Mat_SeqAIJ     *aij;

1677:   MatCreate_SeqAIJ(B);
1678:   aij             = (Mat_SeqAIJ*)B->data;
1679:   aij->inode.use  = PETSC_FALSE;
1680:   B->ops->mult    = MatMult_SeqAIJCUSP;
1681:   B->ops->multadd = MatMultAdd_SeqAIJCUSP;
1682:   B->spptr        = new Mat_SeqAIJCUSP;
1683:   ((Mat_SeqAIJCUSP *)B->spptr)->mat = 0;
1684:   ((Mat_SeqAIJCUSP *)B->spptr)->tempvec = 0;
1685:   ((Mat_SeqAIJCUSP *)B->spptr)->indices = 0;
1686: 
1687:   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSP;
1688:   B->ops->destroy        = MatDestroy_SeqAIJCUSP;
1689:   B->ops->getvecs        = MatGetVecs_SeqAIJCUSP;
1690:   B->ops->setvaluesbatch = MatSetValuesBatch_SeqAIJCUSP;
1691:   PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSP);
1692:   B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
1693:   return(0);
1694: }

1697: #endif // PETSC_HAVE_TXPETSCGPU