Actual source code: aijAssemble.cu

  1: #include "petscconf.h"
 3:  #include ../src/mat/impls/aij/seq/aij.h
 4:  #include petscbt.h
 5:  #include ../src/vec/vec/impls/dvecimpl.h
 6:  #include private/vecimpl.h
  8: #undef VecType
 9:  #include ../src/mat/impls/aij/seq/seqcusp/cuspmatimpl.h

 11: #include <thrust/reduce.h>
 12: #include <thrust/inner_product.h>

 14: #include <cusp/array1d.h>
 15: #include <cusp/print.h>
 16: #include <cusp/coo_matrix.h>
 17: #include <cusp/detail/device/reduce_by_key.h>

 19: #include <cusp/io/matrix_market.h>

 21: #include <thrust/iterator/counting_iterator.h>
 22: #include <thrust/iterator/transform_iterator.h>
 23: #include <thrust/iterator/permutation_iterator.h>
 24: #include <thrust/functional.h>

 26: // this example illustrates how to make repeated access to a range of values
 27: // examples:
 28: //   repeated_range([0, 1, 2, 3], 1) -> [0, 1, 2, 3]
 29: //   repeated_range([0, 1, 2, 3], 2) -> [0, 0, 1, 1, 2, 2, 3, 3]
 30: //   repeated_range([0, 1, 2, 3], 3) -> [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
 31: //   ...

 33: template <typename Iterator>
 34: class repeated_range
 35: {
 36:     public:

 38:     typedef typename thrust::iterator_difference<Iterator>::type difference_type;

 40:     struct repeat_functor : public thrust::unary_function<difference_type,difference_type>
 41:     {
 42:         difference_type repeats;

 44:         repeat_functor(difference_type repeats)
 45:             : repeats(repeats) {}

 47:         __host__ __device__
 48:         difference_type operator()(const difference_type& i) const
 49:         {
 50:             return i / repeats;
 51:         }
 52:     };

 54:     typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
 55:     typedef typename thrust::transform_iterator<repeat_functor, CountingIterator> TransformIterator;
 56:     typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;

 58:     // type of the repeated_range iterator
 59:     typedef PermutationIterator iterator;

 61:     // construct repeated_range for the range [first,last)
 62:     repeated_range(Iterator first, Iterator last, difference_type repeats)
 63:         : first(first), last(last), repeats(repeats) {}

 65:     iterator begin(void) const
 66:     {
 67:         return PermutationIterator(first, TransformIterator(CountingIterator(0), repeat_functor(repeats)));
 68:     }

 70:     iterator end(void) const
 71:     {
 72:         return begin() + repeats * (last - first);
 73:     }

 75:     protected:
 76:     difference_type repeats;
 77:     Iterator first;
 78:     Iterator last;

 80: };

 82: // this example illustrates how to repeat blocks in a range multiple times
 83: // examples:
 84: //   tiled_range([0, 1, 2, 3], 2)    -> [0, 1, 2, 3, 0, 1, 2, 3]
 85: //   tiled_range([0, 1, 2, 3], 4, 2) -> [0, 1, 2, 3, 0, 1, 2, 3]
 86: //   tiled_range([0, 1, 2, 3], 2, 2) -> [0, 1, 0, 1, 2, 3, 2, 3]
 87: //   tiled_range([0, 1, 2, 3], 2, 3) -> [0, 1, 0, 1 0, 1, 2, 3, 2, 3, 2, 3]
 88: //   ...

 90: template <typename Iterator>
 91: class tiled_range
 92: {
 93:     public:

 95:     typedef typename thrust::iterator_difference<Iterator>::type difference_type;

 97:     struct tile_functor : public thrust::unary_function<difference_type,difference_type>
 98:     {
 99:         difference_type repeats;
100:         difference_type tile_size;

102:         tile_functor(difference_type repeats, difference_type tile_size)
103:             : tile_size(tile_size), repeats(repeats) {}

105:         __host__ __device__
106:         difference_type operator()(const difference_type& i) const
107:         {
108:             return tile_size * (i / (tile_size * repeats)) + i % tile_size;
109:         }
110:     };

112:     typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
113:     typedef typename thrust::transform_iterator<tile_functor, CountingIterator>   TransformIterator;
114:     typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;

116:     // type of the tiled_range iterator
117:     typedef PermutationIterator iterator;

119:     // construct repeated_range for the range [first,last)
120:     tiled_range(Iterator first, Iterator last, difference_type repeats)
121:         : first(first), last(last), repeats(repeats), tile_size(last - first) {}

123:     tiled_range(Iterator first, Iterator last, difference_type repeats, difference_type tile_size)
124:         : first(first), last(last), repeats(repeats), tile_size(tile_size)
125:     {
126:       // ASSERT((last - first) % tile_size == 0)
127:     }

129:     iterator begin(void) const
130:     {
131:         return PermutationIterator(first, TransformIterator(CountingIterator(0), tile_functor(repeats, tile_size)));
132:     }

134:     iterator end(void) const
135:     {
136:         return begin() + repeats * (last - first);
137:     }

139:     protected:
140:     difference_type repeats;
141:     difference_type tile_size;
142:     Iterator first;
143:     Iterator last;
144: };

146: typedef cusp::device_memory memSpace;
147: typedef int   IndexType;
148: typedef float ValueType;
149: typedef cusp::array1d<IndexType, memSpace> IndexArray;
150: typedef cusp::array1d<ValueType, memSpace> ValueArray;
151: typedef IndexArray::iterator IndexArrayIterator;
152: typedef ValueArray::iterator ValueArrayIterator;

154: // Ne: Number of elements
155: // Nl: Number of dof per element
158: PetscErrorCode MatSetValuesBatch_SeqAIJCUSP(Mat J, PetscInt Ne, PetscInt Nl, PetscInt *elemRows, const PetscScalar *elemMats)
159: {
160:   size_t   N  = Ne * Nl;
161:   size_t   No = Ne * Nl*Nl;
162:   PetscInt Nr; // Number of rows

165:   // copy elemRows and elemMat to device
166:   IndexArray d_elemRows(elemRows, elemRows + N);
167:   ValueArray d_elemMats(elemMats, elemMats + No);

170:   MatGetSize(J, &Nr, PETSC_NULL);
171:   // allocate storage for "fat" COO representation of matrix
172:   PetscInfo1(J, "Making COO matrix of size %d\n", Nr);
173:   cusp::coo_matrix<IndexType,ValueType, memSpace> COO(Nr, Nr, No);

175:   // repeat elemRows entries Nl times
176:   PetscInfo(J, "Making row indices\n");
177:   repeated_range<IndexArrayIterator> rowInd(d_elemRows.begin(), d_elemRows.end(), Nl);
178:   thrust::copy(rowInd.begin(), rowInd.end(), COO.row_indices.begin());

180:   // tile rows of elemRows Nl times
181:   PetscInfo(J, "Making column indices\n");
182:   tiled_range<IndexArrayIterator> colInd(d_elemRows.begin(), d_elemRows.end(), Nl, Nl);
183:   thrust::copy(colInd.begin(), colInd.end(), COO.column_indices.begin());

185:   // copy values from elemMats into COO structure (could be avoided)
186:   thrust::copy(d_elemMats.begin(), d_elemMats.end(), COO.values.begin());

188:   // For MPIAIJ, split this into two COO matrices, and return both
189:   //   Need the column map

191:   // print the "fat" COO representation
192:   if (PetscLogPrintInfo) {cusp::print(COO);}

194:   // sort COO format by (i,j), this is the most costly step
195:   PetscInfo(J, "Sorting rows and columns\n");
196: #if 1
197:   COO.sort_by_row_and_column();
198: #else
199:   {
200:     PetscInfo(J, "  Making permutation\n");
201:     IndexArray permutation(No);
202:     thrust::sequence(permutation.begin(), permutation.end());

204:     // compute permutation and sort by (I,J)
205:     {
206:         PetscInfo(J, "  Sorting columns\n");
207:         IndexArray temp(No);
208:         thrust::copy(COO.column_indices.begin(), COO.column_indices.end(), temp.begin());
209:         thrust::stable_sort_by_key(temp.begin(), temp.end(), permutation.begin());
210:         PetscInfo(J, "    Sorted columns\n");
211:         if (PetscLogPrintInfo) {
212:           for(IndexArrayIterator t_iter = temp.begin(), p_iter = permutation.begin(); t_iter != temp.end(); ++t_iter, ++p_iter) {
213:             PetscInfo2(J, "%d(%d)\n", *t_iter, *p_iter);
214:           }
215:         }

217:         PetscInfo(J, "  Copying rows\n");
218:         //cusp::copy(COO.row_indices, temp);
219:         thrust::copy(COO.row_indices.begin(), COO.row_indices.end(), temp.begin());
220:         PetscInfo(J, "  Gathering rows\n");
221:         thrust::gather(permutation.begin(), permutation.end(), temp.begin(), COO.row_indices.begin());
222:         PetscInfo(J, "  Sorting rows\n");
223:         thrust::stable_sort_by_key(COO.row_indices.begin(), COO.row_indices.end(), permutation.begin());

225:         PetscInfo(J, "  Gathering columns\n");
226:         cusp::copy(COO.column_indices, temp);
227:         thrust::gather(permutation.begin(), permutation.end(), temp.begin(), COO.column_indices.begin());
228:     }

230:     // use permutation to reorder the values
231:     {
232:         PetscInfo(J, "  Sorting values\n");
233:         ValueArray temp(COO.values);
234:         cusp::copy(COO.values, temp);
235:         thrust::gather(permutation.begin(), permutation.end(), temp.begin(), COO.values.begin());
236:     }
237:   }
238: #endif

240:   // print the "fat" COO representation
241:   if (PetscLogPrintInfo) {cusp::print(COO);}

243:   // compute number of unique (i,j) entries
244:   //   this counts the number of changes as we move along the (i,j) list
245:   PetscInfo(J, "Computing number of unique entries\n");
246:   size_t num_entries = thrust::inner_product
247:     (thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.begin(), COO.column_indices.begin())),
248:      thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.end (),  COO.column_indices.end()))   - 1,
249:      thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.begin(), COO.column_indices.begin())) + 1,
250:      size_t(1),
251:      thrust::plus<size_t>(),
252:      thrust::not_equal_to< thrust::tuple<IndexType,IndexType> >());

254:   // allocate COO storage for final matrix
255:   PetscInfo(J, "Allocating compressed matrix\n");
256:   cusp::coo_matrix<IndexType, ValueType, memSpace> A(Nr, Nr, num_entries);

258:   // sum values with the same (i,j) index
259:   // XXX thrust::reduce_by_key is unoptimized right now, so we provide a SpMV-based one in cusp::detail
260:   //     the Cusp one is 2x faster, but still not optimal
261:   // This could possibly be done in-place
262:   PetscInfo(J, "Compressing matrix\n");
263: #if 1
264:   cusp::detail::device::reduce_by_key
265:     (thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.begin(), COO.column_indices.begin())),
266:      thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.end(),   COO.column_indices.end())),
267:      COO.values.begin(),
268:      thrust::make_zip_iterator(thrust::make_tuple(A.row_indices.begin(), A.column_indices.begin())),
269:      A.values.begin(),
270:      thrust::equal_to< thrust::tuple<IndexType,IndexType> >(),
271:      thrust::plus<ValueType>());
272: #else
273:   thrust::reduce_by_key
274:     (thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.begin(), COO.column_indices.begin())),
275:      thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.end(),   COO.column_indices.end())),
276:      COO.values.begin(),
277:      thrust::make_zip_iterator(thrust::make_tuple(A.row_indices.begin(), A.column_indices.begin())),
278:      A.values.begin(),
279:      thrust::equal_to< thrust::tuple<IndexType,IndexType> >(),
280:      thrust::plus<ValueType>());
281: #endif

283:   // print the final matrix
284:   if (PetscLogPrintInfo) {cusp::print(A);}

286:   //std::cout << "Writing matrix" << std::endl;
287:   //cusp::io::write_matrix_market_file(A, "A.mtx");

289:   PetscInfo(J, "Converting to PETSc matrix\n");
290:   MatSetType(J, MATSEQAIJCUSP);
291:   //cusp::csr_matrix<PetscInt,PetscScalar,cusp::device_memory> Jgpu;
292:   CUSPMATRIX *Jgpu = new CUSPMATRIX;
293:   cusp::convert(A, *Jgpu);
294:   if (PetscLogPrintInfo) {cusp::print(*Jgpu);}
295:   PetscInfo(J, "Copying to CPU matrix\n");
296:   MatCUSPCopyFromGPU(J, Jgpu);
297:   return(0);
298: }