Actual source code: aijAssemble.cu
1: #include "petscconf.h"
3: #include ../src/mat/impls/aij/seq/aij.h
4: #include petscbt.h
5: #include ../src/vec/vec/impls/dvecimpl.h
6: #include private/vecimpl.h
8: #undef VecType
9: #include ../src/mat/impls/aij/seq/seqcusp/cuspmatimpl.h
11: #include <thrust/reduce.h>
12: #include <thrust/inner_product.h>
14: #include <cusp/array1d.h>
15: #include <cusp/print.h>
16: #include <cusp/coo_matrix.h>
17: #include <cusp/detail/device/reduce_by_key.h>
19: #include <cusp/io/matrix_market.h>
21: #include <thrust/iterator/counting_iterator.h>
22: #include <thrust/iterator/transform_iterator.h>
23: #include <thrust/iterator/permutation_iterator.h>
24: #include <thrust/functional.h>
26: // this example illustrates how to make repeated access to a range of values
27: // examples:
28: // repeated_range([0, 1, 2, 3], 1) -> [0, 1, 2, 3]
29: // repeated_range([0, 1, 2, 3], 2) -> [0, 0, 1, 1, 2, 2, 3, 3]
30: // repeated_range([0, 1, 2, 3], 3) -> [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
31: // ...
33: template <typename Iterator>
34: class repeated_range
35: {
36: public:
38: typedef typename thrust::iterator_difference<Iterator>::type difference_type;
40: struct repeat_functor : public thrust::unary_function<difference_type,difference_type>
41: {
42: difference_type repeats;
44: repeat_functor(difference_type repeats)
45: : repeats(repeats) {}
47: __host__ __device__
48: difference_type operator()(const difference_type& i) const
49: {
50: return i / repeats;
51: }
52: };
54: typedef typename thrust::counting_iterator<difference_type> CountingIterator;
55: typedef typename thrust::transform_iterator<repeat_functor, CountingIterator> TransformIterator;
56: typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
58: // type of the repeated_range iterator
59: typedef PermutationIterator iterator;
61: // construct repeated_range for the range [first,last)
62: repeated_range(Iterator first, Iterator last, difference_type repeats)
63: : first(first), last(last), repeats(repeats) {}
65: iterator begin(void) const
66: {
67: return PermutationIterator(first, TransformIterator(CountingIterator(0), repeat_functor(repeats)));
68: }
70: iterator end(void) const
71: {
72: return begin() + repeats * (last - first);
73: }
75: protected:
76: difference_type repeats;
77: Iterator first;
78: Iterator last;
80: };
82: // this example illustrates how to repeat blocks in a range multiple times
83: // examples:
84: // tiled_range([0, 1, 2, 3], 2) -> [0, 1, 2, 3, 0, 1, 2, 3]
85: // tiled_range([0, 1, 2, 3], 4, 2) -> [0, 1, 2, 3, 0, 1, 2, 3]
86: // tiled_range([0, 1, 2, 3], 2, 2) -> [0, 1, 0, 1, 2, 3, 2, 3]
87: // tiled_range([0, 1, 2, 3], 2, 3) -> [0, 1, 0, 1 0, 1, 2, 3, 2, 3, 2, 3]
88: // ...
90: template <typename Iterator>
91: class tiled_range
92: {
93: public:
95: typedef typename thrust::iterator_difference<Iterator>::type difference_type;
97: struct tile_functor : public thrust::unary_function<difference_type,difference_type>
98: {
99: difference_type repeats;
100: difference_type tile_size;
102: tile_functor(difference_type repeats, difference_type tile_size)
103: : tile_size(tile_size), repeats(repeats) {}
105: __host__ __device__
106: difference_type operator()(const difference_type& i) const
107: {
108: return tile_size * (i / (tile_size * repeats)) + i % tile_size;
109: }
110: };
112: typedef typename thrust::counting_iterator<difference_type> CountingIterator;
113: typedef typename thrust::transform_iterator<tile_functor, CountingIterator> TransformIterator;
114: typedef typename thrust::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
116: // type of the tiled_range iterator
117: typedef PermutationIterator iterator;
119: // construct repeated_range for the range [first,last)
120: tiled_range(Iterator first, Iterator last, difference_type repeats)
121: : first(first), last(last), repeats(repeats), tile_size(last - first) {}
123: tiled_range(Iterator first, Iterator last, difference_type repeats, difference_type tile_size)
124: : first(first), last(last), repeats(repeats), tile_size(tile_size)
125: {
126: // ASSERT((last - first) % tile_size == 0)
127: }
129: iterator begin(void) const
130: {
131: return PermutationIterator(first, TransformIterator(CountingIterator(0), tile_functor(repeats, tile_size)));
132: }
134: iterator end(void) const
135: {
136: return begin() + repeats * (last - first);
137: }
139: protected:
140: difference_type repeats;
141: difference_type tile_size;
142: Iterator first;
143: Iterator last;
144: };
146: typedef cusp::device_memory memSpace;
147: typedef int IndexType;
148: typedef float ValueType;
149: typedef cusp::array1d<IndexType, memSpace> IndexArray;
150: typedef cusp::array1d<ValueType, memSpace> ValueArray;
151: typedef IndexArray::iterator IndexArrayIterator;
152: typedef ValueArray::iterator ValueArrayIterator;
154: // Ne: Number of elements
155: // Nl: Number of dof per element
158: PetscErrorCode MatSetValuesBatch_SeqAIJCUSP(Mat J, PetscInt Ne, PetscInt Nl, PetscInt *elemRows, const PetscScalar *elemMats)
159: {
160: size_t N = Ne * Nl;
161: size_t No = Ne * Nl*Nl;
162: PetscInt Nr; // Number of rows
165: // copy elemRows and elemMat to device
166: IndexArray d_elemRows(elemRows, elemRows + N);
167: ValueArray d_elemMats(elemMats, elemMats + No);
170: MatGetSize(J, &Nr, PETSC_NULL);
171: // allocate storage for "fat" COO representation of matrix
172: PetscInfo1(J, "Making COO matrix of size %d\n", Nr);
173: cusp::coo_matrix<IndexType,ValueType, memSpace> COO(Nr, Nr, No);
175: // repeat elemRows entries Nl times
176: PetscInfo(J, "Making row indices\n");
177: repeated_range<IndexArrayIterator> rowInd(d_elemRows.begin(), d_elemRows.end(), Nl);
178: thrust::copy(rowInd.begin(), rowInd.end(), COO.row_indices.begin());
180: // tile rows of elemRows Nl times
181: PetscInfo(J, "Making column indices\n");
182: tiled_range<IndexArrayIterator> colInd(d_elemRows.begin(), d_elemRows.end(), Nl, Nl);
183: thrust::copy(colInd.begin(), colInd.end(), COO.column_indices.begin());
185: // copy values from elemMats into COO structure (could be avoided)
186: thrust::copy(d_elemMats.begin(), d_elemMats.end(), COO.values.begin());
188: // For MPIAIJ, split this into two COO matrices, and return both
189: // Need the column map
191: // print the "fat" COO representation
192: if (PetscLogPrintInfo) {cusp::print(COO);}
194: // sort COO format by (i,j), this is the most costly step
195: PetscInfo(J, "Sorting rows and columns\n");
196: #if 1
197: COO.sort_by_row_and_column();
198: #else
199: {
200: PetscInfo(J, " Making permutation\n");
201: IndexArray permutation(No);
202: thrust::sequence(permutation.begin(), permutation.end());
204: // compute permutation and sort by (I,J)
205: {
206: PetscInfo(J, " Sorting columns\n");
207: IndexArray temp(No);
208: thrust::copy(COO.column_indices.begin(), COO.column_indices.end(), temp.begin());
209: thrust::stable_sort_by_key(temp.begin(), temp.end(), permutation.begin());
210: PetscInfo(J, " Sorted columns\n");
211: if (PetscLogPrintInfo) {
212: for(IndexArrayIterator t_iter = temp.begin(), p_iter = permutation.begin(); t_iter != temp.end(); ++t_iter, ++p_iter) {
213: PetscInfo2(J, "%d(%d)\n", *t_iter, *p_iter);
214: }
215: }
217: PetscInfo(J, " Copying rows\n");
218: //cusp::copy(COO.row_indices, temp);
219: thrust::copy(COO.row_indices.begin(), COO.row_indices.end(), temp.begin());
220: PetscInfo(J, " Gathering rows\n");
221: thrust::gather(permutation.begin(), permutation.end(), temp.begin(), COO.row_indices.begin());
222: PetscInfo(J, " Sorting rows\n");
223: thrust::stable_sort_by_key(COO.row_indices.begin(), COO.row_indices.end(), permutation.begin());
225: PetscInfo(J, " Gathering columns\n");
226: cusp::copy(COO.column_indices, temp);
227: thrust::gather(permutation.begin(), permutation.end(), temp.begin(), COO.column_indices.begin());
228: }
230: // use permutation to reorder the values
231: {
232: PetscInfo(J, " Sorting values\n");
233: ValueArray temp(COO.values);
234: cusp::copy(COO.values, temp);
235: thrust::gather(permutation.begin(), permutation.end(), temp.begin(), COO.values.begin());
236: }
237: }
238: #endif
240: // print the "fat" COO representation
241: if (PetscLogPrintInfo) {cusp::print(COO);}
243: // compute number of unique (i,j) entries
244: // this counts the number of changes as we move along the (i,j) list
245: PetscInfo(J, "Computing number of unique entries\n");
246: size_t num_entries = thrust::inner_product
247: (thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.begin(), COO.column_indices.begin())),
248: thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.end (), COO.column_indices.end())) - 1,
249: thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.begin(), COO.column_indices.begin())) + 1,
250: size_t(1),
251: thrust::plus<size_t>(),
252: thrust::not_equal_to< thrust::tuple<IndexType,IndexType> >());
254: // allocate COO storage for final matrix
255: PetscInfo(J, "Allocating compressed matrix\n");
256: cusp::coo_matrix<IndexType, ValueType, memSpace> A(Nr, Nr, num_entries);
258: // sum values with the same (i,j) index
259: // XXX thrust::reduce_by_key is unoptimized right now, so we provide a SpMV-based one in cusp::detail
260: // the Cusp one is 2x faster, but still not optimal
261: // This could possibly be done in-place
262: PetscInfo(J, "Compressing matrix\n");
263: #if 1
264: cusp::detail::device::reduce_by_key
265: (thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.begin(), COO.column_indices.begin())),
266: thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.end(), COO.column_indices.end())),
267: COO.values.begin(),
268: thrust::make_zip_iterator(thrust::make_tuple(A.row_indices.begin(), A.column_indices.begin())),
269: A.values.begin(),
270: thrust::equal_to< thrust::tuple<IndexType,IndexType> >(),
271: thrust::plus<ValueType>());
272: #else
273: thrust::reduce_by_key
274: (thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.begin(), COO.column_indices.begin())),
275: thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.end(), COO.column_indices.end())),
276: COO.values.begin(),
277: thrust::make_zip_iterator(thrust::make_tuple(A.row_indices.begin(), A.column_indices.begin())),
278: A.values.begin(),
279: thrust::equal_to< thrust::tuple<IndexType,IndexType> >(),
280: thrust::plus<ValueType>());
281: #endif
283: // print the final matrix
284: if (PetscLogPrintInfo) {cusp::print(A);}
286: //std::cout << "Writing matrix" << std::endl;
287: //cusp::io::write_matrix_market_file(A, "A.mtx");
289: PetscInfo(J, "Converting to PETSc matrix\n");
290: MatSetType(J, MATSEQAIJCUSP);
291: //cusp::csr_matrix<PetscInt,PetscScalar,cusp::device_memory> Jgpu;
292: CUSPMATRIX *Jgpu = new CUSPMATRIX;
293: cusp::convert(A, *Jgpu);
294: if (PetscLogPrintInfo) {cusp::print(*Jgpu);}
295: PetscInfo(J, "Copying to CPU matrix\n");
296: MatCUSPCopyFromGPU(J, Jgpu);
297: return(0);
298: }