#ifndef VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_MATRIX_HPP
#define VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_MATRIX_HPP

#include <map>
#include <string>

#include "viennacl/tools/tools.hpp"
#include "viennacl/ocl/kernel.hpp"
#include "viennacl/ocl/platform.hpp"
#include "viennacl/ocl/utils.hpp"

#include "viennacl/linalg/opencl/common.hpp"

/** @file viennacl/linalg/opencl/kernels/compressed_matrix.hpp
 *  @brief OpenCL kernel file for compressed_matrix operations */
namespace viennacl
{
  namespace linalg
  {
    namespace opencl
    {
      namespace kernels
      {

        template <typename StringType>
        void generate_compressed_matrix_block_trans_lu_backward(StringType & source, std::string const & numeric_string)
        {
          source.append("__kernel void block_trans_lu_backward( \n");
          source.append(" __global const unsigned int * row_jumper_U, \n"); //U part (note that U is transposed in memory)
          source.append(" __global const unsigned int * column_indices_U, \n");
          source.append(" __global const "); source.append(numeric_string); source.append(" * elements_U, \n");
          source.append(" __global const "); source.append(numeric_string); source.append(" * diagonal_U, \n");
          source.append(" __global const unsigned int * block_offsets, \n");
          source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
          source.append(" unsigned int size) \n");
          source.append("{ \n");
          source.append(" unsigned int col_start = block_offsets[2*get_group_id(0)]; \n");
          source.append(" unsigned int col_stop = block_offsets[2*get_group_id(0)+1]; \n");
          source.append(" unsigned int row_start; \n");
          source.append(" unsigned int row_stop; \n");
          source.append(" "); source.append(numeric_string); source.append(" result_entry = 0; \n");

          source.append(" if (col_start >= col_stop) \n");
          source.append(" return; \n");

          //backward elimination, using U and diagonal_U
          source.append(" for (unsigned int iter = 0; iter < col_stop - col_start; ++iter) \n");
          source.append(" { \n");
          source.append(" unsigned int col = (col_stop - iter) - 1; \n");
          source.append(" result_entry = result[col] / diagonal_U[col]; \n");
          source.append(" row_start = row_jumper_U[col]; \n");
          source.append(" row_stop = row_jumper_U[col + 1]; \n");
          source.append(" for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0)) \n");
          source.append(" result[column_indices_U[buffer_index]] -= result_entry * elements_U[buffer_index]; \n");
          source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
          source.append(" } \n");

          //divide result vector by diagonal:
          source.append(" for (unsigned int col = col_start + get_local_id(0); col < col_stop; col += get_local_size(0)) \n");
          source.append(" result[col] /= diagonal_U[col]; \n");
          source.append("} \n");
        }
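
        // Illustrative note (not from the original sources): each work-group handles one
        // block, whose half-open column range [col_start, col_stop) is stored as a pair of
        // entries in block_offsets. A hypothetical host-side setup for two blocks covering
        // columns [0, 100) and [100, 256) would therefore be:
        //
        //   std::vector<unsigned int> block_offsets = {0, 100, 100, 256};
        //   // work-group 0 reads entries 0 and 1, work-group 1 reads entries 2 and 3
        //
        // The same layout is assumed by block_trans_unit_lu_forward below.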

        template <typename StringType>
        void generate_compressed_matrix_block_trans_unit_lu_forward(StringType & source, std::string const & numeric_string)
        {
          source.append("__kernel void block_trans_unit_lu_forward( \n");
          source.append(" __global const unsigned int * row_jumper_L, \n"); //L part (note that L is transposed in memory)
          source.append(" __global const unsigned int * column_indices_L, \n");
          source.append(" __global const "); source.append(numeric_string); source.append(" * elements_L, \n");
          source.append(" __global const unsigned int * block_offsets, \n");
          source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
          source.append(" unsigned int size) \n");
          source.append("{ \n");
          source.append(" unsigned int col_start = block_offsets[2*get_group_id(0)]; \n");
          source.append(" unsigned int col_stop = block_offsets[2*get_group_id(0)+1]; \n");
          source.append(" unsigned int row_start = row_jumper_L[col_start]; \n");
          source.append(" unsigned int row_stop; \n");
          source.append(" "); source.append(numeric_string); source.append(" result_entry = 0; \n");

          source.append(" if (col_start >= col_stop) \n");
          source.append(" return; \n");

          //forward elimination, using L:
          source.append(" for (unsigned int col = col_start; col < col_stop; ++col) \n");
          source.append(" { \n");
          source.append(" result_entry = result[col]; \n");
          source.append(" row_stop = row_jumper_L[col + 1]; \n");
          source.append(" for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0)) \n");
          source.append(" result[column_indices_L[buffer_index]] -= result_entry * elements_L[buffer_index]; \n");
          source.append(" row_start = row_stop; \n"); //for next iteration (avoid unnecessary loads from GPU RAM)
          source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
          source.append(" } \n");

          source.append("} \n");
        }

        namespace detail
        {
          /** @brief Generate kernel for C = A * B with A being a compressed_matrix, B and C dense */
          template <typename StringType>
          void generate_compressed_matrix_dense_matrix_mult(StringType & source, std::string const & numeric_string,
                                                            bool B_transposed, bool B_row_major, bool C_row_major)
          {
            source.append("__kernel void ");
            source.append(viennacl::linalg::opencl::detail::sparse_dense_matmult_kernel_name(B_transposed, B_row_major, C_row_major));
            source.append("( \n");
            source.append(" __global const unsigned int * sp_mat_row_indices, \n");
            source.append(" __global const unsigned int * sp_mat_col_indices, \n");
            source.append(" __global const "); source.append(numeric_string); source.append(" * sp_mat_elements, \n");
            source.append(" __global const "); source.append(numeric_string); source.append(" * d_mat, \n");
            source.append(" unsigned int d_mat_row_start, \n");
            source.append(" unsigned int d_mat_col_start, \n");
            source.append(" unsigned int d_mat_row_inc, \n");
            source.append(" unsigned int d_mat_col_inc, \n");
            source.append(" unsigned int d_mat_row_size, \n");
            source.append(" unsigned int d_mat_col_size, \n");
            source.append(" unsigned int d_mat_internal_rows, \n");
            source.append(" unsigned int d_mat_internal_cols, \n");
            source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
            source.append(" unsigned int result_row_start, \n");
            source.append(" unsigned int result_col_start, \n");
            source.append(" unsigned int result_row_inc, \n");
            source.append(" unsigned int result_col_inc, \n");
            source.append(" unsigned int result_row_size, \n");
            source.append(" unsigned int result_col_size, \n");
            source.append(" unsigned int result_internal_rows, \n");
            source.append(" unsigned int result_internal_cols) { \n");

            // split work rows (sparse matrix rows) to thread groups
            source.append(" for (unsigned int row = get_group_id(0); row < result_row_size; row += get_num_groups(0)) { \n");

            source.append(" unsigned int row_start = sp_mat_row_indices[row]; \n");
            source.append(" unsigned int row_end = sp_mat_row_indices[row+1]; \n");

            // split result cols between threads in a thread group
            source.append(" for ( unsigned int col = get_local_id(0); col < result_col_size; col += get_local_size(0) ) { \n");

            source.append(" "); source.append(numeric_string); source.append(" r = 0; \n");

            source.append(" for (unsigned int k = row_start; k < row_end; ++k) { \n");

            source.append(" unsigned int j = sp_mat_col_indices[k]; \n");
            source.append(" "); source.append(numeric_string); source.append(" x = sp_mat_elements[k]; \n");

            source.append(" "); source.append(numeric_string);
            if (B_transposed && B_row_major)
              source.append(" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + j * d_mat_col_inc ]; \n");
            else if (B_transposed && !B_row_major)
              source.append(" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc) + (d_mat_col_start + j * d_mat_col_inc) * d_mat_internal_rows ]; \n");
            else if (!B_transposed && B_row_major)
              source.append(" y = d_mat[ (d_mat_row_start + j * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + col * d_mat_col_inc ]; \n");
            else
              source.append(" y = d_mat[ (d_mat_row_start + j * d_mat_row_inc) + (d_mat_col_start + col * d_mat_col_inc) * d_mat_internal_rows ]; \n");
            source.append(" r += x * y; \n");
            source.append(" } \n");

            if (C_row_major)
              source.append(" result[ (result_row_start + row * result_row_inc) * result_internal_cols + result_col_start + col * result_col_inc ] = r; \n");
            else
              source.append(" result[ (result_row_start + row * result_row_inc) + (result_col_start + col * result_col_inc) * result_internal_rows ] = r; \n");
            source.append(" } \n");
            source.append(" } \n");

            source.append("} \n");
          }
        }

        template <typename StringType>
        void generate_compressed_matrix_dense_matrix_multiplication(StringType & source, std::string const & numeric_string)
        {
          detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, false, false);
          detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, false, true);
          detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, true, false);
          detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, true, true);

          detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, false, false);
          detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, false, true);
          detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, true, false);
          detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, true, true);
        }
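
        // Illustrative sketch (an assumption, not from the original sources): the name of
        // each of the eight generated variants comes from
        // viennacl::linalg::opencl::detail::sparse_dense_matmult_kernel_name(B_transposed, B_row_major, C_row_major),
        // so a host program would fetch, e.g., the variant for row-major B and C via
        //
        //   viennacl::ocl::kernel & k =
        //       ctx.get_kernel(compressed_matrix<float>::program_name(),
        //                      viennacl::linalg::opencl::detail::sparse_dense_matmult_kernel_name(false, true, true));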

        template <typename StringType>
        void generate_compressed_matrix_jacobi(StringType & source, std::string const & numeric_string)
        {

          source.append(" __kernel void jacobi( \n");
          source.append(" __global const unsigned int * row_indices, \n");
          source.append(" __global const unsigned int * column_indices, \n");
          source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
          source.append(" "); source.append(numeric_string); source.append(" weight, \n");
          source.append(" __global const "); source.append(numeric_string); source.append(" * old_result, \n");
          source.append(" __global "); source.append(numeric_string); source.append(" * new_result, \n");
          source.append(" __global const "); source.append(numeric_string); source.append(" * rhs, \n");
          source.append(" unsigned int size) \n");
          source.append(" { \n");
          source.append(" "); source.append(numeric_string); source.append(" sum, diag=1; \n");
          source.append(" int col; \n");
          source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
          source.append(" { \n");
          source.append(" sum = 0; \n");
          source.append(" for (unsigned int j = row_indices[i]; j<row_indices[i+1]; j++) \n");
          source.append(" { \n");
          source.append(" col = column_indices[j]; \n");
          source.append(" if (i == col) \n");
          source.append(" diag = elements[j]; \n");
          source.append(" else \n");
          source.append(" sum += elements[j] * old_result[col]; \n");
          source.append(" } \n");
          source.append(" new_result[i] = weight * (rhs[i]-sum) / diag + (1-weight) * old_result[i]; \n");
          source.append(" } \n");
          source.append(" } \n");

        }
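
        // The jacobi kernel above performs one weighted (damped) Jacobi sweep; written out
        // for row i, with A given in CSR form by (row_indices, column_indices, elements):
        //
        //   new_result[i] = weight * (rhs[i] - sum_{j != i} A(i,j) * old_result[j]) / A(i,i)
        //                 + (1 - weight) * old_result[i]
        //
        // weight = 1 recovers the plain Jacobi iteration.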

        template <typename StringType>
        void generate_compressed_matrix_lu_backward(StringType & source, std::string const & numeric_string)
        {
          // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format
          source.append("__kernel void lu_backward( \n");
          source.append(" __global const unsigned int * row_indices, \n");
          source.append(" __global const unsigned int * column_indices, \n");
          source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
          source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
          source.append(" unsigned int size) \n");
          source.append("{ \n");
          source.append(" __local unsigned int col_index_buffer[128]; \n");
          source.append(" __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
          source.append(" __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");

          source.append(" unsigned int nnz = row_indices[size]; \n");
          source.append(" unsigned int current_row = size-1; \n");
          source.append(" unsigned int row_at_window_start = size-1; \n");
          source.append(" "); source.append(numeric_string); source.append(" current_vector_entry = vector[size-1]; \n");
          source.append(" "); source.append(numeric_string); source.append(" diagonal_entry = 0; \n");
          source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0); \n");
          source.append(" unsigned int next_row = row_indices[size-1]; \n");

          source.append(" unsigned int i = loop_end + get_local_id(0); \n");
          source.append(" while (1) \n");
          source.append(" { \n");
          //load into shared memory (coalesced access):
          source.append(" if (i < nnz) \n");
          source.append(" { \n");
          source.append(" element_buffer[get_local_id(0)] = elements[i]; \n");
          source.append(" unsigned int tmp = column_indices[i]; \n");
          source.append(" col_index_buffer[get_local_id(0)] = tmp; \n");
          source.append(" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
          source.append(" } \n");

          source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

          //now a single thread does the remaining work in shared memory:
          source.append(" if (get_local_id(0) == 0) \n");
          source.append(" { \n");
          // traverse through all the loaded data from back to front:
          source.append(" for (unsigned int k2=0; k2<get_local_size(0); ++k2) \n");
          source.append(" { \n");
          source.append(" unsigned int k = (get_local_size(0) - k2) - 1; \n");

          source.append(" if (i+k >= nnz) \n");
          source.append(" continue; \n");

          source.append(" if (col_index_buffer[k] > row_at_window_start) \n"); //use recently computed results
          source.append(" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
          source.append(" else if (col_index_buffer[k] > current_row) \n"); //use buffered data
          source.append(" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
          source.append(" else if (col_index_buffer[k] == current_row) \n");
          source.append(" diagonal_entry = element_buffer[k]; \n");

          source.append(" if (i+k == next_row) \n"); //current row is finished. Write back result
          source.append(" { \n");
          source.append(" vector[current_row] = current_vector_entry / diagonal_entry; \n");
          source.append(" if (current_row > 0) //load next row's data \n");
          source.append(" { \n");
          source.append(" --current_row; \n");
          source.append(" next_row = row_indices[current_row]; \n");
          source.append(" current_vector_entry = vector[current_row]; \n");
          source.append(" } \n");
          source.append(" } \n");

          source.append(" } \n"); // for k

          source.append(" row_at_window_start = current_row; \n");
          source.append(" } \n"); // if (get_local_id(0) == 0)

          source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");

          source.append(" if (i < get_local_size(0)) \n");
          source.append(" break; \n");

          source.append(" i -= get_local_size(0); \n");
          source.append(" } \n"); //for i
          source.append("} \n");

        }

        template <typename StringType>
        void generate_compressed_matrix_lu_forward(StringType & source, std::string const & numeric_string)
        {

          // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
          source.append("__kernel void lu_forward( \n");
          source.append(" __global const unsigned int * row_indices, \n");
          source.append(" __global const unsigned int * column_indices, \n");
          source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
          source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
          source.append(" unsigned int size) \n");
          source.append("{ \n");
          source.append(" __local unsigned int col_index_buffer[128]; \n");
          source.append(" __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
          source.append(" __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");

          source.append(" unsigned int nnz = row_indices[size]; \n");
          source.append(" unsigned int current_row = 0; \n");
          source.append(" unsigned int row_at_window_start = 0; \n");
          source.append(" "); source.append(numeric_string); source.append(" current_vector_entry = vector[0]; \n");
          source.append(" "); source.append(numeric_string); source.append(" diagonal_entry; \n");
          source.append(" unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0); \n");
          source.append(" unsigned int next_row = row_indices[1]; \n");

          source.append(" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
          source.append(" { \n");
          //load into shared memory (coalesced access):
          source.append(" if (i < nnz) \n");
          source.append(" { \n");
          source.append(" element_buffer[get_local_id(0)] = elements[i]; \n");
          source.append(" unsigned int tmp = column_indices[i]; \n");
          source.append(" col_index_buffer[get_local_id(0)] = tmp; \n");
          source.append(" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
          source.append(" } \n");

          source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

          //now a single thread does the remaining work in shared memory:
          source.append(" if (get_local_id(0) == 0) \n");
          source.append(" { \n");
          // traverse through all the loaded data:
          source.append(" for (unsigned int k=0; k<get_local_size(0); ++k) \n");
          source.append(" { \n");
          source.append(" if (current_row < size && i+k == next_row) \n"); //current row is finished. Write back result
          source.append(" { \n");
          source.append(" vector[current_row] = current_vector_entry / diagonal_entry; \n");
          source.append(" ++current_row; \n");
          source.append(" if (current_row < size) \n"); //load next row's data
          source.append(" { \n");
          source.append(" next_row = row_indices[current_row+1]; \n");
          source.append(" current_vector_entry = vector[current_row]; \n");
          source.append(" } \n");
          source.append(" } \n");

          source.append(" if (current_row < size && col_index_buffer[k] < current_row) \n"); //substitute
          source.append(" { \n");
          source.append(" if (col_index_buffer[k] < row_at_window_start) \n"); //use recently computed results
          source.append(" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
          source.append(" else if (col_index_buffer[k] < current_row) \n"); //use buffered data
          source.append(" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
          source.append(" } \n");
          source.append(" else if (col_index_buffer[k] == current_row) \n");
          source.append(" diagonal_entry = element_buffer[k]; \n");

          source.append(" } \n"); // for k

          source.append(" row_at_window_start = current_row; \n");
          source.append(" } \n"); // if (get_local_id(0) == 0)

          source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
          source.append(" } \n"); //for i
          source.append("} \n");

        }
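
        // Note on lu_forward/lu_backward above: all threads of a work-group cooperatively
        // stage a window of get_local_size(0) nonzeros (values, column indices, and the
        // vector entries they refer to) in local memory using coalesced loads; thread 0
        // then performs the inherently sequential triangular substitution on that window.
        // This trades parallelism within the substitution for efficient global memory access.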

        template <typename StringType>
        void generate_compressed_matrix_row_info_extractor(StringType & source, std::string const & numeric_string)
        {
          source.append("__kernel void row_info_extractor( \n");
          source.append(" __global const unsigned int * row_indices, \n");
          source.append(" __global const unsigned int * column_indices, \n");
          source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
          source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
          source.append(" unsigned int size, \n");
          source.append(" unsigned int option \n");
          source.append(" ) \n");
          source.append("{ \n");
          source.append(" for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0)) \n");
          source.append(" { \n");
          source.append(" "); source.append(numeric_string); source.append(" value = 0; \n");
          source.append(" unsigned int row_end = row_indices[row+1]; \n");

          source.append(" switch (option) \n");
          source.append(" { \n");
          source.append(" case 0: \n"); //inf-norm
          source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
          source.append(" value = max(value, fabs(elements[i])); \n");
          source.append(" break; \n");

          source.append(" case 1: \n"); //1-norm
          source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
          source.append(" value += fabs(elements[i]); \n");
          source.append(" break; \n");

          source.append(" case 2: \n"); //2-norm
          source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
          source.append(" value += elements[i] * elements[i]; \n");
          source.append(" value = sqrt(value); \n");
          source.append(" break; \n");

          source.append(" case 3: \n"); //diagonal entry
          source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
          source.append(" { \n");
          source.append(" if (column_indices[i] == row) \n");
          source.append(" { \n");
          source.append(" value = elements[i]; \n");
          source.append(" break; \n");
          source.append(" } \n");
          source.append(" } \n");
          source.append(" break; \n");

          source.append(" default: \n");
          source.append(" break; \n");
          source.append(" } \n");
          source.append(" result[row] = value; \n");
          source.append(" } \n");
          source.append("} \n");

        }

        template <typename StringType>
        void generate_compressed_matrix_trans_lu_backward(StringType & source, std::string const & numeric_string)
        {

          // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format (U is stored transposed)
          source.append("__kernel void trans_lu_backward( \n");
          source.append(" __global const unsigned int * row_indices, \n");
          source.append(" __global const unsigned int * column_indices, \n");
          source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
          source.append(" __global const "); source.append(numeric_string); source.append(" * diagonal_entries, \n");
          source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
          source.append(" unsigned int size) \n");
          source.append("{ \n");
          source.append(" __local unsigned int row_index_lookahead[256]; \n");
          source.append(" __local unsigned int row_index_buffer[256]; \n");

          source.append(" unsigned int row_index; \n");
          source.append(" unsigned int col_index; \n");
          source.append(" "); source.append(numeric_string); source.append(" matrix_entry; \n");
          source.append(" unsigned int nnz = row_indices[size]; \n");
          source.append(" unsigned int row_at_window_start = size; \n");
          source.append(" unsigned int row_at_window_end; \n");
          source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");

          source.append(" for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0)) \n");
          source.append(" { \n");
          source.append(" unsigned int i = (nnz - i2) - 1; \n");
          source.append(" col_index = (i2 < nnz) ? column_indices[i] : 0; \n");
          source.append(" matrix_entry = (i2 < nnz) ? elements[i] : 0; \n");
          source.append(" row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0; \n");

          source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

          source.append(" if (i2 < nnz) \n");
          source.append(" { \n");
          source.append(" unsigned int row_index_dec = 0; \n");
          source.append(" while (row_index_lookahead[row_index_dec] > i) \n");
          source.append(" ++row_index_dec; \n");
          source.append(" row_index = row_at_window_start - row_index_dec; \n");
          source.append(" row_index_buffer[get_local_id(0)] = row_index; \n");
          source.append(" } \n");
          source.append(" else \n");
          source.append(" { \n");
          source.append(" row_index = size+1; \n");
          source.append(" row_index_buffer[get_local_id(0)] = 0; \n");
          source.append(" } \n");

          source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

          source.append(" row_at_window_start = row_index_buffer[0]; \n");
          source.append(" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");

          //backward elimination
          source.append(" for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n");
          source.append(" { \n");
          source.append(" unsigned int row = row_at_window_start - row2; \n");
          source.append(" "); source.append(numeric_string); source.append(" result_entry = vector[row] / diagonal_entries[row]; \n");

          source.append(" if ( (row_index == row) && (col_index < row) ) \n");
          source.append(" vector[col_index] -= result_entry * matrix_entry; \n");

          source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
          source.append(" } \n");

          source.append(" row_at_window_start = row_at_window_end; \n");
          source.append(" } \n");

          // final step: Divide vector by diagonal entries:
          source.append(" for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0)) \n");
          source.append(" vector[i] /= diagonal_entries[i]; \n");
          source.append("} \n");

        }

        template <typename StringType>
        void generate_compressed_matrix_trans_lu_forward(StringType & source, std::string const & numeric_string)
        {

          // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format (L is stored transposed)
          source.append("__kernel void trans_lu_forward( \n");
          source.append(" __global const unsigned int * row_indices, \n");
          source.append(" __global const unsigned int * column_indices, \n");
          source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
          source.append(" __global const "); source.append(numeric_string); source.append(" * diagonal_entries, \n");
          source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
          source.append(" unsigned int size) \n");
          source.append("{ \n");
          source.append(" __local unsigned int row_index_lookahead[256]; \n");
          source.append(" __local unsigned int row_index_buffer[256]; \n");

          source.append(" unsigned int row_index; \n");
          source.append(" unsigned int col_index; \n");
          source.append(" "); source.append(numeric_string); source.append(" matrix_entry; \n");
          source.append(" unsigned int nnz = row_indices[size]; \n");
          source.append(" unsigned int row_at_window_start = 0; \n");
          source.append(" unsigned int row_at_window_end = 0; \n");
          source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");

          source.append(" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
          source.append(" { \n");
          source.append(" col_index = (i < nnz) ? column_indices[i] : 0; \n");
          source.append(" matrix_entry = (i < nnz) ? elements[i] : 0; \n");
          source.append(" row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : size - 1; \n");

          source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

          source.append(" if (i < nnz) \n");
          source.append(" { \n");
          source.append(" unsigned int row_index_inc = 0; \n");
          source.append(" while (i >= row_index_lookahead[row_index_inc + 1]) \n");
          source.append(" ++row_index_inc; \n");
          source.append(" row_index = row_at_window_start + row_index_inc; \n");
          source.append(" row_index_buffer[get_local_id(0)] = row_index; \n");
          source.append(" } \n");
          source.append(" else \n");
          source.append(" { \n");
          source.append(" row_index = size+1; \n");
          source.append(" row_index_buffer[get_local_id(0)] = size - 1; \n");
          source.append(" } \n");

          source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

          source.append(" row_at_window_start = row_index_buffer[0]; \n");
          source.append(" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");

          //forward elimination
          source.append(" for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n");
          source.append(" { \n");
          source.append(" "); source.append(numeric_string); source.append(" result_entry = vector[row] / diagonal_entries[row]; \n");

          source.append(" if ( (row_index == row) && (col_index > row) ) \n");
          source.append(" vector[col_index] -= result_entry * matrix_entry; \n");

          source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
          source.append(" } \n");

          source.append(" row_at_window_start = row_at_window_end; \n");
          source.append(" } \n");

          // final step: Divide vector by diagonal entries:
          source.append(" for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0)) \n");
          source.append(" vector[i] /= diagonal_entries[i]; \n");
          source.append("} \n");

        }

        template <typename StringType>
        void generate_compressed_matrix_trans_unit_lu_backward(StringType & source, std::string const & numeric_string)
        {

          // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format (unit diagonal, U stored transposed)
          source.append("__kernel void trans_unit_lu_backward( \n");
          source.append(" __global const unsigned int * row_indices, \n");
          source.append(" __global const unsigned int * column_indices, \n");
          source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
          source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
          source.append(" unsigned int size) \n");
          source.append("{ \n");
          source.append(" __local unsigned int row_index_lookahead[256]; \n");
          source.append(" __local unsigned int row_index_buffer[256]; \n");

          source.append(" unsigned int row_index; \n");
          source.append(" unsigned int col_index; \n");
          source.append(" "); source.append(numeric_string); source.append(" matrix_entry; \n");
          source.append(" unsigned int nnz = row_indices[size]; \n");
          source.append(" unsigned int row_at_window_start = size; \n");
          source.append(" unsigned int row_at_window_end; \n");
          source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");

          source.append(" for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0)) \n");
          source.append(" { \n");
          source.append(" unsigned int i = (nnz - i2) - 1; \n");
          source.append(" col_index = (i2 < nnz) ? column_indices[i] : 0; \n");
          source.append(" matrix_entry = (i2 < nnz) ? elements[i] : 0; \n");
          source.append(" row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0; \n");

          source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

          source.append(" if (i2 < nnz) \n");
          source.append(" { \n");
          source.append(" unsigned int row_index_dec = 0; \n");
          source.append(" while (row_index_lookahead[row_index_dec] > i) \n");
          source.append(" ++row_index_dec; \n");
          source.append(" row_index = row_at_window_start - row_index_dec; \n");
          source.append(" row_index_buffer[get_local_id(0)] = row_index; \n");
          source.append(" } \n");
          source.append(" else \n");
          source.append(" { \n");
          source.append(" row_index = size+1; \n");
          source.append(" row_index_buffer[get_local_id(0)] = 0; \n");
          source.append(" } \n");

          source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

          source.append(" row_at_window_start = row_index_buffer[0]; \n");
          source.append(" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");

          //backward elimination
          source.append(" for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n");
          source.append(" { \n");
          source.append(" unsigned int row = row_at_window_start - row2; \n");
          source.append(" "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n");

          source.append(" if ( (row_index == row) && (col_index < row) ) \n");
          source.append(" vector[col_index] -= result_entry * matrix_entry; \n");

          source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
          source.append(" } \n");

          source.append(" row_at_window_start = row_at_window_end; \n");
          source.append(" } \n");
          source.append("} \n");

        }

        template <typename StringType>
        void generate_compressed_matrix_trans_unit_lu_forward(StringType & source, std::string const & numeric_string)
        {

          // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format (unit diagonal, L stored transposed)
          source.append("__kernel void trans_unit_lu_forward( \n");
          source.append(" __global const unsigned int * row_indices, \n");
          source.append(" __global const unsigned int * column_indices, \n");
          source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
          source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
          source.append(" unsigned int size) \n");
          source.append("{ \n");
          source.append(" __local unsigned int row_index_lookahead[256]; \n");
          source.append(" __local unsigned int row_index_buffer[256]; \n");

          source.append(" unsigned int row_index; \n");
          source.append(" unsigned int col_index; \n");
          source.append(" "); source.append(numeric_string); source.append(" matrix_entry; \n");
          source.append(" unsigned int nnz = row_indices[size]; \n");
          source.append(" unsigned int row_at_window_start = 0; \n");
          source.append(" unsigned int row_at_window_end = 0; \n");
          source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");

          source.append(" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
          source.append(" { \n");
          source.append(" col_index = (i < nnz) ? column_indices[i] : 0; \n");
          source.append(" matrix_entry = (i < nnz) ? elements[i] : 0; \n");
          source.append(" row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : size - 1; \n");

          source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

          source.append(" if (i < nnz) \n");
          source.append(" { \n");
          source.append(" unsigned int row_index_inc = 0; \n");
          source.append(" while (i >= row_index_lookahead[row_index_inc + 1]) \n");
          source.append(" ++row_index_inc; \n");
          source.append(" row_index = row_at_window_start + row_index_inc; \n");
          source.append(" row_index_buffer[get_local_id(0)] = row_index; \n");
          source.append(" } \n");
          source.append(" else \n");
          source.append(" { \n");
          source.append(" row_index = size+1; \n");
          source.append(" row_index_buffer[get_local_id(0)] = size - 1; \n");
          source.append(" } \n");

          source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

          source.append(" row_at_window_start = row_index_buffer[0]; \n");
          source.append(" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");

          //forward elimination
          source.append(" for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n");
          source.append(" { \n");
          source.append(" "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n");

          source.append(" if ( (row_index == row) && (col_index > row) ) \n");
          source.append(" vector[col_index] -= result_entry * matrix_entry; \n");

          source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
          source.append(" } \n");

          source.append(" row_at_window_start = row_at_window_end; \n");
          source.append(" } \n");
          source.append("} \n");

        }

        template <typename StringType>
        void generate_compressed_matrix_trans_unit_lu_forward_slow(StringType & source, std::string const & numeric_string)
        {

          // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format (unit diagonal, L stored transposed)
          source.append("__kernel void trans_unit_lu_forward_slow( \n");
          source.append(" __global const unsigned int * row_indices, \n");
          source.append(" __global const unsigned int * column_indices, \n");
          source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
          source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
          source.append(" unsigned int size) \n");
          source.append("{ \n");
          source.append(" for (unsigned int row = 0; row < size; ++row) \n");
          source.append(" { \n");
          source.append(" "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n");

          source.append(" unsigned int row_start = row_indices[row]; \n");
          source.append(" unsigned int row_stop = row_indices[row + 1]; \n");
          source.append(" for (unsigned int entry_index = row_start + get_local_id(0); entry_index < row_stop; entry_index += get_local_size(0)) \n");
          source.append(" { \n");
          source.append(" unsigned int col_index = column_indices[entry_index]; \n");
          source.append(" if (col_index > row) \n");
          source.append(" vector[col_index] -= result_entry * elements[entry_index]; \n");
          source.append(" } \n");

          source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
          source.append(" } \n");
          source.append("} \n");

        }

        template <typename StringType>
        void generate_compressed_matrix_unit_lu_backward(StringType & source, std::string const & numeric_string)
        {

          // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format (unit diagonal)
          source.append("__kernel void unit_lu_backward( \n");
          source.append(" __global const unsigned int * row_indices, \n");
          source.append(" __global const unsigned int * column_indices, \n");
          source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
          source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
          source.append(" unsigned int size) \n");
          source.append("{ \n");
          source.append(" __local unsigned int col_index_buffer[128]; \n");
          source.append(" __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
          source.append(" __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");

          source.append(" unsigned int nnz = row_indices[size]; \n");
          source.append(" unsigned int current_row = size-1; \n");
          source.append(" unsigned int row_at_window_start = size-1; \n");
          source.append(" "); source.append(numeric_string); source.append(" current_vector_entry = vector[size-1]; \n");
          source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0); \n");
          source.append(" unsigned int next_row = row_indices[size-1]; \n");

          source.append(" unsigned int i = loop_end + get_local_id(0); \n");
          source.append(" while (1) \n");
          source.append(" { \n");
          //load into shared memory (coalesced access):
          source.append(" if (i < nnz) \n");
          source.append(" { \n");
          source.append(" element_buffer[get_local_id(0)] = elements[i]; \n");
          source.append(" unsigned int tmp = column_indices[i]; \n");
          source.append(" col_index_buffer[get_local_id(0)] = tmp; \n");
          source.append(" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
          source.append(" } \n");

          source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

          //now a single thread does the remaining work in shared memory:
          source.append(" if (get_local_id(0) == 0) \n");
          source.append(" { \n");
          // traverse through all the loaded data from back to front:
          source.append(" for (unsigned int k2=0; k2<get_local_size(0); ++k2) \n");
          source.append(" { \n");
          source.append(" unsigned int k = (get_local_size(0) - k2) - 1; \n");

          source.append(" if (i+k >= nnz) \n");
          source.append(" continue; \n");

          source.append(" if (col_index_buffer[k] > row_at_window_start) \n"); //use recently computed results
          source.append(" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
          source.append(" else if (col_index_buffer[k] > current_row) \n"); //use buffered data
          source.append(" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");

          source.append(" if (i+k == next_row) \n"); //current row is finished. Write back result
          source.append(" { \n");
          source.append(" vector[current_row] = current_vector_entry; \n");
          source.append(" if (current_row > 0) \n"); //load next row's data
          source.append(" { \n");
          source.append(" --current_row; \n");
          source.append(" next_row = row_indices[current_row]; \n");
          source.append(" current_vector_entry = vector[current_row]; \n");
          source.append(" } \n");
          source.append(" } \n");

          source.append(" } \n"); // for k

          source.append(" row_at_window_start = current_row; \n");
          source.append(" } \n"); // if (get_local_id(0) == 0)

          source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");

          source.append(" if (i < get_local_size(0)) \n");
          source.append(" break; \n");

          source.append(" i -= get_local_size(0); \n");
          source.append(" } \n"); //for i
          source.append("} \n");

        }

        template <typename StringType>
        void generate_compressed_matrix_unit_lu_forward(StringType & source, std::string const & numeric_string)
        {

          // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format (unit diagonal)
          source.append("__kernel void unit_lu_forward( \n");
          source.append(" __global const unsigned int * row_indices, \n");
          source.append(" __global const unsigned int * column_indices, \n");
          source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
          source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
          source.append(" unsigned int size) \n");
          source.append("{ \n");
          source.append(" __local unsigned int col_index_buffer[128]; \n");
          source.append(" __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
          source.append(" __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");

          source.append(" unsigned int nnz = row_indices[size]; \n");
          source.append(" unsigned int current_row = 0; \n");
          source.append(" unsigned int row_at_window_start = 0; \n");
          source.append(" "); source.append(numeric_string); source.append(" current_vector_entry = vector[0]; \n");
          source.append(" unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0); \n");
          source.append(" unsigned int next_row = row_indices[1]; \n");

          source.append(" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
          source.append(" { \n");
          //load into shared memory (coalesced access):
          source.append(" if (i < nnz) \n");
          source.append(" { \n");
          source.append(" element_buffer[get_local_id(0)] = elements[i]; \n");
          source.append(" unsigned int tmp = column_indices[i]; \n");
          source.append(" col_index_buffer[get_local_id(0)] = tmp; \n");
          source.append(" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
          source.append(" } \n");

          source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

          //now a single thread does the remaining work in shared memory:
          source.append(" if (get_local_id(0) == 0) \n");
          source.append(" { \n");
          // traverse through all the loaded data:
          source.append(" for (unsigned int k=0; k<get_local_size(0); ++k) \n");
          source.append(" { \n");
          source.append(" if (i+k == next_row) \n"); //current row is finished. Write back result
          source.append(" { \n");
          source.append(" vector[current_row] = current_vector_entry; \n");
          source.append(" ++current_row; \n");
          source.append(" if (current_row < size) //load next row's data \n");
          source.append(" { \n");
          source.append(" next_row = row_indices[current_row+1]; \n");
          source.append(" current_vector_entry = vector[current_row]; \n");
          source.append(" } \n");
          source.append(" } \n");

          source.append(" if (current_row < size && col_index_buffer[k] < current_row) \n"); //substitute
          source.append(" { \n");
          source.append(" if (col_index_buffer[k] < row_at_window_start) \n"); //use recently computed results
          source.append(" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
          source.append(" else if (col_index_buffer[k] < current_row) \n"); //use buffered data
          source.append(" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
          source.append(" } \n");

          source.append(" } \n"); // for k

          source.append(" row_at_window_start = current_row; \n");
          source.append(" } \n"); // if (get_local_id(0) == 0)

          source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
          source.append(" } //for i \n");
          source.append("} \n");

        }

        template <typename StringType>
        void generate_compressed_matrix_vec_mul(StringType & source, std::string const & numeric_string)
        {

          source.append("__kernel void vec_mul( \n");
          source.append(" __global const unsigned int * row_indices, \n");
          source.append(" __global const unsigned int * column_indices, \n");
          source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
          source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
          source.append(" uint4 layout_x, \n");
          source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
          source.append(" uint4 layout_result) \n");
          source.append("{ \n");
          source.append(" for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
          source.append(" { \n");
          source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
          source.append(" unsigned int row_end = row_indices[row+1]; \n");
          source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
          source.append(" dot_prod += elements[i] * x[column_indices[i] * layout_x.y + layout_x.x]; \n");
          source.append(" result[row * layout_result.y + layout_result.x] = dot_prod; \n");
          source.append(" } \n");
          source.append("} \n");

        }
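
        // Illustrative note (an assumption, not from the original sources): as read by the
        // kernel body, the uint4 layout arguments follow the usual ViennaCL vector layout
        // convention, i.e. .x = start offset, .y = stride, .z = size (.w is not accessed
        // here). A plain y = A*x over n rows would thus pass
        // layout_x = layout_result = (0, 1, n, n).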

        template <typename StringType>
        void generate_compressed_matrix_vec_mul4(StringType & source, std::string const & numeric_string)
        {
          source.append("__kernel void vec_mul4( \n");
          source.append(" __global const unsigned int * row_indices, \n");
          source.append(" __global const uint4 * column_indices, \n");
          source.append(" __global const "); source.append(numeric_string); source.append("4 * elements, \n");
          source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
          source.append(" uint4 layout_x, \n");
          source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
          source.append(" uint4 layout_result) \n");
          source.append("{ \n");
          source.append(" "); source.append(numeric_string); source.append(" dot_prod; \n");
          source.append(" unsigned int start, next_stop; \n");
          source.append(" uint4 col_idx; \n");
          source.append(" "); source.append(numeric_string); source.append("4 tmp_vec; \n");
          source.append(" "); source.append(numeric_string); source.append("4 tmp_entries; \n");

          source.append(" for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
          source.append(" { \n");
          source.append(" dot_prod = 0; \n");
          source.append(" start = row_indices[row] / 4; \n");
          source.append(" next_stop = row_indices[row+1] / 4; \n");

          source.append(" for (unsigned int i = start; i < next_stop; ++i) \n");
          source.append(" { \n");
          source.append(" col_idx = column_indices[i]; \n");

          source.append(" tmp_entries = elements[i]; \n");
          source.append(" tmp_vec.x = x[col_idx.x * layout_x.y + layout_x.x]; \n");
          source.append(" tmp_vec.y = x[col_idx.y * layout_x.y + layout_x.x]; \n");
          source.append(" tmp_vec.z = x[col_idx.z * layout_x.y + layout_x.x]; \n");
          source.append(" tmp_vec.w = x[col_idx.w * layout_x.y + layout_x.x]; \n");

          source.append(" dot_prod += dot(tmp_entries, tmp_vec); \n");
          source.append(" } \n");
          source.append(" result[row * layout_result.y + layout_result.x] = dot_prod; \n");
          source.append(" } \n");
          source.append("} \n");
        }

        template <typename StringType>
        void generate_compressed_matrix_vec_mul8(StringType & source, std::string const & numeric_string)
        {
          source.append("__kernel void vec_mul8( \n");
          source.append(" __global const unsigned int * row_indices, \n");
          source.append(" __global const uint8 * column_indices, \n");
          source.append(" __global const "); source.append(numeric_string); source.append("8 * elements, \n");
          source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
          source.append(" uint4 layout_x, \n");
          source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
          source.append(" uint4 layout_result) \n");
          source.append("{ \n");
          source.append(" "); source.append(numeric_string); source.append(" dot_prod; \n");
          source.append(" unsigned int start, next_stop; \n");
          source.append(" uint8 col_idx; \n");
          source.append(" "); source.append(numeric_string); source.append("8 tmp_vec; \n");
          source.append(" "); source.append(numeric_string); source.append("8 tmp_entries; \n");

          source.append(" for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
          source.append(" { \n");
          source.append(" dot_prod = 0; \n");
          source.append(" start = row_indices[row] / 8; \n");
          source.append(" next_stop = row_indices[row+1] / 8; \n");

          source.append(" for (unsigned int i = start; i < next_stop; ++i) \n");
          source.append(" { \n");
          source.append(" col_idx = column_indices[i]; \n");

          source.append(" tmp_entries = elements[i]; \n");
          source.append(" tmp_vec.s0 = x[col_idx.s0 * layout_x.y + layout_x.x]; \n");
          source.append(" tmp_vec.s1 = x[col_idx.s1 * layout_x.y + layout_x.x]; \n");
          source.append(" tmp_vec.s2 = x[col_idx.s2 * layout_x.y + layout_x.x]; \n");
          source.append(" tmp_vec.s3 = x[col_idx.s3 * layout_x.y + layout_x.x]; \n");
          source.append(" tmp_vec.s4 = x[col_idx.s4 * layout_x.y + layout_x.x]; \n");
          source.append(" tmp_vec.s5 = x[col_idx.s5 * layout_x.y + layout_x.x]; \n");
          source.append(" tmp_vec.s6 = x[col_idx.s6 * layout_x.y + layout_x.x]; \n");
          source.append(" tmp_vec.s7 = x[col_idx.s7 * layout_x.y + layout_x.x]; \n");

          source.append(" dot_prod += dot(tmp_entries.lo, tmp_vec.lo); \n");
          source.append(" dot_prod += dot(tmp_entries.hi, tmp_vec.hi); \n");
          source.append(" } \n");
          source.append(" result[row * layout_result.y + layout_result.x] = dot_prod; \n");
          source.append(" } \n");
          source.append("} \n");
        }
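
        // Note on vec_mul4/vec_mul8 above: both read column_indices and elements as uint4/uint8
        // packets and divide the row pointers by 4 resp. 8, so they assume a padded CSR layout
        // in which every row_indices[] entry is a multiple of 4 resp. 8 (as produced by the
        // aligned compressed_matrix variants).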

        template <typename StringType>
        void generate_compressed_matrix_vec_mul_cpu(StringType & source, std::string const & numeric_string)
        {
          source.append("__kernel void vec_mul_cpu( \n");
          source.append(" __global const unsigned int * row_indices, \n");
          source.append(" __global const unsigned int * column_indices, \n");
          source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
          source.append(" __global const "); source.append(numeric_string); source.append(" * vector, \n");
          source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
          source.append(" unsigned int size) \n");
          source.append("{ \n");
          source.append(" unsigned int work_per_item = max((uint) (size / get_global_size(0)), (uint) 1); \n");
          source.append(" unsigned int row_start = get_global_id(0) * work_per_item; \n");
          source.append(" unsigned int row_stop = min( (uint) ((get_global_id(0) + 1) * work_per_item), (uint) size); \n");
          source.append(" for (unsigned int row = row_start; row < row_stop; ++row) \n");
          source.append(" { \n");
          source.append(" "); source.append(numeric_string); source.append(" dot_prod = ("); source.append(numeric_string); source.append(")0; \n");
          source.append(" unsigned int row_end = row_indices[row+1]; \n");
          source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
          source.append(" dot_prod += elements[i] * vector[column_indices[i]]; \n");
          source.append(" result[row] = dot_prod; \n");
          source.append(" } \n");
          source.append("} \n");

        }


        /** @brief Main kernel class for generating OpenCL kernels for compressed_matrix. */
        template <typename NumericT>
        struct compressed_matrix
        {
          static std::string program_name()
          {
            return viennacl::ocl::type_to_string<NumericT>::apply() + "_compressed_matrix";
          }

          static void init(viennacl::ocl::context & ctx)
          {
            viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
            std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();

            static std::map<cl_context, bool> init_done;
            if (!init_done[ctx.handle().get()])
            {
              std::string source;
              source.reserve(1024);

              viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);

              if (numeric_string == "float" || numeric_string == "double")
              {
                generate_compressed_matrix_block_trans_lu_backward(source, numeric_string);
                generate_compressed_matrix_block_trans_unit_lu_forward(source, numeric_string);
                generate_compressed_matrix_jacobi(source, numeric_string);
                generate_compressed_matrix_lu_backward(source, numeric_string);
                generate_compressed_matrix_lu_forward(source, numeric_string);
                generate_compressed_matrix_trans_lu_backward(source, numeric_string);
                generate_compressed_matrix_trans_lu_forward(source, numeric_string);
                generate_compressed_matrix_trans_unit_lu_backward(source, numeric_string);
                generate_compressed_matrix_trans_unit_lu_forward(source, numeric_string);
                generate_compressed_matrix_trans_unit_lu_forward_slow(source, numeric_string);
                generate_compressed_matrix_unit_lu_backward(source, numeric_string);
                generate_compressed_matrix_unit_lu_forward(source, numeric_string);
              }
              generate_compressed_matrix_dense_matrix_multiplication(source, numeric_string);
              generate_compressed_matrix_row_info_extractor(source, numeric_string);
              generate_compressed_matrix_vec_mul(source, numeric_string);
              generate_compressed_matrix_vec_mul4(source, numeric_string);
              generate_compressed_matrix_vec_mul8(source, numeric_string);
              generate_compressed_matrix_vec_mul_cpu(source, numeric_string);

              std::string prog_name = program_name();
              #ifdef VIENNACL_BUILD_INFO
              std::cout << "Creating program " << prog_name << std::endl;
              #endif
              ctx.add_program(source, prog_name);
              init_done[ctx.handle().get()] = true;
            } //if
          } //init
        };
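
        // Example usage (an illustrative sketch, not from the original sources; assumes an
        // OpenCL context already set up by ViennaCL):
        //
        //   viennacl::ocl::context & ctx = viennacl::ocl::current_context();
        //   viennacl::linalg::opencl::kernels::compressed_matrix<float>::init(ctx); // compiles the program once per context
        //   viennacl::ocl::kernel & k = ctx.get_kernel(
        //       viennacl::linalg::opencl::kernels::compressed_matrix<float>::program_name(), "vec_mul");
        //
        // The kernel k can then be enqueued with the CSR buffers and layout arguments
        // matching the vec_mul signature generated above.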

      } // namespace kernels
    } // namespace opencl
  } // namespace linalg
} // namespace viennacl
#endif