1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_MATRIX_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_MATRIX_HPP
24 template <
typename StringType>
27 source.append(
"__kernel void block_trans_lu_backward( \n");
28 source.append(
" __global const unsigned int * row_jumper_U, \n");
29 source.append(
" __global const unsigned int * column_indices_U, \n");
30 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements_U, \n");
31 source.append(
" __global const "); source.append(numeric_string); source.append(
" * diagonal_U, \n");
32 source.append(
" __global const unsigned int * block_offsets, \n");
33 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
34 source.append(
" unsigned int size) \n");
35 source.append(
"{ \n");
36 source.append(
" unsigned int col_start = block_offsets[2*get_group_id(0)]; \n");
37 source.append(
" unsigned int col_stop = block_offsets[2*get_group_id(0)+1]; \n");
38 source.append(
" unsigned int row_start; \n");
39 source.append(
" unsigned int row_stop; \n");
40 source.append(
" "); source.append(numeric_string); source.append(
" result_entry = 0; \n");
42 source.append(
" if (col_start >= col_stop) \n");
43 source.append(
" return; \n");
46 source.append(
" for (unsigned int iter = 0; iter < col_stop - col_start; ++iter) \n");
47 source.append(
" { \n");
48 source.append(
" unsigned int col = (col_stop - iter) - 1; \n");
49 source.append(
" result_entry = result[col] / diagonal_U[col]; \n");
50 source.append(
" row_start = row_jumper_U[col]; \n");
51 source.append(
" row_stop = row_jumper_U[col + 1]; \n");
52 source.append(
" for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0)) \n");
53 source.append(
" result[column_indices_U[buffer_index]] -= result_entry * elements_U[buffer_index]; \n");
54 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
55 source.append(
" } \n");
58 source.append(
" for (unsigned int col = col_start + get_local_id(0); col < col_stop; col += get_local_size(0)) \n");
59 source.append(
" result[col] /= diagonal_U[col]; \n");
60 source.append(
"} \n");
63 template <
typename StringType>
66 source.append(
"__kernel void block_trans_unit_lu_forward( \n");
67 source.append(
" __global const unsigned int * row_jumper_L, \n");
68 source.append(
" __global const unsigned int * column_indices_L, \n");
69 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements_L, \n");
70 source.append(
" __global const unsigned int * block_offsets, \n");
71 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
72 source.append(
" unsigned int size) \n");
73 source.append(
"{ \n");
74 source.append(
" unsigned int col_start = block_offsets[2*get_group_id(0)]; \n");
75 source.append(
" unsigned int col_stop = block_offsets[2*get_group_id(0)+1]; \n");
76 source.append(
" unsigned int row_start = row_jumper_L[col_start]; \n");
77 source.append(
" unsigned int row_stop; \n");
78 source.append(
" "); source.append(numeric_string); source.append(
" result_entry = 0; \n");
80 source.append(
" if (col_start >= col_stop) \n");
81 source.append(
" return; \n");
84 source.append(
" for (unsigned int col = col_start; col < col_stop; ++col) \n");
85 source.append(
" { \n");
86 source.append(
" result_entry = result[col]; \n");
87 source.append(
" row_stop = row_jumper_L[col + 1]; \n");
88 source.append(
" for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0)) \n");
89 source.append(
" result[column_indices_L[buffer_index]] -= result_entry * elements_L[buffer_index]; \n");
90 source.append(
" row_start = row_stop; \n");
91 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
92 source.append(
" } \n");
94 source.append(
"}; \n");
100 template <
typename StringType>
102 bool B_transposed,
bool B_row_major,
bool C_row_major)
104 source.append(
"__kernel void ");
106 source.append(
"( \n");
107 source.append(
" __global const unsigned int * sp_mat_row_indices, \n");
108 source.append(
" __global const unsigned int * sp_mat_col_indices, \n");
109 source.append(
" __global const "); source.append(numeric_string); source.append(
" * sp_mat_elements, \n");
110 source.append(
" __global const "); source.append(numeric_string); source.append(
" * d_mat, \n");
111 source.append(
" unsigned int d_mat_row_start, \n");
112 source.append(
" unsigned int d_mat_col_start, \n");
113 source.append(
" unsigned int d_mat_row_inc, \n");
114 source.append(
" unsigned int d_mat_col_inc, \n");
115 source.append(
" unsigned int d_mat_row_size, \n");
116 source.append(
" unsigned int d_mat_col_size, \n");
117 source.append(
" unsigned int d_mat_internal_rows, \n");
118 source.append(
" unsigned int d_mat_internal_cols, \n");
119 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
120 source.append(
" unsigned int result_row_start, \n");
121 source.append(
" unsigned int result_col_start, \n");
122 source.append(
" unsigned int result_row_inc, \n");
123 source.append(
" unsigned int result_col_inc, \n");
124 source.append(
" unsigned int result_row_size, \n");
125 source.append(
" unsigned int result_col_size, \n");
126 source.append(
" unsigned int result_internal_rows, \n");
127 source.append(
" unsigned int result_internal_cols) { \n");
130 source.append(
" for (unsigned int row = get_group_id(0); row < result_row_size; row += get_num_groups(0)) { \n");
132 source.append(
" unsigned int row_start = sp_mat_row_indices[row]; \n");
133 source.append(
" unsigned int row_end = sp_mat_row_indices[row+1]; \n");
136 source.append(
" for ( unsigned int col = get_local_id(0); col < result_col_size; col += get_local_size(0) ) { \n");
138 source.append(
" "); source.append(numeric_string); source.append(
" r = 0; \n");
140 source.append(
" for (unsigned int k = row_start; k < row_end; k ++) { \n");
142 source.append(
" unsigned int j = sp_mat_col_indices[k]; \n");
143 source.append(
" "); source.append(numeric_string); source.append(
" x = sp_mat_elements[k]; \n");
145 source.append(
" "); source.append(numeric_string);
146 if (B_transposed && B_row_major)
147 source.append(
" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + j * d_mat_col_inc ]; \n");
148 else if (B_transposed && !B_row_major)
149 source.append(
" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc) + (d_mat_col_start + j * d_mat_col_inc) * d_mat_internal_rows ]; \n");
150 else if (!B_transposed && B_row_major)
151 source.append(
" y = d_mat[ (d_mat_row_start + j * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + col * d_mat_col_inc ]; \n");
153 source.append(
" y = d_mat[ (d_mat_row_start + j * d_mat_row_inc) + (d_mat_col_start + col * d_mat_col_inc) * d_mat_internal_rows ]; \n");
154 source.append(
" r += x * y; \n");
155 source.append(
" } \n");
158 source.append(
" result[ (result_row_start + row * result_row_inc) * result_internal_cols + result_col_start + col * result_col_inc ] = r; \n");
160 source.append(
" result[ (result_row_start + row * result_row_inc) + (result_col_start + col * result_col_inc) * result_internal_rows ] = r; \n");
161 source.append(
" } \n");
162 source.append(
" } \n");
164 source.append(
"} \n");
168 template <
typename StringType>
182 template <
typename StringType>
186 source.append(
" __kernel void jacobi( \n");
187 source.append(
" __global const unsigned int * row_indices, \n");
188 source.append(
" __global const unsigned int * column_indices, \n");
189 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
190 source.append(
" "); source.append(numeric_string); source.append(
" weight, \n");
191 source.append(
" __global const "); source.append(numeric_string); source.append(
" * old_result, \n");
192 source.append(
" __global "); source.append(numeric_string); source.append(
" * new_result, \n");
193 source.append(
" __global const "); source.append(numeric_string); source.append(
" * rhs, \n");
194 source.append(
" unsigned int size) \n");
195 source.append(
" { \n");
196 source.append(
" "); source.append(numeric_string); source.append(
" sum, diag=1; \n");
197 source.append(
" int col; \n");
198 source.append(
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
199 source.append(
" { \n");
200 source.append(
" sum = 0; \n");
201 source.append(
" for (unsigned int j = row_indices[i]; j<row_indices[i+1]; j++) \n");
202 source.append(
" { \n");
203 source.append(
" col = column_indices[j]; \n");
204 source.append(
" if (i == col) \n");
205 source.append(
" diag = elements[j]; \n");
206 source.append(
" else \n");
207 source.append(
" sum += elements[j] * old_result[col]; \n");
208 source.append(
" } \n");
209 source.append(
" new_result[i] = weight * (rhs[i]-sum) / diag + (1-weight) * old_result[i]; \n");
210 source.append(
" } \n");
211 source.append(
" } \n");
215 template <
typename StringType>
219 source.append(
"__kernel void lu_backward( \n");
220 source.append(
" __global const unsigned int * row_indices, \n");
221 source.append(
" __global const unsigned int * column_indices, \n");
222 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
223 source.append(
" __global "); source.append(numeric_string); source.append(
" * vector, \n");
224 source.append(
" unsigned int size) \n");
225 source.append(
"{ \n");
226 source.append(
" __local unsigned int col_index_buffer[128]; \n");
227 source.append(
" __local "); source.append(numeric_string); source.append(
" element_buffer[128]; \n");
228 source.append(
" __local "); source.append(numeric_string); source.append(
" vector_buffer[128]; \n");
230 source.append(
" unsigned int nnz = row_indices[size]; \n");
231 source.append(
" unsigned int current_row = size-1; \n");
232 source.append(
" unsigned int row_at_window_start = size-1; \n");
233 source.append(
" "); source.append(numeric_string); source.append(
" current_vector_entry = vector[size-1]; \n");
234 source.append(
" "); source.append(numeric_string); source.append(
" diagonal_entry = 0; \n");
235 source.append(
" unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0); \n");
236 source.append(
" unsigned int next_row = row_indices[size-1]; \n");
238 source.append(
" unsigned int i = loop_end + get_local_id(0); \n");
239 source.append(
" while (1) \n");
240 source.append(
" { \n");
242 source.append(
" if (i < nnz) \n");
243 source.append(
" { \n");
244 source.append(
" element_buffer[get_local_id(0)] = elements[i]; \n");
245 source.append(
" unsigned int tmp = column_indices[i]; \n");
246 source.append(
" col_index_buffer[get_local_id(0)] = tmp; \n");
247 source.append(
" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
248 source.append(
" } \n");
250 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
253 source.append(
" if (get_local_id(0) == 0) \n");
254 source.append(
" { \n");
256 source.append(
" for (unsigned int k2=0; k2<get_local_size(0); ++k2) \n");
257 source.append(
" { \n");
258 source.append(
" unsigned int k = (get_local_size(0) - k2) - 1; \n");
260 source.append(
" if (i+k >= nnz) \n");
261 source.append(
" continue; \n");
263 source.append(
" if (col_index_buffer[k] > row_at_window_start) \n");
264 source.append(
" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
265 source.append(
" else if (col_index_buffer[k] > current_row) \n");
266 source.append(
" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
267 source.append(
" else if (col_index_buffer[k] == current_row) \n");
268 source.append(
" diagonal_entry = element_buffer[k]; \n");
270 source.append(
" if (i+k == next_row) \n");
271 source.append(
" { \n");
272 source.append(
" vector[current_row] = current_vector_entry / diagonal_entry; \n");
273 source.append(
" if (current_row > 0) //load next row's data \n");
274 source.append(
" { \n");
275 source.append(
" --current_row; \n");
276 source.append(
" next_row = row_indices[current_row]; \n");
277 source.append(
" current_vector_entry = vector[current_row]; \n");
278 source.append(
" } \n");
279 source.append(
" } \n");
282 source.append(
" } \n");
284 source.append(
" row_at_window_start = current_row; \n");
285 source.append(
" } \n");
287 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
289 source.append(
" if (i < get_local_size(0)) \n");
290 source.append(
" break; \n");
292 source.append(
" i -= get_local_size(0); \n");
293 source.append(
" } \n");
294 source.append(
"} \n");
298 template <
typename StringType>
303 source.append(
"__kernel void lu_forward( \n");
304 source.append(
" __global const unsigned int * row_indices, \n");
305 source.append(
" __global const unsigned int * column_indices, \n");
306 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
307 source.append(
" __global "); source.append(numeric_string); source.append(
" * vector, \n");
308 source.append(
" unsigned int size) \n");
309 source.append(
"{ \n");
310 source.append(
" __local unsigned int col_index_buffer[128]; \n");
311 source.append(
" __local "); source.append(numeric_string); source.append(
" element_buffer[128]; \n");
312 source.append(
" __local "); source.append(numeric_string); source.append(
" vector_buffer[128]; \n");
314 source.append(
" unsigned int nnz = row_indices[size]; \n");
315 source.append(
" unsigned int current_row = 0; \n");
316 source.append(
" unsigned int row_at_window_start = 0; \n");
317 source.append(
" "); source.append(numeric_string); source.append(
" current_vector_entry = vector[0]; \n");
318 source.append(
" "); source.append(numeric_string); source.append(
" diagonal_entry; \n");
319 source.append(
" unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0); \n");
320 source.append(
" unsigned int next_row = row_indices[1]; \n");
322 source.append(
" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
323 source.append(
" { \n");
325 source.append(
" if (i < nnz) \n");
326 source.append(
" { \n");
327 source.append(
" element_buffer[get_local_id(0)] = elements[i]; \n");
328 source.append(
" unsigned int tmp = column_indices[i]; \n");
329 source.append(
" col_index_buffer[get_local_id(0)] = tmp; \n");
330 source.append(
" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
331 source.append(
" } \n");
333 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
336 source.append(
" if (get_local_id(0) == 0) \n");
337 source.append(
" { \n");
339 source.append(
" for (unsigned int k=0; k<get_local_size(0); ++k) \n");
340 source.append(
" { \n");
341 source.append(
" if (current_row < size && i+k == next_row) \n");
342 source.append(
" { \n");
343 source.append(
" vector[current_row] = current_vector_entry / diagonal_entry; \n");
344 source.append(
" ++current_row; \n");
345 source.append(
" if (current_row < size) \n");
346 source.append(
" { \n");
347 source.append(
" next_row = row_indices[current_row+1]; \n");
348 source.append(
" current_vector_entry = vector[current_row]; \n");
349 source.append(
" } \n");
350 source.append(
" } \n");
352 source.append(
" if (current_row < size && col_index_buffer[k] < current_row) \n");
353 source.append(
" { \n");
354 source.append(
" if (col_index_buffer[k] < row_at_window_start) \n");
355 source.append(
" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
356 source.append(
" else if (col_index_buffer[k] < current_row) \n");
357 source.append(
" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
358 source.append(
" } \n");
359 source.append(
" else if (col_index_buffer[k] == current_row) \n");
360 source.append(
" diagonal_entry = element_buffer[k]; \n");
362 source.append(
" } \n");
364 source.append(
" row_at_window_start = current_row; \n");
365 source.append(
" } \n");
367 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
368 source.append(
" } \n");
369 source.append(
"} \n");
373 template <
typename StringType>
376 source.append(
"__kernel void row_info_extractor( \n");
377 source.append(
" __global const unsigned int * row_indices, \n");
378 source.append(
" __global const unsigned int * column_indices, \n");
379 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
380 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
381 source.append(
" unsigned int size, \n");
382 source.append(
" unsigned int option \n");
383 source.append(
" ) \n");
384 source.append(
"{ \n");
385 source.append(
" for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0)) \n");
386 source.append(
" { \n");
387 source.append(
" "); source.append(numeric_string); source.append(
" value = 0; \n");
388 source.append(
" unsigned int row_end = row_indices[row+1]; \n");
390 source.append(
" switch (option) \n");
391 source.append(
" { \n");
392 source.append(
" case 0: \n");
393 source.append(
" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
394 source.append(
" value = max(value, fabs(elements[i])); \n");
395 source.append(
" break; \n");
397 source.append(
" case 1: \n");
398 source.append(
" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
399 source.append(
" value += fabs(elements[i]); \n");
400 source.append(
" break; \n");
402 source.append(
" case 2: \n");
403 source.append(
" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
404 source.append(
" value += elements[i] * elements[i]; \n");
405 source.append(
" value = sqrt(value); \n");
406 source.append(
" break; \n");
408 source.append(
" case 3: \n");
409 source.append(
" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
410 source.append(
" { \n");
411 source.append(
" if (column_indices[i] == row) \n");
412 source.append(
" { \n");
413 source.append(
" value = elements[i]; \n");
414 source.append(
" break; \n");
415 source.append(
" } \n");
416 source.append(
" } \n");
417 source.append(
" break; \n");
419 source.append(
" default: \n");
420 source.append(
" break; \n");
421 source.append(
" } \n");
422 source.append(
" result[row] = value; \n");
423 source.append(
" } \n");
424 source.append(
"} \n");
428 template <
typename StringType>
433 source.append(
"__kernel void trans_lu_backward( \n");
434 source.append(
" __global const unsigned int * row_indices, \n");
435 source.append(
" __global const unsigned int * column_indices, \n");
436 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
437 source.append(
" __global const "); source.append(numeric_string); source.append(
" * diagonal_entries, \n");
438 source.append(
" __global "); source.append(numeric_string); source.append(
" * vector, \n");
439 source.append(
" unsigned int size) \n");
440 source.append(
"{ \n");
441 source.append(
" __local unsigned int row_index_lookahead[256]; \n");
442 source.append(
" __local unsigned int row_index_buffer[256]; \n");
444 source.append(
" unsigned int row_index; \n");
445 source.append(
" unsigned int col_index; \n");
446 source.append(
" "); source.append(numeric_string); source.append(
" matrix_entry; \n");
447 source.append(
" unsigned int nnz = row_indices[size]; \n");
448 source.append(
" unsigned int row_at_window_start = size; \n");
449 source.append(
" unsigned int row_at_window_end; \n");
450 source.append(
" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");
452 source.append(
" for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0)) \n");
453 source.append(
" { \n");
454 source.append(
" unsigned int i = (nnz - i2) - 1; \n");
455 source.append(
" col_index = (i2 < nnz) ? column_indices[i] : 0; \n");
456 source.append(
" matrix_entry = (i2 < nnz) ? elements[i] : 0; \n");
457 source.append(
" row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0; \n");
459 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
461 source.append(
" if (i2 < nnz) \n");
462 source.append(
" { \n");
463 source.append(
" unsigned int row_index_dec = 0; \n");
464 source.append(
" while (row_index_lookahead[row_index_dec] > i) \n");
465 source.append(
" ++row_index_dec; \n");
466 source.append(
" row_index = row_at_window_start - row_index_dec; \n");
467 source.append(
" row_index_buffer[get_local_id(0)] = row_index; \n");
468 source.append(
" } \n");
469 source.append(
" else \n");
470 source.append(
" { \n");
471 source.append(
" row_index = size+1; \n");
472 source.append(
" row_index_buffer[get_local_id(0)] = 0; \n");
473 source.append(
" } \n");
475 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
477 source.append(
" row_at_window_start = row_index_buffer[0]; \n");
478 source.append(
" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");
481 source.append(
" for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n");
482 source.append(
" { \n");
483 source.append(
" unsigned int row = row_at_window_start - row2; \n");
484 source.append(
" "); source.append(numeric_string); source.append(
" result_entry = vector[row] / diagonal_entries[row]; \n");
486 source.append(
" if ( (row_index == row) && (col_index < row) ) \n");
487 source.append(
" vector[col_index] -= result_entry * matrix_entry; \n");
489 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
490 source.append(
" } \n");
492 source.append(
" row_at_window_start = row_at_window_end; \n");
493 source.append(
" } \n");
496 source.append(
" for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0)) \n");
497 source.append(
" vector[i] /= diagonal_entries[i]; \n");
498 source.append(
"} \n");
502 template <
typename StringType>
507 source.append(
"__kernel void trans_lu_forward( \n");
508 source.append(
" __global const unsigned int * row_indices, \n");
509 source.append(
" __global const unsigned int * column_indices, \n");
510 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
511 source.append(
" __global const "); source.append(numeric_string); source.append(
" * diagonal_entries, \n");
512 source.append(
" __global "); source.append(numeric_string); source.append(
" * vector, \n");
513 source.append(
" unsigned int size) \n");
514 source.append(
"{ \n");
515 source.append(
" __local unsigned int row_index_lookahead[256]; \n");
516 source.append(
" __local unsigned int row_index_buffer[256]; \n");
518 source.append(
" unsigned int row_index; \n");
519 source.append(
" unsigned int col_index; \n");
520 source.append(
" "); source.append(numeric_string); source.append(
" matrix_entry; \n");
521 source.append(
" unsigned int nnz = row_indices[size]; \n");
522 source.append(
" unsigned int row_at_window_start = 0; \n");
523 source.append(
" unsigned int row_at_window_end = 0; \n");
524 source.append(
" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");
526 source.append(
" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
527 source.append(
" { \n");
528 source.append(
" col_index = (i < nnz) ? column_indices[i] : 0; \n");
529 source.append(
" matrix_entry = (i < nnz) ? elements[i] : 0; \n");
530 source.append(
" row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : size - 1; \n");
532 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
534 source.append(
" if (i < nnz) \n");
535 source.append(
" { \n");
536 source.append(
" unsigned int row_index_inc = 0; \n");
537 source.append(
" while (i >= row_index_lookahead[row_index_inc + 1]) \n");
538 source.append(
" ++row_index_inc; \n");
539 source.append(
" row_index = row_at_window_start + row_index_inc; \n");
540 source.append(
" row_index_buffer[get_local_id(0)] = row_index; \n");
541 source.append(
" } \n");
542 source.append(
" else \n");
543 source.append(
" { \n");
544 source.append(
" row_index = size+1; \n");
545 source.append(
" row_index_buffer[get_local_id(0)] = size - 1; \n");
546 source.append(
" } \n");
548 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
550 source.append(
" row_at_window_start = row_index_buffer[0]; \n");
551 source.append(
" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");
554 source.append(
" for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n");
555 source.append(
" { \n");
556 source.append(
" "); source.append(numeric_string); source.append(
" result_entry = vector[row] / diagonal_entries[row]; \n");
558 source.append(
" if ( (row_index == row) && (col_index > row) ) \n");
559 source.append(
" vector[col_index] -= result_entry * matrix_entry; \n");
561 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
562 source.append(
" } \n");
564 source.append(
" row_at_window_start = row_at_window_end; \n");
565 source.append(
" } \n");
568 source.append(
" for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0)) \n");
569 source.append(
" vector[i] /= diagonal_entries[i]; \n");
570 source.append(
"} \n");
574 template <
typename StringType>
579 source.append(
"__kernel void trans_unit_lu_backward( \n");
580 source.append(
" __global const unsigned int * row_indices, \n");
581 source.append(
" __global const unsigned int * column_indices, \n");
582 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
583 source.append(
" __global "); source.append(numeric_string); source.append(
" * vector, \n");
584 source.append(
" unsigned int size) \n");
585 source.append(
"{ \n");
586 source.append(
" __local unsigned int row_index_lookahead[256]; \n");
587 source.append(
" __local unsigned int row_index_buffer[256]; \n");
589 source.append(
" unsigned int row_index; \n");
590 source.append(
" unsigned int col_index; \n");
591 source.append(
" "); source.append(numeric_string); source.append(
" matrix_entry; \n");
592 source.append(
" unsigned int nnz = row_indices[size]; \n");
593 source.append(
" unsigned int row_at_window_start = size; \n");
594 source.append(
" unsigned int row_at_window_end; \n");
595 source.append(
" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");
597 source.append(
" for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0)) \n");
598 source.append(
" { \n");
599 source.append(
" unsigned int i = (nnz - i2) - 1; \n");
600 source.append(
" col_index = (i2 < nnz) ? column_indices[i] : 0; \n");
601 source.append(
" matrix_entry = (i2 < nnz) ? elements[i] : 0; \n");
602 source.append(
" row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0; \n");
604 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
606 source.append(
" if (i2 < nnz) \n");
607 source.append(
" { \n");
608 source.append(
" unsigned int row_index_dec = 0; \n");
609 source.append(
" while (row_index_lookahead[row_index_dec] > i) \n");
610 source.append(
" ++row_index_dec; \n");
611 source.append(
" row_index = row_at_window_start - row_index_dec; \n");
612 source.append(
" row_index_buffer[get_local_id(0)] = row_index; \n");
613 source.append(
" } \n");
614 source.append(
" else \n");
615 source.append(
" { \n");
616 source.append(
" row_index = size+1; \n");
617 source.append(
" row_index_buffer[get_local_id(0)] = 0; \n");
618 source.append(
" } \n");
620 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
622 source.append(
" row_at_window_start = row_index_buffer[0]; \n");
623 source.append(
" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");
626 source.append(
" for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n");
627 source.append(
" { \n");
628 source.append(
" unsigned int row = row_at_window_start - row2; \n");
629 source.append(
" "); source.append(numeric_string); source.append(
" result_entry = vector[row]; \n");
631 source.append(
" if ( (row_index == row) && (col_index < row) ) \n");
632 source.append(
" vector[col_index] -= result_entry * matrix_entry; \n");
634 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
635 source.append(
" } \n");
637 source.append(
" row_at_window_start = row_at_window_end; \n");
638 source.append(
" } \n");
639 source.append(
"} \n");
644 template <
typename StringType>
649 source.append(
"__kernel void trans_unit_lu_forward( \n");
650 source.append(
" __global const unsigned int * row_indices, \n");
651 source.append(
" __global const unsigned int * column_indices, \n");
652 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
653 source.append(
" __global "); source.append(numeric_string); source.append(
" * vector, \n");
654 source.append(
" unsigned int size) \n");
655 source.append(
"{ \n");
656 source.append(
" __local unsigned int row_index_lookahead[256]; \n");
657 source.append(
" __local unsigned int row_index_buffer[256]; \n");
659 source.append(
" unsigned int row_index; \n");
660 source.append(
" unsigned int col_index; \n");
661 source.append(
" "); source.append(numeric_string); source.append(
" matrix_entry; \n");
662 source.append(
" unsigned int nnz = row_indices[size]; \n");
663 source.append(
" unsigned int row_at_window_start = 0; \n");
664 source.append(
" unsigned int row_at_window_end = 0; \n");
665 source.append(
" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");
667 source.append(
" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
668 source.append(
" { \n");
669 source.append(
" col_index = (i < nnz) ? column_indices[i] : 0; \n");
670 source.append(
" matrix_entry = (i < nnz) ? elements[i] : 0; \n");
671 source.append(
" row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : size - 1; \n");
673 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
675 source.append(
" if (i < nnz) \n");
676 source.append(
" { \n");
677 source.append(
" unsigned int row_index_inc = 0; \n");
678 source.append(
" while (i >= row_index_lookahead[row_index_inc + 1]) \n");
679 source.append(
" ++row_index_inc; \n");
680 source.append(
" row_index = row_at_window_start + row_index_inc; \n");
681 source.append(
" row_index_buffer[get_local_id(0)] = row_index; \n");
682 source.append(
" } \n");
683 source.append(
" else \n");
684 source.append(
" { \n");
685 source.append(
" row_index = size+1; \n");
686 source.append(
" row_index_buffer[get_local_id(0)] = size - 1; \n");
687 source.append(
" } \n");
689 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
691 source.append(
" row_at_window_start = row_index_buffer[0]; \n");
692 source.append(
" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");
695 source.append(
" for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n");
696 source.append(
" { \n");
697 source.append(
" "); source.append(numeric_string); source.append(
" result_entry = vector[row]; \n");
699 source.append(
" if ( (row_index == row) && (col_index > row) ) \n");
700 source.append(
" vector[col_index] -= result_entry * matrix_entry; \n");
702 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
703 source.append(
" } \n");
705 source.append(
" row_at_window_start = row_at_window_end; \n");
706 source.append(
" } \n");
707 source.append(
"} \n");
711 template <
typename StringType>
716 source.append(
"__kernel void trans_unit_lu_forward_slow( \n");
717 source.append(
" __global const unsigned int * row_indices, \n");
718 source.append(
" __global const unsigned int * column_indices, \n");
719 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
720 source.append(
" __global "); source.append(numeric_string); source.append(
" * vector, \n");
721 source.append(
" unsigned int size) \n");
722 source.append(
"{ \n");
723 source.append(
" for (unsigned int row = 0; row < size; ++row) \n");
724 source.append(
" { \n");
725 source.append(
" "); source.append(numeric_string); source.append(
" result_entry = vector[row]; \n");
727 source.append(
" unsigned int row_start = row_indices[row]; \n");
728 source.append(
" unsigned int row_stop = row_indices[row + 1]; \n");
729 source.append(
" for (unsigned int entry_index = row_start + get_local_id(0); entry_index < row_stop; entry_index += get_local_size(0)) \n");
730 source.append(
" { \n");
731 source.append(
" unsigned int col_index = column_indices[entry_index]; \n");
732 source.append(
" if (col_index > row) \n");
733 source.append(
" vector[col_index] -= result_entry * elements[entry_index]; \n");
734 source.append(
" } \n");
736 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
737 source.append(
" } \n");
738 source.append(
"} \n");
742 template <
typename StringType>
747 source.append(
"__kernel void unit_lu_backward( \n");
748 source.append(
" __global const unsigned int * row_indices, \n");
749 source.append(
" __global const unsigned int * column_indices, \n");
750 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
751 source.append(
" __global "); source.append(numeric_string); source.append(
" * vector, \n");
752 source.append(
" unsigned int size) \n");
753 source.append(
"{ \n");
754 source.append(
" __local unsigned int col_index_buffer[128]; \n");
755 source.append(
" __local "); source.append(numeric_string); source.append(
" element_buffer[128]; \n");
756 source.append(
" __local "); source.append(numeric_string); source.append(
" vector_buffer[128]; \n");
758 source.append(
" unsigned int nnz = row_indices[size]; \n");
759 source.append(
" unsigned int current_row = size-1; \n");
760 source.append(
" unsigned int row_at_window_start = size-1; \n");
761 source.append(
" "); source.append(numeric_string); source.append(
" current_vector_entry = vector[size-1]; \n");
762 source.append(
" unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0); \n");
763 source.append(
" unsigned int next_row = row_indices[size-1]; \n");
765 source.append(
" unsigned int i = loop_end + get_local_id(0); \n");
766 source.append(
" while (1) \n");
767 source.append(
" { \n");
769 source.append(
" if (i < nnz) \n");
770 source.append(
" { \n");
771 source.append(
" element_buffer[get_local_id(0)] = elements[i]; \n");
772 source.append(
" unsigned int tmp = column_indices[i]; \n");
773 source.append(
" col_index_buffer[get_local_id(0)] = tmp; \n");
774 source.append(
" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
775 source.append(
" } \n");
777 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
780 source.append(
" if (get_local_id(0) == 0) \n");
781 source.append(
" { \n");
783 source.append(
" for (unsigned int k2=0; k2<get_local_size(0); ++k2) \n");
784 source.append(
" { \n");
785 source.append(
" unsigned int k = (get_local_size(0) - k2) - 1; \n");
787 source.append(
" if (i+k >= nnz) \n");
788 source.append(
" continue; \n");
790 source.append(
" if (col_index_buffer[k] > row_at_window_start) \n");
791 source.append(
" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
792 source.append(
" else if (col_index_buffer[k] > current_row) \n");
793 source.append(
" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
795 source.append(
" if (i+k == next_row) \n");
796 source.append(
" { \n");
797 source.append(
" vector[current_row] = current_vector_entry; \n");
798 source.append(
" if (current_row > 0) \n");
799 source.append(
" { \n");
800 source.append(
" --current_row; \n");
801 source.append(
" next_row = row_indices[current_row]; \n");
802 source.append(
" current_vector_entry = vector[current_row]; \n");
803 source.append(
" } \n");
804 source.append(
" } \n");
807 source.append(
" } \n");
809 source.append(
" row_at_window_start = current_row; \n");
810 source.append(
" } \n");
812 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
814 source.append(
" if (i < get_local_size(0)) \n");
815 source.append(
" break; \n");
817 source.append(
" i -= get_local_size(0); \n");
818 source.append(
" } \n");
819 source.append(
"} \n");
823 template <
typename StringType>
828 source.append(
"__kernel void unit_lu_forward( \n");
829 source.append(
" __global const unsigned int * row_indices, \n");
830 source.append(
" __global const unsigned int * column_indices, \n");
831 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
832 source.append(
" __global "); source.append(numeric_string); source.append(
" * vector, \n");
833 source.append(
" unsigned int size) \n");
834 source.append(
"{ \n");
835 source.append(
" __local unsigned int col_index_buffer[128]; \n");
836 source.append(
" __local "); source.append(numeric_string); source.append(
" element_buffer[128]; \n");
837 source.append(
" __local "); source.append(numeric_string); source.append(
" vector_buffer[128]; \n");
839 source.append(
" unsigned int nnz = row_indices[size]; \n");
840 source.append(
" unsigned int current_row = 0; \n");
841 source.append(
" unsigned int row_at_window_start = 0; \n");
842 source.append(
" "); source.append(numeric_string); source.append(
" current_vector_entry = vector[0]; \n");
843 source.append(
" unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0); \n");
844 source.append(
" unsigned int next_row = row_indices[1]; \n");
846 source.append(
" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
847 source.append(
" { \n");
849 source.append(
" if (i < nnz) \n");
850 source.append(
" { \n");
851 source.append(
" element_buffer[get_local_id(0)] = elements[i]; \n");
852 source.append(
" unsigned int tmp = column_indices[i]; \n");
853 source.append(
" col_index_buffer[get_local_id(0)] = tmp; \n");
854 source.append(
" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
855 source.append(
" } \n");
857 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
860 source.append(
" if (get_local_id(0) == 0) \n");
861 source.append(
" { \n");
863 source.append(
" for (unsigned int k=0; k<get_local_size(0); ++k) \n");
864 source.append(
" { \n");
865 source.append(
" if (i+k == next_row) \n");
866 source.append(
" { \n");
867 source.append(
" vector[current_row] = current_vector_entry; \n");
868 source.append(
" ++current_row; \n");
869 source.append(
" if (current_row < size) //load next row's data \n");
870 source.append(
" { \n");
871 source.append(
" next_row = row_indices[current_row+1]; \n");
872 source.append(
" current_vector_entry = vector[current_row]; \n");
873 source.append(
" } \n");
874 source.append(
" } \n");
876 source.append(
" if (current_row < size && col_index_buffer[k] < current_row) \n");
877 source.append(
" { \n");
878 source.append(
" if (col_index_buffer[k] < row_at_window_start) \n");
879 source.append(
" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
880 source.append(
" else if (col_index_buffer[k] < current_row) \n");
881 source.append(
" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
882 source.append(
" } \n");
884 source.append(
" } \n");
886 source.append(
" row_at_window_start = current_row; \n");
887 source.append(
" } \n");
889 source.append(
" barrier(CLK_GLOBAL_MEM_FENCE); \n");
890 source.append(
" } //for i \n");
891 source.append(
"} \n");
895 template <
typename StringType>
899 source.append(
"__kernel void vec_mul( \n");
900 source.append(
" __global const unsigned int * row_indices, \n");
901 source.append(
" __global const unsigned int * column_indices, \n");
902 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
903 source.append(
" __global const "); source.append(numeric_string); source.append(
" * x, \n");
904 source.append(
" uint4 layout_x, \n");
905 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
906 source.append(
" uint4 layout_result) \n");
907 source.append(
"{ \n");
908 source.append(
" for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
909 source.append(
" { \n");
910 source.append(
" "); source.append(numeric_string); source.append(
" dot_prod = 0; \n");
911 source.append(
" unsigned int row_end = row_indices[row+1]; \n");
912 source.append(
" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
913 source.append(
" dot_prod += elements[i] * x[column_indices[i] * layout_x.y + layout_x.x]; \n");
914 source.append(
" result[row * layout_result.y + layout_result.x] = dot_prod; \n");
915 source.append(
" } \n");
916 source.append(
"} \n");
920 template <
typename StringType>
923 source.append(
"__kernel void vec_mul4( \n");
924 source.append(
" __global const unsigned int * row_indices, \n");
925 source.append(
" __global const uint4 * column_indices, \n");
926 source.append(
" __global const "); source.append(numeric_string); source.append(
"4 * elements, \n");
927 source.append(
" __global const "); source.append(numeric_string); source.append(
" * x, \n");
928 source.append(
" uint4 layout_x, \n");
929 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
930 source.append(
" uint4 layout_result) \n");
931 source.append(
"{ \n");
932 source.append(
" "); source.append(numeric_string); source.append(
" dot_prod; \n");
933 source.append(
" unsigned int start, next_stop; \n");
934 source.append(
" uint4 col_idx; \n");
935 source.append(
" "); source.append(numeric_string); source.append(
"4 tmp_vec; \n");
936 source.append(
" "); source.append(numeric_string); source.append(
"4 tmp_entries; \n");
938 source.append(
" for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
939 source.append(
" { \n");
940 source.append(
" dot_prod = 0; \n");
941 source.append(
" start = row_indices[row] / 4; \n");
942 source.append(
" next_stop = row_indices[row+1] / 4; \n");
944 source.append(
" for (unsigned int i = start; i < next_stop; ++i) \n");
945 source.append(
" { \n");
946 source.append(
" col_idx = column_indices[i]; \n");
948 source.append(
" tmp_entries = elements[i]; \n");
949 source.append(
" tmp_vec.x = x[col_idx.x * layout_x.y + layout_x.x]; \n");
950 source.append(
" tmp_vec.y = x[col_idx.y * layout_x.y + layout_x.x]; \n");
951 source.append(
" tmp_vec.z = x[col_idx.z * layout_x.y + layout_x.x]; \n");
952 source.append(
" tmp_vec.w = x[col_idx.w * layout_x.y + layout_x.x]; \n");
954 source.append(
" dot_prod += dot(tmp_entries, tmp_vec); \n");
955 source.append(
" } \n");
956 source.append(
" result[row * layout_result.y + layout_result.x] = dot_prod; \n");
957 source.append(
" } \n");
958 source.append(
"} \n");
961 template <
typename StringType>
964 source.append(
"__kernel void vec_mul8( \n");
965 source.append(
" __global const unsigned int * row_indices, \n");
966 source.append(
" __global const uint8 * column_indices, \n");
967 source.append(
" __global const "); source.append(numeric_string); source.append(
"8 * elements, \n");
968 source.append(
" __global const "); source.append(numeric_string); source.append(
" * x, \n");
969 source.append(
" uint4 layout_x, \n");
970 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
971 source.append(
" uint4 layout_result) \n");
972 source.append(
"{ \n");
973 source.append(
" "); source.append(numeric_string); source.append(
" dot_prod; \n");
974 source.append(
" unsigned int start, next_stop; \n");
975 source.append(
" uint8 col_idx; \n");
976 source.append(
" "); source.append(numeric_string); source.append(
"8 tmp_vec; \n");
977 source.append(
" "); source.append(numeric_string); source.append(
"8 tmp_entries; \n");
979 source.append(
" for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
980 source.append(
" { \n");
981 source.append(
" dot_prod = 0; \n");
982 source.append(
" start = row_indices[row] / 8; \n");
983 source.append(
" next_stop = row_indices[row+1] / 8; \n");
985 source.append(
" for (unsigned int i = start; i < next_stop; ++i) \n");
986 source.append(
" { \n");
987 source.append(
" col_idx = column_indices[i]; \n");
989 source.append(
" tmp_entries = elements[i]; \n");
990 source.append(
" tmp_vec.s0 = x[col_idx.s0 * layout_x.y + layout_x.x]; \n");
991 source.append(
" tmp_vec.s1 = x[col_idx.s1 * layout_x.y + layout_x.x]; \n");
992 source.append(
" tmp_vec.s2 = x[col_idx.s2 * layout_x.y + layout_x.x]; \n");
993 source.append(
" tmp_vec.s3 = x[col_idx.s3 * layout_x.y + layout_x.x]; \n");
994 source.append(
" tmp_vec.s4 = x[col_idx.s4 * layout_x.y + layout_x.x]; \n");
995 source.append(
" tmp_vec.s5 = x[col_idx.s5 * layout_x.y + layout_x.x]; \n");
996 source.append(
" tmp_vec.s6 = x[col_idx.s6 * layout_x.y + layout_x.x]; \n");
997 source.append(
" tmp_vec.s7 = x[col_idx.s7 * layout_x.y + layout_x.x]; \n");
999 source.append(
" dot_prod += dot(tmp_entries.lo, tmp_vec.lo); \n");
1000 source.append(
" dot_prod += dot(tmp_entries.hi, tmp_vec.hi); \n");
1001 source.append(
" } \n");
1002 source.append(
" result[row * layout_result.y + layout_result.x] = dot_prod; \n");
1003 source.append(
" } \n");
1004 source.append(
"} \n");
1007 template <
typename StringType>
1010 source.append(
"__kernel void vec_mul_cpu( \n");
1011 source.append(
" __global const unsigned int * row_indices, \n");
1012 source.append(
" __global const unsigned int * column_indices, \n");
1013 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
1014 source.append(
" __global const "); source.append(numeric_string); source.append(
" * vector, \n");
1015 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
1016 source.append(
" unsigned int size) \n");
1017 source.append(
"{ \n");
1018 source.append(
" unsigned int work_per_item = max((uint) (size / get_global_size(0)), (uint) 1); \n");
1019 source.append(
" unsigned int row_start = get_global_id(0) * work_per_item; \n");
1020 source.append(
" unsigned int row_stop = min( (uint) ((get_global_id(0) + 1) * work_per_item), (uint) size); \n");
1021 source.append(
" for (unsigned int row = row_start; row < row_stop; ++row) \n");
1022 source.append(
" { \n");
1023 source.append(
" "); source.append(numeric_string); source.append(
" dot_prod = ("); source.append(numeric_string); source.append(
")0; \n");
1024 source.append(
" unsigned int row_end = row_indices[row+1]; \n");
1025 source.append(
" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
1026 source.append(
" dot_prod += elements[i] * vector[column_indices[i]]; \n");
1027 source.append(
" result[row] = dot_prod; \n");
1028 source.append(
" } \n");
1029 source.append(
"} \n");
1038 template <
typename NumericT>
1051 static std::map<cl_context, bool> init_done;
1055 source.reserve(1024);
1057 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
1059 if (numeric_string ==
"float" || numeric_string ==
"double")
1082 #ifdef VIENNACL_BUILD_INFO
1083 std::cout <<
"Creating program " << prog_name << std::endl;
1085 ctx.add_program(source, prog_name);
1086 init_done[ctx.handle().get()] =
true;
void generate_compressed_matrix_lu_backward(StringType &source, std::string const &numeric_string)
Definition: compressed_matrix.hpp:216
void generate_compressed_matrix_dense_matrix_mult(StringType &source, std::string const &numeric_string, bool B_transposed, bool B_row_major, bool C_row_major)
Generate kernel for C = A * B with A being a compressed_matrix, B and C dense.
Definition: compressed_matrix.hpp:101
static std::string program_name()
Definition: compressed_matrix.hpp:1041
void generate_compressed_matrix_trans_lu_backward(StringType &source, std::string const &numeric_string)
Definition: compressed_matrix.hpp:429
void generate_compressed_matrix_block_trans_lu_backward(StringType &source, std::string const &numeric_string)
Definition: compressed_matrix.hpp:25
void generate_compressed_matrix_vec_mul_cpu(StringType &source, std::string const &numeric_string)
Definition: compressed_matrix.hpp:1008
Common implementations shared by OpenCL-based operations.
void generate_compressed_matrix_trans_unit_lu_forward(StringType &source, std::string const &numeric_string)
Definition: compressed_matrix.hpp:645
std::string sparse_dense_matmult_kernel_name(bool B_transposed, bool B_row_major, bool C_row_major)
Returns the OpenCL kernel string for the operation C = A * B with A sparse, B, C dense matrices...
Definition: common.hpp:46
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Definition: context.hpp:51
Provides OpenCL-related utilities.
void generate_compressed_matrix_trans_lu_forward(StringType &source, std::string const &numeric_string)
Definition: compressed_matrix.hpp:503
void generate_compressed_matrix_unit_lu_backward(StringType &source, std::string const &numeric_string)
Definition: compressed_matrix.hpp:743
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
Definition: context.hpp:476
void generate_compressed_matrix_dense_matrix_multiplication(StringType &source, std::string const &numeric_string)
Definition: compressed_matrix.hpp:169
const OCL_TYPE & get() const
Definition: handle.hpp:189
Main kernel class for generating OpenCL kernels for compressed_matrix.
Definition: compressed_matrix.hpp:1039
void generate_compressed_matrix_unit_lu_forward(StringType &source, std::string const &numeric_string)
Definition: compressed_matrix.hpp:824
static void init(viennacl::ocl::context &ctx)
Definition: compressed_matrix.hpp:1046
void generate_compressed_matrix_row_info_extractor(StringType &source, std::string const &numeric_string)
Definition: compressed_matrix.hpp:374
static void apply(viennacl::ocl::context const &)
Definition: utils.hpp:40
void generate_compressed_matrix_jacobi(StringType &source, std::string const &numeric_string)
Definition: compressed_matrix.hpp:183
void generate_compressed_matrix_vec_mul4(StringType &source, std::string const &numeric_string)
Definition: compressed_matrix.hpp:921
Representation of an OpenCL kernel in ViennaCL.
void generate_compressed_matrix_trans_unit_lu_forward_slow(StringType &source, std::string const &numeric_string)
Definition: compressed_matrix.hpp:712
void generate_compressed_matrix_vec_mul(StringType &source, std::string const &numeric_string)
Definition: compressed_matrix.hpp:896
void generate_compressed_matrix_vec_mul8(StringType &source, std::string const &numeric_string)
Definition: compressed_matrix.hpp:962
Helper class for converting a type to its string representation.
Definition: utils.hpp:57
void generate_compressed_matrix_trans_unit_lu_backward(StringType &source, std::string const &numeric_string)
Definition: compressed_matrix.hpp:575
void generate_compressed_matrix_block_trans_unit_lu_forward(StringType &source, std::string const &numeric_string)
Definition: compressed_matrix.hpp:64
void generate_compressed_matrix_lu_forward(StringType &source, std::string const &numeric_string)
Definition: compressed_matrix.hpp:299