ViennaCL - The Vienna Computing Library  1.5.2
matrix_prod.hpp
1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_PROD_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_PROD_HPP
3 
7 #include "viennacl/ocl/utils.hpp"
8 
10 
13 namespace viennacl
14 {
15  namespace linalg
16  {
17  namespace opencl
18  {
19  namespace kernels
20  {
21 
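 // Generates the OpenCL source of a tiled matrix-matrix product kernel
 // C = alpha * op(A) * op(B) + beta * C for one combination of memory layouts
 // (row-/column-major) and transposition flags. The kernel stages 16x16 tiles of
 // op(A) and op(B) in __local memory and is intended for 16x16 work-groups.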
22  template <typename StringType>
23  void generate_matrix_prod_blas3(StringType & source, std::string const & numeric_string,
24  bool row_major_A, bool row_major_B, bool row_major_C,
25  bool transpose_A, bool transpose_B)
26  {
27  //start OpenCL code:
28  source.append("__kernel void prod_");
29  if (transpose_A)
30  source.append("T");
31  else
32  source.append("A");
33  if (transpose_B)
34  source.append("T");
35  else
36  source.append("A");
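  // The generated kernel is thus named prod_AA, prod_AT, prod_TA or prod_TT,
  // where 'T' marks a transposed operand and 'A' an untransposed one.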
37 
38  source.append("( \n");
39  source.append(" "); source.append(numeric_string); source.append(" alpha, \n");
40  source.append(" __global const "); source.append(numeric_string); source.append(" * A, \n");
41  source.append(" unsigned int A_row_start, \n");
42  source.append(" unsigned int A_col_start, \n");
43  source.append(" unsigned int A_row_inc, \n");
44  source.append(" unsigned int A_col_inc, \n");
45  source.append(" unsigned int A_row_size, \n"); //number of elements starting from row_start!
46  source.append(" unsigned int A_col_size, \n");
47  source.append(" unsigned int A_internal_rows, \n");
48  source.append(" unsigned int A_internal_cols, \n");
49 
50  source.append(" __global const "); source.append(numeric_string); source.append(" * B, \n");
51  source.append(" unsigned int B_row_start, \n");
52  source.append(" unsigned int B_col_start, \n");
53  source.append(" unsigned int B_row_inc, \n");
54  source.append(" unsigned int B_col_inc, \n");
55  source.append(" unsigned int B_row_size, \n");
56  source.append(" unsigned int B_col_size, \n");
57  source.append(" unsigned int B_internal_rows, \n");
58  source.append(" unsigned int B_internal_cols, \n");
59 
60  source.append(" "); source.append(numeric_string); source.append(" beta, \n");
61  source.append(" __global "); source.append(numeric_string); source.append(" * C, \n");
62  source.append(" unsigned int C_row_start, \n");
63  source.append(" unsigned int C_col_start, \n");
64  source.append(" unsigned int C_row_inc, \n");
65  source.append(" unsigned int C_col_inc, \n");
66  source.append(" unsigned int C_row_size, \n");
67  source.append(" unsigned int C_col_size, \n");
68  source.append(" unsigned int C_internal_rows, \n");
69  source.append(" unsigned int C_internal_cols) \n");
70  source.append("{ \n");
71 
72  source.append(" __local "); source.append(numeric_string); source.append(" bufA[272]; \n"); // 16 * 17
73  source.append(" __local "); source.append(numeric_string); source.append(" bufB[272]; \n"); // 16 * 17
74 
75  source.append(" size_t block_size = 16; \n"); //get_local_size(0);
76 
77  source.append(" size_t row_block_id = get_group_id(0); \n");
78  source.append(" size_t col_block_id = get_group_id(1); \n");
79  source.append(" size_t row_thread_id = get_local_id(0); \n");
80  source.append(" size_t col_thread_id = get_local_id(1); \n");
81 
82  //traverse block row of A (taking mem layout and transpose operation into account)
83  if (row_major_A && transpose_A)
84  {
85  source.append(" size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols; \n");
86  source.append(" size_t aStep = block_size * A_row_inc * A_internal_cols; \n");
87  }
88  else if (row_major_A && !transpose_A)
89  {
90  source.append(" size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start; \n");
91  source.append(" size_t aStep = block_size * A_col_inc; \n");
92  }
93  else if (!row_major_A && transpose_A)
94  {
95  source.append(" size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start; \n");
96  source.append(" size_t aStep = block_size * A_row_inc; \n");
97  }
98  else if (!row_major_A && !transpose_A)
99  {
100  source.append(" size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows; \n");
101  source.append(" size_t aStep = block_size * A_col_inc * A_internal_rows; \n");
102  }
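  // In all four cases, aBegin addresses the first element of this work-group's block row
  // of op(A), and aStep advances that address by one 16-wide block along the k-dimension;
  // the branches only differ in how a (row, column) position maps to a linear offset.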
103 
104 
105  if (row_major_B && transpose_B)
106  {
107  source.append(" size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start; \n");
108  source.append(" size_t bStep = block_size * B_col_inc; \n");
109  }
110  else if (row_major_B && !transpose_B)
111  {
112  source.append(" size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols; \n");
113  source.append(" size_t bStep = block_size * B_internal_cols * B_row_inc; \n");
114  }
115  else if (!row_major_B && transpose_B)
116  {
117  source.append(" size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows; \n");
118  source.append(" size_t bStep = block_size * B_internal_rows * B_col_inc; \n");
119  }
120  else if (!row_major_B && !transpose_B)
121  {
122  source.append(" size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start; \n");
123  source.append(" size_t bStep = block_size * B_row_inc; \n");
124  }
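  // Analogously, bBegin addresses the first element of this work-group's block column
  // of op(B), and bStep advances it by one 16-wide block along the k-dimension.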
125 
126 
127  if (transpose_A)
128  source.append(" size_t block_num = (A_row_size + block_size - 1) / block_size; \n");
129  else
130  source.append(" size_t block_num = (A_col_size + block_size - 1) / block_size; \n");
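  // block_num = ceil(k / block_size), where k is the inner dimension of the product
  // (the number of rows of A if A is transposed, its number of columns otherwise).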
131 
132  source.append(" "); source.append(numeric_string); source.append(" Csub = 0; \n");
133 
 134  //offset of the memory access by the thread relative to the beginning of the block:
135  if (row_major_A)
136  source.append(" size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols; \n");
137  else
138  source.append(" size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; \n");
139 
140  if (row_major_B)
141  source.append(" size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; \n");
142  else
143  source.append(" size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; \n");
144 
145  source.append(" size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); \n");
146  source.append(" size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); \n");
147 
148  source.append(" for (size_t block = 0; \n");
149  source.append(" block < block_num; \n");
150  source.append(" ++block) \n");
151  source.append(" { \n");
152 
153  //read block from A and check for access within matrix:
154 
155  if (transpose_A && row_major_A)
156  source.append(" bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; \n");
157  else if (transpose_A && !row_major_A)
158  source.append(" bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; \n");
159  else if (!transpose_A && row_major_A)
160  source.append(" bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; \n");
161  else if (!transpose_A && !row_major_A)
162  source.append(" bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; \n");
163 
164 
165  if (transpose_B && row_major_B)
166  source.append(" bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; \n");
167  else if (transpose_B && !row_major_B)
168  source.append(" bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; \n");
169  else if (!transpose_B && row_major_B)
170  source.append(" bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0; \n");
171  else if (!transpose_B && !row_major_B)
172  source.append(" bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0; \n");
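  // The boundary checks replace out-of-range elements with zero, so partial tiles at the
  // matrix borders contribute nothing to the accumulated products.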
173 
 174  //the computation of the block-wise matrix-matrix product is the same for all cases:
175  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
176 
177  //loop unrolling:
178  source.append(" __local "); source.append(numeric_string); source.append(" * bufAptr = bufA + row_thread_id_times_block_size; \n");
179  source.append(" __local "); source.append(numeric_string); source.append(" * bufBptr = bufB + col_thread_id_times_block_size; \n");
180 
181  for (size_t unroll = 0; unroll < 16; ++unroll) {
182  source.append(" Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; \n");
183  }
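  // The generator emits the multiply-add statement 16 times, i.e. the inner product over
  // the 16-wide tile is fully unrolled in the generated OpenCL code.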
184 
185  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
186  source.append(" aBegin += aStep; \n");
187  source.append(" bBegin += bStep; \n");
188  source.append(" } \n");
189 
190 
191  if (transpose_A)
192  {
193  source.append(" if (get_global_id(0) < A_col_size && ");
194  }
195  else
196  {
197  source.append(" if (get_global_id(0) < A_row_size && ");
198  }
199 
200  if (transpose_B)
201  {
202  source.append("get_global_id(1) < B_row_size) \n");
203  }
204  else
205  {
206  source.append("get_global_id(1) < B_col_size) \n");
207  }
208 
209  if (row_major_C)
210  {
211  source.append(" C[(get_global_id(0) * C_row_inc + C_row_start) * C_internal_cols + get_global_id(1) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(get_global_id(0) * C_row_inc + C_row_start) * C_internal_cols + get_global_id(1) * C_col_inc + C_col_start]; \n");
212  }
213  else
214  {
215  source.append(" C[get_global_id(0) * C_row_inc + C_row_start + (get_global_id(1) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[get_global_id(0) * C_row_inc + C_row_start + (get_global_id(1) * C_col_inc + C_col_start) * C_internal_rows]; \n");
216  }
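  // When beta is zero, C is written without being read, so the result buffer does not
  // need to be initialized in that case.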
217  source.append("} \n");
218  }
219 
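 // Generates the OpenCL source of a second kernel variant (prod16_*): each work-group of
 // 16x4 work-items computes a 16x64 block of C, with every work-item accumulating a
 // 16-element column slice of the result in private memory.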
220  template <typename StringType>
221  void generate_matrix_prod16_blas3(StringType & source, std::string const & numeric_string,
222  bool row_major_A, bool row_major_B, bool row_major_C,
223  bool transpose_A, bool transpose_B)
224  {
225  //vcl_size_t vector_size = 4;
226  vcl_size_t block_size = 16;
227 
228  //start OpenCL code:
229  source.append("__kernel void prod16_");
230  if (transpose_A)
231  source.append("T");
232  else
233  source.append("A");
234  if (transpose_B)
235  source.append("T");
236  else
237  source.append("A");
238 
239  source.append("( "); source.append(numeric_string); source.append(" alpha, \n");
240  source.append(" __global const "); source.append(numeric_string); source.append(" * A, \n");
241  source.append(" unsigned int A_row_start, \n");
242  source.append(" unsigned int A_col_start, \n");
243  source.append(" unsigned int A_row_inc, \n");
244  source.append(" unsigned int A_col_inc, \n");
245  source.append(" unsigned int A_row_size, \n"); //number of elements starting from row_start, using an increment of A_row_inc
246  source.append(" unsigned int A_col_size, \n");
247  source.append(" unsigned int A_internal_rows, \n");
248  source.append(" unsigned int A_internal_cols, \n");
249  source.append(" __global const "); source.append(numeric_string); source.append(" * B, \n");
250  source.append(" unsigned int B_row_start, \n");
251  source.append(" unsigned int B_col_start, \n");
252  source.append(" unsigned int B_row_inc, \n");
253  source.append(" unsigned int B_col_inc, \n");
254  source.append(" unsigned int B_row_size, \n");
255  source.append(" unsigned int B_col_size, \n");
256  source.append(" unsigned int B_internal_rows, \n");
257  source.append(" unsigned int B_internal_cols, \n");
258  source.append(" "); source.append(numeric_string); source.append(" beta, \n");
259  source.append(" __global "); source.append(numeric_string); source.append(" * C, \n");
260  source.append(" unsigned int C_row_start, \n");
261  source.append(" unsigned int C_col_start, \n");
262  source.append(" unsigned int C_row_inc, \n");
263  source.append(" unsigned int C_col_inc, \n");
264  source.append(" unsigned int C_row_size, \n");
265  source.append(" unsigned int C_col_size, \n");
266  source.append(" unsigned int C_internal_rows, \n");
267  source.append(" unsigned int C_internal_cols) \n");
268  source.append("{ \n");
 269  //do not forget to change block_size accordingly!
270  source.append(" size_t row_block_id = get_group_id(1); \n"); //refers to the row index in op(A), op(B)
271  source.append(" size_t col_block_id = get_group_id(0); \n"); //refers to the col index in op(A), op(B)
272  source.append(" size_t row_thread_id = get_local_id(1); \n");
273  source.append(" size_t col_thread_id = get_local_id(0); \n");
274 
275  source.append(" __local "); source.append(numeric_string); source.append(" As[256]; \n");
276 
277  source.append(" "); source.append(numeric_string); source.append(" cv[16] = {");
278  for (vcl_size_t i=0; i<block_size-1; ++i)
279  source.append("0,");
280  source.append("0}; \n");
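  // cv holds this work-item's 16 partial results (one column slice of the output block),
  // kept in private memory across the loop over k-blocks.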
281 
282  //traverse block row of A (taking mem layout and transpose operation into account)
283  if (row_major_A && transpose_A)
284  {
285  source.append(" size_t aBegin = (row_block_id * 16 * A_col_inc + A_col_start) + A_row_start * A_internal_cols; \n");
286  source.append(" size_t aStep = 16 * A_internal_cols * A_row_inc; \n");
287  source.append(" size_t aEnd = aBegin + A_internal_cols * A_row_inc * A_row_size; \n");
288  }
289  else if (row_major_A && !transpose_A)
290  {
291  source.append(" size_t aBegin = (row_block_id * 16 * A_row_inc + A_row_start) * A_internal_cols + A_col_start; \n");
292  source.append(" size_t aStep = 16 * A_col_inc; \n");
293  source.append(" size_t aEnd = aBegin + A_col_inc * A_col_size; \n");
294  }
295  else if (!row_major_A && transpose_A)
296  {
297  source.append(" size_t aBegin = (row_block_id * 16 * A_col_inc + A_col_start) * A_internal_rows + A_row_start; \n");
298  source.append(" size_t aStep = 16 * A_row_inc; \n");
299  source.append(" size_t aEnd = aBegin + A_row_inc * A_row_size; \n");
300  }
301  else if (!row_major_A && !transpose_A)
302  {
303  source.append(" size_t aBegin = (row_block_id * 16 * A_row_inc + A_row_start) + A_col_start * A_internal_rows; \n");
304  source.append(" size_t aStep = 16 * A_internal_rows * A_col_inc; \n");
305  source.append(" size_t aEnd = aBegin + A_internal_rows * A_col_inc * A_col_size; \n");
306  }
307 
308 
309  if (row_major_B && transpose_B)
310  {
311  source.append(" size_t bBegin = (col_block_id * 64 * B_row_inc + B_row_start) * B_internal_cols + B_col_start; \n");
312  source.append(" size_t bStep = 16 * B_col_inc; \n");
313  }
314  else if (row_major_B && !transpose_B)
315  {
316  source.append(" size_t bBegin = (col_block_id * 64 * B_col_inc + B_col_start) + B_row_start * B_internal_cols; \n");
317  source.append(" size_t bStep = 16 * B_row_inc * B_internal_cols; \n");
318  }
319  else if (!row_major_B && transpose_B)
320  {
321  source.append(" size_t bBegin = (col_block_id * 64 * B_row_inc + B_row_start) + B_col_start * B_internal_rows; \n");
322  source.append(" size_t bStep = 16 * B_col_inc * B_internal_rows; \n");
323  }
324  else if (!row_major_B && !transpose_B)
325  {
326  source.append(" size_t bBegin = (col_block_id * 64 * B_col_inc + B_col_start) * B_internal_rows + B_row_start; \n");
327  source.append(" size_t bStep = 16 * B_row_inc; \n");
328  }
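  // col_block_id is scaled by 64 because each work-group covers 64 columns of op(B) and C:
  // each of the 64 work-items handles the column selected by (16 * row_thread_id + col_thread_id).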
329 
330  source.append(" for(size_t a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep) { \n");
331 
 332  // copy a block of op(A) to local memory (op(A) is then stored column-major in local memory)
333  source.append(" for(size_t i = 0; i < 4; i++) \n");
334  if (row_major_A && transpose_A)
335  source.append(" As[ (i*4 + row_thread_id) + 16 * col_thread_id] = (A[a + A_col_inc * (i * 4 + row_thread_id) + A_internal_cols * A_row_inc * col_thread_id]);");
336  else if (row_major_A && !transpose_A)
337  source.append(" As[ (i*4 + row_thread_id) + 16 * col_thread_id] = (A[a + A_internal_cols * A_row_inc * (i * 4 + row_thread_id) + A_col_inc * col_thread_id]);");
338  else if (!row_major_A && transpose_A)
339  source.append(" As[ (i*4 + row_thread_id) + 16 * col_thread_id] = (A[a + A_internal_rows * A_col_inc * (i * 4 + row_thread_id) + A_row_inc * col_thread_id]);");
340  else if (!row_major_A && !transpose_A)
341  source.append(" As[ (i*4 + row_thread_id) + 16 * col_thread_id] = (A[a + A_row_inc * (i * 4 + row_thread_id) + A_internal_rows * A_col_inc * col_thread_id]);");
342 
343  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
344 
345  // initialize memory pointers
346  source.append(" __local "); source.append(numeric_string); source.append(" *ap = As; \n");
347  if (row_major_B && transpose_B)
348  {
349  source.append(" __global const "); source.append(numeric_string); source.append(" *bp = B + (b + (16 * row_thread_id + col_thread_id) * B_row_inc * B_internal_cols); \n");
350  }
351  else if (row_major_B && !transpose_B)
352  {
353  source.append(" __global const "); source.append(numeric_string); source.append(" *bp = B + (b + (16 * row_thread_id + col_thread_id) * B_col_inc); \n");
354  }
355  else if (!row_major_B && transpose_B)
356  {
357  source.append(" __global const "); source.append(numeric_string); source.append(" *bp = B + (b + (16 * row_thread_id + col_thread_id) * B_row_inc); \n");
358  }
359  else if (!row_major_B && !transpose_B)
360  {
361  source.append(" __global const "); source.append(numeric_string); source.append(" *bp = B + (b + (16 * row_thread_id + col_thread_id) * B_col_inc * B_internal_rows); \n");
362  }
363 
364  // run computations
365  source.append(" for(size_t i = 0; i < 16; i++) { \n");
366  if (row_major_B && transpose_B)
367  {
368  source.append(" "); source.append(numeric_string); source.append(" bv = bp[i * B_col_inc]; \n");
369  }
370  else if (row_major_B && !transpose_B)
371  {
372  source.append(" "); source.append(numeric_string); source.append(" bv = bp[i * B_row_inc * B_internal_cols]; \n");
373  }
374  else if (!row_major_B && transpose_B)
375  {
376  source.append(" "); source.append(numeric_string); source.append(" bv = bp[i * B_col_inc * B_internal_rows]; \n");
377  }
378  else if (!row_major_B && !transpose_B)
379  {
380  source.append(" "); source.append(numeric_string); source.append(" bv = bp[i * B_row_inc]; \n");
381  }
382 
383  source.append(" for(size_t k = 0; k < 16; k++) \n");
384  source.append(" cv[k] += ap[k] * bv; \n");
385 
386  source.append(" ap += 16; \n");
387  source.append(" } \n");
388 
389  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
390  source.append(" } \n");
391 
392  // write to C
393  if (row_major_C)
394  {
395  source.append(" int c = C_internal_cols * (C_row_inc * 16 * row_block_id + C_row_start) + 64 * C_col_inc * col_block_id + C_col_start \n"); //block column index
396  source.append(" + C_col_inc * (16 * row_thread_id + col_thread_id); \n");
397  }
398  else
399  {
400  source.append(" int c = C_row_inc * 16 * row_block_id + C_row_start + (64 * C_col_inc * col_block_id + C_col_start) * C_internal_rows \n"); // block column index
401  source.append(" + C_internal_rows * C_col_inc * (16 * row_thread_id + col_thread_id); \n");
402  }
403 
404  source.append(" for(size_t i = 0; i < 16; i++) { \n");
405 
406  if (row_major_C)
407  {
408  source.append(" C[c] = (beta == 0) ? alpha * cv[i] : alpha * cv[i] + beta * C[c]; \n");
409  source.append(" c += C_internal_cols * C_row_inc; \n");
410  }
411  else
412  {
413  source.append(" C[c] = (beta == 0) ? alpha * cv[i] : alpha * cv[i] + beta * C[c]; \n");
414  source.append(" c += C_row_inc; \n");
415  }
416 
417  source.append(" } \n");
418  source.append("} \n");
419 
420  }
421 
422 
423  // main kernel class
430  template <class NumericT, typename F_A, typename F_B, typename F_C>
431  struct matrix_prod
432  {
433  static std::string program_name()
434  {
436  }
437 
438  static void init(viennacl::ocl::context & ctx)
439  {
441  std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
442  bool row_major_A = viennacl::is_row_major<F_A>::value;
443  bool row_major_B = viennacl::is_row_major<F_B>::value;
444  bool row_major_C = viennacl::is_row_major<F_C>::value;
445 
446 
447  static std::map<cl_context, bool> init_done;
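  // init_done is keyed by the raw cl_context handle, so the kernel sources are compiled
  // at most once per OpenCL context.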
448  if (!init_done[ctx.handle().get()])
449  {
450  std::string source;
451  source.reserve(8192);
452 
453  viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
454 
455  // only generate for floating points (forces error for integers)
456  if (numeric_string == "float" || numeric_string == "double")
457  {
458  generate_matrix_prod_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, false, false);
459  generate_matrix_prod_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, false, true);
460  generate_matrix_prod_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, true, false);
461  generate_matrix_prod_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, true, true);
462 
463  generate_matrix_prod16_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, false, false);
464  generate_matrix_prod16_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, false, true);
465  generate_matrix_prod16_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, true, false);
466  generate_matrix_prod16_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, true, true);
467 
468  }
469 
470  std::string prog_name = program_name();
471  #ifdef VIENNACL_BUILD_INFO
472  std::cout << "Creating program " << prog_name << std::endl;
473  #endif
474  ctx.add_program(source, prog_name);
475  init_done[ctx.handle().get()] = true;
476  } //if
477  } //init
478  };
479 
480  } // namespace kernels
481  } // namespace opencl
482  } // namespace linalg
483 } // namespace viennacl
484 #endif
485 
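A minimal usage sketch (an illustration, not part of this header): it assumes the ViennaCL OpenCL context API from viennacl/ocl/backend.hpp (viennacl::ocl::current_context()) and viennacl::ocl::context::get_kernel(); the kernel names follow the prod_XY / prod16_XY scheme generated above, and the exact string returned by program_name() is not shown in this listing.

    #include "viennacl/matrix.hpp"
    #include "viennacl/ocl/backend.hpp"
    #include "viennacl/linalg/opencl/kernels/matrix_prod.hpp"

    // Registers the generated kernels for float with row-major A, B and C,
    // then retrieves the variant for C = alpha * A * B + beta * C.
    void setup_prod_kernels()
    {
      typedef viennacl::row_major F;
      viennacl::ocl::context & ctx = viennacl::ocl::current_context();

      typedef viennacl::linalg::opencl::kernels::matrix_prod<float, F, F, F> KernelClass;
      KernelClass::init(ctx);  // compiles prod_AA, prod_AT, prod_TA, prod_TT and the prod16_* variants

      viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), "prod_AA");
      // k can now be configured (global/local work sizes) and enqueued with the
      // alpha/A/B/beta/C arguments in the order of the generated kernel signature.
      (void)k;
    }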