ViennaCL - The Vienna Computing Library  1.5.2
vector.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_HPP
3 
7 #include "viennacl/ocl/utils.hpp"
8 
11 namespace viennacl
12 {
13  namespace linalg
14  {
15  namespace opencl
16  {
17  namespace kernels
18  {
19 
21 
24  {
25  VIENNACL_AVBV_NONE = 0, // vector does not exist/contribute
28  };
29 
31  struct avbv_config
32  {
34 
36  std::string assign_op;
39  };
40 
41  // just returns the for-loop
42  template <typename StringType>
43  void generate_avbv_impl2(StringType & source, std::string const & /*numeric_string*/, avbv_config const & cfg, bool mult_alpha, bool mult_beta)
44  {
45  source.append(" for (unsigned int i = get_global_id(0); i < size1.z; i += get_global_size(0)) \n");
46  if (cfg.with_stride_and_range)
47  {
48  source.append(" vec1[i*size1.y+size1.x] "); source.append(cfg.assign_op); source.append(" vec2[i*size2.y+size2.x] ");
49  if (mult_alpha)
50  source.append("* alpha ");
51  else
52  source.append("/ alpha ");
53  if (cfg.b != VIENNACL_AVBV_NONE)
54  {
55  source.append("+ vec3[i*size3.y+size3.x] ");
56  if (mult_beta)
57  source.append("* beta");
58  else
59  source.append("/ beta");
60  }
61  }
62  else
63  {
64  source.append(" vec1[i] "); source.append(cfg.assign_op); source.append(" vec2[i] ");
65  if (mult_alpha)
66  source.append("* alpha ");
67  else
68  source.append("/ alpha ");
69  if (cfg.b != VIENNACL_AVBV_NONE)
70  {
71  source.append("+ vec3[i] ");
72  if (mult_beta)
73  source.append("* beta");
74  else
75  source.append("/ beta");
76  }
77  }
78  source.append("; \n");
79  }
80 
// Generates one fully parametrized kernel of the family
//   vec1 OP alpha * vec2 [ + beta * vec3 ]
// where OP is '=' or '+=' and each scalar may come from the host ("cpu",
// passed by value) or from device memory ("gpu", single-entry buffer).
// The kernel name encodes the configuration: av / avbv, optional "_v" for
// the in-place ('+=') variant, and one "_cpu"/"_gpu" suffix per scalar.
template <typename StringType>
void generate_avbv_impl(StringType & source, std::string const & numeric_string, avbv_config const & cfg)
{
  // --- assemble the kernel name from cfg ---
  source.append("__kernel void av");
  if (cfg.b != VIENNACL_AVBV_NONE)
    source.append("bv");
  if (cfg.assign_op != "=")
    source.append("_v");          // marks the '+=' (in-place add) variant

  if (cfg.a == VIENNACL_AVBV_CPU)
    source.append("_cpu");
  else if (cfg.a == VIENNACL_AVBV_GPU)
    source.append("_gpu");

  if (cfg.b == VIENNACL_AVBV_CPU)
    source.append("_cpu");
  else if (cfg.b == VIENNACL_AVBV_GPU)
    source.append("_gpu");
  source.append("( \n");

  // --- parameter list ---
  // size1 packs offset (.x), stride (.y) and element count (.z); see generate_avbv_impl2.
  source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n");
  source.append(" uint4 size1, \n");
  source.append(" \n");
  // alpha: by value from the host, or as a one-entry device buffer
  if (cfg.a == VIENNACL_AVBV_CPU)
  {
    source.append(" "); source.append(numeric_string); source.append(" fac2, \n");
  }
  else if (cfg.a == VIENNACL_AVBV_GPU)
  {
    source.append(" __global "); source.append(numeric_string); source.append(" * fac2, \n");
  }
  source.append(" unsigned int options2, \n"); // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
  source.append(" __global const "); source.append(numeric_string); source.append(" * vec2, \n");
  source.append(" uint4 size2");

  // optional second operand (beta * vec3), mirroring the alpha/vec2 parameters
  if (cfg.b != VIENNACL_AVBV_NONE)
  {
    source.append(", \n\n");
    if (cfg.b == VIENNACL_AVBV_CPU)
    {
      source.append(" "); source.append(numeric_string); source.append(" fac3, \n");
    }
    else if (cfg.b == VIENNACL_AVBV_GPU)
    {
      source.append(" __global "); source.append(numeric_string); source.append(" * fac3, \n");
    }
    source.append(" unsigned int options3, \n"); // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
    source.append(" __global const "); source.append(numeric_string); source.append(" * vec3, \n");
    source.append(" uint4 size3 \n");
  }
  source.append(") { \n");

  // --- materialize alpha inside the kernel and apply the sign-flip bit (bit 0) ---
  if (cfg.a == VIENNACL_AVBV_CPU)
  {
    source.append(" "); source.append(numeric_string); source.append(" alpha = fac2; \n");
  }
  else if (cfg.a == VIENNACL_AVBV_GPU)
  {
    source.append(" "); source.append(numeric_string); source.append(" alpha = fac2[0]; \n");
  }
  source.append(" if (options2 & (1 << 0)) \n");
  source.append(" alpha = -alpha; \n");
  source.append(" \n");

  // --- same for beta, only emitted when a second operand participates ---
  if (cfg.b == VIENNACL_AVBV_CPU)
  {
    source.append(" "); source.append(numeric_string); source.append(" beta = fac3; \n");
  }
  else if (cfg.b == VIENNACL_AVBV_GPU)
  {
    source.append(" "); source.append(numeric_string); source.append(" beta = fac3[0]; \n");
  }
  if (cfg.b != VIENNACL_AVBV_NONE)
  {
    source.append(" if (options3 & (1 << 0)) \n");
    source.append(" beta = -beta; \n");
    source.append(" \n");
  }

  // Bit 1 of options2/options3 selects 'take inverse': dispatch to the loop
  // generator with mult_alpha/mult_beta == false so it emits '/ alpha' ('/ beta')
  // instead of '* alpha' ('* beta'). All four combinations are generated as
  // runtime branches inside the kernel.
  source.append(" if (options2 & (1 << 1)) { \n");
  if (cfg.b != VIENNACL_AVBV_NONE)
  {
    source.append(" if (options3 & (1 << 1)) {\n");
    generate_avbv_impl2(source, numeric_string, cfg, false, false);
    source.append(" } else {\n");
    generate_avbv_impl2(source, numeric_string, cfg, false, true);
    source.append(" } \n");
  }
  else
    generate_avbv_impl2(source, numeric_string, cfg, false, true);
  source.append(" } else { \n");
  if (cfg.b != VIENNACL_AVBV_NONE)
  {
    source.append(" if (options3 & (1 << 1)) {\n");
    generate_avbv_impl2(source, numeric_string, cfg, true, false);
    source.append(" } else {\n");
    generate_avbv_impl2(source, numeric_string, cfg, true, true);
    source.append(" } \n");
  }
  else
    generate_avbv_impl2(source, numeric_string, cfg, true, true);
  source.append(" } \n");
  source.append("} \n");
}
183 
184  template <typename StringType>
185  void generate_avbv(StringType & source, std::string const & numeric_string)
186  {
187  avbv_config cfg;
188  cfg.assign_op = "=";
189  cfg.with_stride_and_range = true;
190 
191  // av
192  cfg.b = VIENNACL_AVBV_NONE; cfg.a = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
193  cfg.b = VIENNACL_AVBV_NONE; cfg.a = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
194 
195  // avbv
196  cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
197  cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
198  cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
199  cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
200 
201  // avbv
202  cfg.assign_op = "+=";
203 
204  cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
205  cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
206  cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
207  cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
208  }
209 
210  template <typename StringType>
211  void generate_plane_rotation(StringType & source, std::string const & numeric_string)
212  {
213  source.append("__kernel void plane_rotation( \n");
214  source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n");
215  source.append(" unsigned int start1, \n");
216  source.append(" unsigned int inc1, \n");
217  source.append(" unsigned int size1, \n");
218  source.append(" __global "); source.append(numeric_string); source.append(" * vec2, \n");
219  source.append(" unsigned int start2, \n");
220  source.append(" unsigned int inc2, \n");
221  source.append(" unsigned int size2, \n");
222  source.append(" "); source.append(numeric_string); source.append(" alpha, \n");
223  source.append(" "); source.append(numeric_string); source.append(" beta) \n");
224  source.append("{ \n");
225  source.append(" "); source.append(numeric_string); source.append(" tmp1 = 0; \n");
226  source.append(" "); source.append(numeric_string); source.append(" tmp2 = 0; \n");
227  source.append(" \n");
228  source.append(" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n");
229  source.append(" { \n");
230  source.append(" tmp1 = vec1[i*inc1+start1]; \n");
231  source.append(" tmp2 = vec2[i*inc2+start2]; \n");
232  source.append(" \n");
233  source.append(" vec1[i*inc1+start1] = alpha * tmp1 + beta * tmp2; \n");
234  source.append(" vec2[i*inc2+start2] = alpha * tmp2 - beta * tmp1; \n");
235  source.append(" } \n");
236  source.append(" \n");
237  source.append("} \n");
238  }
239 
240  template <typename StringType>
241  void generate_vector_swap(StringType & source, std::string const & numeric_string)
242  {
243  source.append("__kernel void swap( \n");
244  source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n");
245  source.append(" unsigned int start1, \n");
246  source.append(" unsigned int inc1, \n");
247  source.append(" unsigned int size1, \n");
248  source.append(" __global "); source.append(numeric_string); source.append(" * vec2, \n");
249  source.append(" unsigned int start2, \n");
250  source.append(" unsigned int inc2, \n");
251  source.append(" unsigned int size2 \n");
252  source.append(" ) \n");
253  source.append("{ \n");
254  source.append(" "); source.append(numeric_string); source.append(" tmp; \n");
255  source.append(" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n");
256  source.append(" { \n");
257  source.append(" tmp = vec2[i*inc2+start2]; \n");
258  source.append(" vec2[i*inc2+start2] = vec1[i*inc1+start1]; \n");
259  source.append(" vec1[i*inc1+start1] = tmp; \n");
260  source.append(" } \n");
261  source.append("} \n");
262  }
263 
264  template <typename StringType>
265  void generate_assign_cpu(StringType & source, std::string const & numeric_string)
266  {
267  source.append("__kernel void assign_cpu( \n");
268  source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n");
269  source.append(" unsigned int start1, \n");
270  source.append(" unsigned int inc1, \n");
271  source.append(" unsigned int size1, \n");
272  source.append(" unsigned int internal_size1, \n");
273  source.append(" "); source.append(numeric_string); source.append(" alpha) \n");
274  source.append("{ \n");
275  source.append(" for (unsigned int i = get_global_id(0); i < internal_size1; i += get_global_size(0)) \n");
276  source.append(" vec1[i*inc1+start1] = (i < size1) ? alpha : 0; \n");
277  source.append("} \n");
278 
279  }
280 
// Generates the kernel 'inner_prod<N>' (N == vector_num) computing partial
// inner products <x, y_0>, ..., <x, y_{N-1}> simultaneously. Each work-group
// reduces its slice of x and writes one partial result per y-vector into
// group_buffer (laid out: group_buffer[group + i * num_groups] for vector i);
// the final cross-group reduction is presumably done by a follow-up kernel
// such as sum_inner_prod -- confirm against the calling code.
// tmp_buffer must provide vector_num * get_local_size(0) local entries.
template <typename StringType>
void generate_inner_prod(StringType & source, std::string const & numeric_string, vcl_size_t vector_num)
{
  std::stringstream ss;
  ss << vector_num;
  std::string vector_num_string = ss.str();

  source.append("__kernel void inner_prod"); source.append(vector_num_string); source.append("( \n");
  // params_x packs offset (.x), stride (.y) and element count (.z) of x
  source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
  source.append(" uint4 params_x, \n");
  // one (pointer, params) pair per y-vector: y0, params_y0, y1, params_y1, ...
  for (vcl_size_t i=0; i<vector_num; ++i)
  {
    ss.str("");
    ss << i;
    source.append(" __global const "); source.append(numeric_string); source.append(" * y"); source.append(ss.str()); source.append(", \n");
    source.append(" uint4 params_y"); source.append(ss.str()); source.append(", \n");
  }
  source.append(" __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * group_buffer) \n");
  source.append("{ \n");
  // split the x-range into contiguous chunks, one chunk per work-group
  source.append(" unsigned int entries_per_thread = (params_x.z - 1) / get_global_size(0) + 1; \n");
  source.append(" unsigned int vec_start_index = get_group_id(0) * get_local_size(0) * entries_per_thread; \n");
  source.append(" unsigned int vec_stop_index = min((unsigned int)((get_group_id(0) + 1) * get_local_size(0) * entries_per_thread), params_x.z); \n");

  // compute partial results within group:
  // one private accumulator tmp<i> per y-vector
  for (vcl_size_t i=0; i<vector_num; ++i)
  {
    ss.str("");
    ss << i;
    source.append(" "); source.append(numeric_string); source.append(" tmp"); source.append(ss.str()); source.append(" = 0; \n");
  }
  // each x-entry is loaded once and multiplied into all N accumulators
  source.append(" for (unsigned int i = vec_start_index + get_local_id(0); i < vec_stop_index; i += get_local_size(0)) { \n");
  source.append(" "); source.append(numeric_string); source.append(" val_x = x[i*params_x.y + params_x.x]; \n");
  for (vcl_size_t i=0; i<vector_num; ++i)
  {
    ss.str("");
    ss << i;
    source.append(" tmp"); source.append(ss.str()); source.append(" += val_x * y"); source.append(ss.str()); source.append("[i * params_y"); source.append(ss.str()); source.append(".y + params_y"); source.append(ss.str()); source.append(".x]; \n");
  }
  source.append(" } \n");
  // stash the private accumulators in local memory: slice i occupies
  // tmp_buffer[i * local_size .. i * local_size + local_size - 1]
  for (vcl_size_t i=0; i<vector_num; ++i)
  {
    ss.str("");
    ss << i;
    source.append(" tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(" * get_local_size(0)] = tmp"); source.append(ss.str()); source.append("; \n");
  }

  // now run reduction:
  // tree reduction over each slice; the barrier at the top of the loop also
  // covers the initial tmp_buffer stores above
  source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) { \n");
  for (vcl_size_t i=0; i<vector_num; ++i)
  {
    ss.str("");
    ss << i;
    source.append(" tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(" * get_local_size(0)] += tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(" * get_local_size(0) + stride]; \n");
  }
  source.append(" } \n");
  source.append(" } \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  // work-item 0 publishes the N per-group results
  source.append(" if (get_local_id(0) == 0) { \n");
  for (vcl_size_t i=0; i<vector_num; ++i)
  {
    ss.str("");
    ss << i;
    source.append(" group_buffer[get_group_id(0) + "); source.append(ss.str()); source.append(" * get_num_groups(0)] = tmp_buffer["); source.append(ss.str()); source.append(" * get_local_size(0)]; \n");
  }
  source.append(" } \n");
  source.append("} \n");

}
354 
// Generates a device helper 'impl_norm' plus the kernel 'norm'.
// norm_selector: 1 -> 1-norm (sum of |x_i|), 2 -> partial 2-norm (sum of
// squares; the square root is presumably applied after the final host/device
// reduction -- confirm against the caller), 0 -> sup-norm (max |x_i|).
// Each work-group processes an even share of the vector and writes its
// partial result to group_buffer.
template <typename StringType>
void generate_norm(StringType & source, std::string const & numeric_string)
{
  // floating-point types use fabs/fmax; integral types use abs/max
  bool is_float_or_double = (numeric_string == "float" || numeric_string == "double");

  // ---- device helper: partial norm over size1 entries starting at start1 ----
  source.append(numeric_string); source.append(" impl_norm( \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * vec, \n");
  source.append(" unsigned int start1, \n");
  source.append(" unsigned int inc1, \n");
  source.append(" unsigned int size1, \n");
  source.append(" unsigned int norm_selector, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * tmp_buffer) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" tmp = 0; \n");
  source.append(" if (norm_selector == 1) \n"); //norm_1
  source.append(" { \n");
  source.append(" for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n");
  if (is_float_or_double)
    source.append(" tmp += fabs(vec[i*inc1 + start1]); \n");
  else
    source.append(" tmp += abs(vec[i*inc1 + start1]); \n");
  source.append(" } \n");
  source.append(" else if (norm_selector == 2) \n"); //norm_2
  source.append(" { \n");
  source.append(" "); source.append(numeric_string); source.append(" vec_entry = 0; \n");
  source.append(" for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n");
  source.append(" { \n");
  source.append(" vec_entry = vec[i*inc1 + start1]; \n");
  source.append(" tmp += vec_entry * vec_entry; \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append(" else if (norm_selector == 0) \n"); //norm_inf
  source.append(" { \n");
  source.append(" for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n");
  if (is_float_or_double)
    source.append(" tmp = fmax(fabs(vec[i*inc1 + start1]), tmp); \n");
  else
  {
    // cast needed: OpenCL integer abs() returns the unsigned counterpart type
    source.append(" tmp = max(("); source.append(numeric_string); source.append(")abs(vec[i*inc1 + start1]), tmp); \n");
  }
  source.append(" } \n");

  source.append(" tmp_buffer[get_local_id(0)] = tmp; \n");

  source.append(" if (norm_selector > 0) \n"); //norm_1 or norm_2:
  source.append(" { \n");
  // additive tree reduction; barrier at loop top also covers the store above
  source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) \n");
  source.append(" tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride]; \n");
  source.append(" } \n");
  // NOTE(review): tmp_buffer[0] is read without a barrier after the final
  // reduction step; presumably only work-item 0's return value is consumed
  // (see the 'norm' kernel below) -- confirm.
  source.append(" return tmp_buffer[0]; \n");
  source.append(" } \n");

  //norm_inf: max-based tree reduction
  source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
  source.append(" { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) < stride) \n");
  if (is_float_or_double)
    source.append(" tmp_buffer[get_local_id(0)] = fmax(tmp_buffer[get_local_id(0)], tmp_buffer[get_local_id(0)+stride]); \n");
  else
    source.append(" tmp_buffer[get_local_id(0)] = max(tmp_buffer[get_local_id(0)], tmp_buffer[get_local_id(0)+stride]); \n");
  source.append(" } \n");

  source.append(" return tmp_buffer[0]; \n");
  // NOTE(review): emits "};" -- the stray ';' after the function body ends up
  // in the OpenCL source; most compilers accept it, but it could be dropped.
  source.append("}; \n");

  // ---- kernel: each work-group handles slice [g*size1/G, (g+1)*size1/G) ----
  source.append("__kernel void norm( \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * vec, \n");
  source.append(" unsigned int start1, \n");
  source.append(" unsigned int inc1, \n");
  source.append(" unsigned int size1, \n");
  source.append(" unsigned int norm_selector, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * group_buffer) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" tmp = impl_norm(vec, \n");
  source.append(" ( get_group_id(0) * size1) / get_num_groups(0) * inc1 + start1, \n");
  source.append(" inc1, \n");
  source.append(" ( (1 + get_group_id(0)) * size1) / get_num_groups(0) \n");
  source.append(" - ( get_group_id(0) * size1) / get_num_groups(0), \n");
  source.append(" norm_selector, \n");
  source.append(" tmp_buffer); \n");

  // one partial result per work-group
  source.append(" if (get_local_id(0) == 0) \n");
  source.append(" group_buffer[get_group_id(0)] = tmp; \n");
  source.append("} \n");

}
446 
447  template <typename StringType>
448  void generate_inner_prod_sum(StringType & source, std::string const & numeric_string)
449  {
450  // sums the array 'vec1' and writes to result. Makes use of a single work-group only.
451  source.append("__kernel void sum_inner_prod( \n");
452  source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n");
453  source.append(" __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n");
454  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
455  source.append(" unsigned int start_result, \n");
456  source.append(" unsigned int inc_result) \n");
457  source.append("{ \n");
458  source.append(" tmp_buffer[get_local_id(0)] = vec1[get_global_id(0)]; \n");
459 
460  source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
461  source.append(" { \n");
462  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
463  source.append(" if (get_local_id(0) < stride) \n");
464  source.append(" tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0) + stride]; \n");
465  source.append(" } \n");
466  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
467 
468  source.append(" if (get_local_id(0) == 0) \n");
469  source.append(" result[start_result + inc_result * get_group_id(0)] = tmp_buffer[0]; \n");
470  source.append("} \n");
471 
472  }
473 
474  template <typename StringType>
475  void generate_sum(StringType & source, std::string const & numeric_string)
476  {
477  // sums the array 'vec1' and writes to result. Makes use of a single work-group only.
478  source.append("__kernel void sum( \n");
479  source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n");
480  source.append(" unsigned int start1, \n");
481  source.append(" unsigned int inc1, \n");
482  source.append(" unsigned int size1, \n");
483  source.append(" unsigned int option, \n"); //0: use fmax, 1: just sum, 2: sum and return sqrt of sum
484  source.append(" __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n");
485  source.append(" __global "); source.append(numeric_string); source.append(" * result) \n");
486  source.append("{ \n");
487  source.append(" "); source.append(numeric_string); source.append(" thread_sum = 0; \n");
488  source.append(" "); source.append(numeric_string); source.append(" tmp = 0; \n");
489  source.append(" for (unsigned int i = get_local_id(0); i<size1; i += get_local_size(0)) \n");
490  source.append(" { \n");
491  source.append(" if (option > 0) \n");
492  source.append(" thread_sum += vec1[i*inc1+start1]; \n");
493  source.append(" else \n");
494  source.append(" { \n");
495  source.append(" tmp = vec1[i*inc1+start1]; \n");
496  source.append(" tmp = (tmp < 0) ? -tmp : tmp; \n");
497  source.append(" thread_sum = (thread_sum > tmp) ? thread_sum : tmp; \n");
498  source.append(" } \n");
499  source.append(" } \n");
500 
501  source.append(" tmp_buffer[get_local_id(0)] = thread_sum; \n");
502 
503  source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
504  source.append(" { \n");
505  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
506  source.append(" if (get_local_id(0) < stride) \n");
507  source.append(" { \n");
508  source.append(" if (option > 0) \n");
509  source.append(" tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0) + stride]; \n");
510  source.append(" else \n");
511  source.append(" tmp_buffer[get_local_id(0)] = (tmp_buffer[get_local_id(0)] > tmp_buffer[get_local_id(0) + stride]) ? tmp_buffer[get_local_id(0)] : tmp_buffer[get_local_id(0) + stride]; \n");
512  source.append(" } \n");
513  source.append(" } \n");
514  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
515 
516  source.append(" if (get_global_id(0) == 0) \n");
517  source.append(" { \n");
518  if (numeric_string == "float" || numeric_string == "double")
519  {
520  source.append(" if (option == 2) \n");
521  source.append(" *result = sqrt(tmp_buffer[0]); \n");
522  source.append(" else \n");
523  }
524  source.append(" *result = tmp_buffer[0]; \n");
525  source.append(" } \n");
526  source.append("} \n");
527 
528  }
529 
530  template <typename StringType>
531  void generate_index_norm_inf(StringType & source, std::string const & numeric_string)
532  {
533  //index_norm_inf:
534  source.append("unsigned int index_norm_inf_impl( \n");
535  source.append(" __global const "); source.append(numeric_string); source.append(" * vec, \n");
536  source.append(" unsigned int start1, \n");
537  source.append(" unsigned int inc1, \n");
538  source.append(" unsigned int size1, \n");
539  source.append(" __local "); source.append(numeric_string); source.append(" * entry_buffer, \n");
540  source.append(" __local unsigned int * index_buffer) \n");
541  source.append("{ \n");
542  //step 1: fill buffer:
543  source.append(" "); source.append(numeric_string); source.append(" cur_max = 0; \n");
544  source.append(" "); source.append(numeric_string); source.append(" tmp; \n");
545  source.append(" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n");
546  source.append(" { \n");
547  if (numeric_string == "float" || numeric_string == "double")
548  source.append(" tmp = fabs(vec[i*inc1+start1]); \n");
549  else
550  source.append(" tmp = abs(vec[i*inc1+start1]); \n");
551  source.append(" if (cur_max < tmp) \n");
552  source.append(" { \n");
553  source.append(" entry_buffer[get_global_id(0)] = tmp; \n");
554  source.append(" index_buffer[get_global_id(0)] = i; \n");
555  source.append(" cur_max = tmp; \n");
556  source.append(" } \n");
557  source.append(" } \n");
558 
559  //step 2: parallel reduction:
560  source.append(" for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2) \n");
561  source.append(" { \n");
562  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
563  source.append(" if (get_global_id(0) < stride) \n");
564  source.append(" { \n");
565  //find the first occurring index
566  source.append(" if (entry_buffer[get_global_id(0)] < entry_buffer[get_global_id(0)+stride]) \n");
567  source.append(" { \n");
568  source.append(" index_buffer[get_global_id(0)] = index_buffer[get_global_id(0)+stride]; \n");
569  source.append(" entry_buffer[get_global_id(0)] = entry_buffer[get_global_id(0)+stride]; \n");
570  source.append(" } \n");
571  source.append(" } \n");
572  source.append(" } \n");
573  source.append(" \n");
574  source.append(" return index_buffer[0]; \n");
575  source.append("} \n");
576 
577  source.append("__kernel void index_norm_inf( \n");
578  source.append(" __global "); source.append(numeric_string); source.append(" * vec, \n");
579  source.append(" unsigned int start1, \n");
580  source.append(" unsigned int inc1, \n");
581  source.append(" unsigned int size1, \n");
582  source.append(" __local "); source.append(numeric_string); source.append(" * entry_buffer, \n");
583  source.append(" __local unsigned int * index_buffer, \n");
584  source.append(" __global unsigned int * result) \n");
585  source.append("{ \n");
586  source.append(" entry_buffer[get_global_id(0)] = 0; \n");
587  source.append(" index_buffer[get_global_id(0)] = 0; \n");
588  source.append(" unsigned int tmp = index_norm_inf_impl(vec, start1, inc1, size1, entry_buffer, index_buffer); \n");
589  source.append(" if (get_global_id(0) == 0) *result = tmp; \n");
590  source.append("} \n");
591 
592  }
593 
594 
596 
597  // main kernel class
599  template <class TYPE>
600  struct vector
601  {
602  static std::string program_name()
603  {
604  return viennacl::ocl::type_to_string<TYPE>::apply() + "_vector";
605  }
606 
607  static void init(viennacl::ocl::context & ctx)
608  {
610  std::string numeric_string = viennacl::ocl::type_to_string<TYPE>::apply();
611 
612  static std::map<cl_context, bool> init_done;
613  if (!init_done[ctx.handle().get()])
614  {
615  std::string source;
616  source.reserve(8192);
617 
618  viennacl::ocl::append_double_precision_pragma<TYPE>(ctx, source);
619 
620  // fully parametrized kernels:
621  generate_avbv(source, numeric_string);
622 
623  // kernels with mostly predetermined skeleton:
624  generate_plane_rotation(source, numeric_string);
625  generate_vector_swap(source, numeric_string);
626  generate_assign_cpu(source, numeric_string);
627 
628  generate_inner_prod(source, numeric_string, 1);
629  generate_norm(source, numeric_string);
630  generate_sum(source, numeric_string);
631  generate_index_norm_inf(source, numeric_string);
632 
633  std::string prog_name = program_name();
634  #ifdef VIENNACL_BUILD_INFO
635  std::cout << "Creating program " << prog_name << std::endl;
636  #endif
637  ctx.add_program(source, prog_name);
638  init_done[ctx.handle().get()] = true;
639  } //if
640  } //init
641  };
642 
643  // class with kernels for multiple inner products.
// Kernel class providing the OpenCL kernels for multiple simultaneous inner
// products (inner_prod2/3/4/8 plus the sum_inner_prod reduction kernel).
// NOTE(review): the line declaring this struct's name is missing from this
// view of the file (extraction artifact); the surrounding documentation
// index suggests a "..._vector_multi"-style kernel class -- restore the
// struct header from the original source before compiling.
template <class TYPE>
 {
  // Program name, e.g. "float_vector_multi" for TYPE == float.
  static std::string program_name()
  {
   return viennacl::ocl::type_to_string<TYPE>::apply() + "_vector_multi";
  }

  // Generates and compiles the multi-inner-product kernels for the given
  // context; runs at most once per OpenCL context.
  static void init(viennacl::ocl::context & ctx)
  {
   std::string numeric_string = viennacl::ocl::type_to_string<TYPE>::apply();

   static std::map<cl_context, bool> init_done;
   if (!init_done[ctx.handle().get()])
   {
    std::string source;
    source.reserve(8192);

    // emits the fp64 pragma when TYPE requires double precision support
    viennacl::ocl::append_double_precision_pragma<TYPE>(ctx, source);

    // partial inner products for 2, 3, 4 and 8 right-hand-side vectors
    generate_inner_prod(source, numeric_string, 2);
    generate_inner_prod(source, numeric_string, 3);
    generate_inner_prod(source, numeric_string, 4);
    generate_inner_prod(source, numeric_string, 8);

    // final reduction of the per-group partial results
    generate_inner_prod_sum(source, numeric_string);

    std::string prog_name = program_name();
    #ifdef VIENNACL_BUILD_INFO
    std::cout << "Creating program " << prog_name << std::endl;
    #endif
    ctx.add_program(source, prog_name);
    init_done[ctx.handle().get()] = true;
   } //if
  } //init
 };
682 
683  } // namespace kernels
684  } // namespace opencl
685  } // namespace linalg
686 } // namespace viennacl
687 #endif
688 
avbv_scalar_type a
Definition: vector.hpp:37
std::size_t vcl_size_t
Definition: forwards.h:58
Implements a OpenCL platform within ViennaCL.
void generate_inner_prod(StringType &source, std::string const &numeric_string, vcl_size_t vector_num)
Definition: vector.hpp:282
void generate_index_norm_inf(StringType &source, std::string const &numeric_string)
Definition: vector.hpp:531
avbv_scalar_type
Enumeration for the scalar type in avbv-like operations.
Definition: vector.hpp:23
Various little tools used here and there in ViennaCL.
void generate_vector_swap(StringType &source, std::string const &numeric_string)
Definition: vector.hpp:241
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Definition: context.hpp:51
static std::string program_name()
Definition: vector.hpp:602
Main kernel class for generating OpenCL kernels for multiple inner products on/with viennacl::vector<...
Definition: vector.hpp:646
void generate_assign_cpu(StringType &source, std::string const &numeric_string, bool is_row_major)
Definition: matrix.hpp:257
Provides OpenCL-related utilities.
void generate_plane_rotation(StringType &source, std::string const &numeric_string)
Definition: vector.hpp:211
std::string assign_op
Definition: vector.hpp:36
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
Definition: context.hpp:476
avbv_scalar_type b
Definition: vector.hpp:38
void generate_avbv(StringType &source, std::string const &numeric_string)
Definition: vector.hpp:185
void generate_sum(StringType &source, std::string const &numeric_string)
Definition: vector.hpp:475
const OCL_TYPE & get() const
Definition: handle.hpp:189
bool with_stride_and_range
Definition: vector.hpp:35
Configuration struct for generating OpenCL kernels for linear combinations of vectors.
Definition: vector.hpp:31
static void apply(viennacl::ocl::context const &)
Definition: utils.hpp:40
void generate_avbv_impl(StringType &source, std::string const &numeric_string, avbv_config const &cfg)
Definition: vector.hpp:82
void generate_avbv_impl2(StringType &source, std::string const &, avbv_config const &cfg, bool mult_alpha, bool mult_beta)
Definition: vector.hpp:43
void generate_inner_prod_sum(StringType &source, std::string const &numeric_string)
Definition: vector.hpp:448
Representation of an OpenCL kernel in ViennaCL.
void generate_norm(StringType &source, std::string const &numeric_string)
Definition: vector.hpp:356
static std::string program_name()
Definition: vector.hpp:648
static void init(viennacl::ocl::context &ctx)
Definition: vector.hpp:607
Helper class for converting a type to its string representation.
Definition: utils.hpp:57
static void init(viennacl::ocl::context &ctx)
Definition: vector.hpp:653
Main kernel class for generating OpenCL kernels for operations on/with viennacl::vector<> without inv...
Definition: vector.hpp:600