ViennaCL - The Vienna Computing Library  1.5.2
matrix_operations.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2014, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the PDF manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
25 #include "viennacl/forwards.h"
26 #include "viennacl/scalar.hpp"
27 #include "viennacl/vector.hpp"
29 #include "viennacl/tools/tools.hpp"
33 #include "viennacl/traits/size.hpp"
37 
39 
45 
46 namespace viennacl
47 {
48  namespace linalg
49  {
50  namespace cuda
51  {
52  //
53  // Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
54  //
55 
56  template <typename NumericT, typename F,
57  typename ScalarType1>
59  matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
60  {
61  typedef NumericT value_type;
62 
63  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
64 
65  value_type temporary_alpha = 0;
67  temporary_alpha = alpha;
68 
70  {
71  am_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
72  static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
73  static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
74  static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
75  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
76 
77  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
78  options_alpha,
79  detail::cuda_arg<value_type>(mat2),
80  static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
81  static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
82  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
83  );
84  VIENNACL_CUDA_LAST_ERROR_CHECK("am_row_kernel");
85  }
86  else
87  {
88  am_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
89  static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
90  static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
91  static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
92  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
93 
94  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
95  options_alpha,
96  detail::cuda_arg<value_type>(mat2),
97  static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
98  static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
99  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
100  );
101  VIENNACL_CUDA_LAST_ERROR_CHECK("am_col_kernel");
102  }
103  }
104 
105 
106  template <typename NumericT, typename F,
107  typename ScalarType1, typename ScalarType2>
109  matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
110  matrix_base<NumericT, F> const & mat3, ScalarType2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
111  {
112  typedef NumericT value_type;
113 
114  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
115 
116  value_type temporary_alpha = 0;
118  temporary_alpha = alpha;
119 
120 
121  unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
122 
123  value_type temporary_beta = 0;
125  temporary_beta = beta;
126 
127 
129  {
130  ambm_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
131  static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
132  static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
133  static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
134  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
135 
136  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
137  options_alpha,
138  detail::cuda_arg<value_type>(mat2),
139  static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
140  static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
141  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
142 
143  detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
144  options_beta,
145  detail::cuda_arg<value_type>(mat3),
146  static_cast<unsigned int>(viennacl::traits::start1(mat3)), static_cast<unsigned int>(viennacl::traits::start2(mat3)),
147  static_cast<unsigned int>(viennacl::traits::stride1(mat3)), static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
148  static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
149  );
150  VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_row_kernel");
151  }
152  else
153  {
154  ambm_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
155  static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
156  static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
157  static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
158  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
159 
160  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
161  options_alpha,
162  detail::cuda_arg<value_type>(mat2),
163  static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
164  static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
165  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
166 
167  detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
168  options_beta,
169  detail::cuda_arg<value_type>(mat3),
170  static_cast<unsigned int>(viennacl::traits::start1(mat3)), static_cast<unsigned int>(viennacl::traits::start2(mat3)),
171  static_cast<unsigned int>(viennacl::traits::stride1(mat3)), static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
172  static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
173  );
174  VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_col_kernel");
175  }
176 
177  }
178 
179 
180  template <typename NumericT, typename F,
181  typename ScalarType1, typename ScalarType2>
183  matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
184  matrix_base<NumericT, F> const & mat3, ScalarType2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
185  {
186  typedef NumericT value_type;
187 
188  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
189 
190  value_type temporary_alpha = 0;
192  temporary_alpha = alpha;
193 
194 
195  unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
196 
197  value_type temporary_beta = 0;
199  temporary_beta = beta;
200 
201 
203  {
204  ambm_m_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
205  static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
206  static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
207  static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
208  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
209 
210  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
211  options_alpha,
212  detail::cuda_arg<value_type>(mat2),
213  static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
214  static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
215  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
216 
217  detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
218  options_beta,
219  detail::cuda_arg<value_type>(mat3),
220  static_cast<unsigned int>(viennacl::traits::start1(mat3)), static_cast<unsigned int>(viennacl::traits::start2(mat3)),
221  static_cast<unsigned int>(viennacl::traits::stride1(mat3)), static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
222  static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
223  );
224  VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_m_row_kernel");
225  }
226  else
227  {
228  ambm_m_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
229  static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
230  static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
231  static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
232  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
233 
234  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
235  options_alpha,
236  detail::cuda_arg<value_type>(mat2),
237  static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
238  static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
239  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
240 
241  detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
242  options_beta,
243  detail::cuda_arg<value_type>(mat3),
244  static_cast<unsigned int>(viennacl::traits::start1(mat3)), static_cast<unsigned int>(viennacl::traits::start2(mat3)),
245  static_cast<unsigned int>(viennacl::traits::stride1(mat3)), static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
246  static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
247  );
248  VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_m_col_kernel");
249  }
250 
251  }
252 
253 
254 
255 
256  template <typename NumericT, typename F>
257  void matrix_assign(matrix_base<NumericT, F> & mat, NumericT s, bool clear = false)
258  {
259  typedef NumericT value_type;
260  value_type alpha = s;
261 
262  unsigned int s1 = clear ? viennacl::traits::internal_size1(mat) : viennacl::traits::size1(mat);
263  unsigned int s2 = clear ? viennacl::traits::internal_size2(mat) : viennacl::traits::size2(mat);
264 
266  {
267 
268  matrix_row_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
269  static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
270  static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
271  s1, s2,
272  static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
273  alpha);
274  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_assign_kernel");
275  }
276  else
277  {
278  matrix_col_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
279  static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
280  static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
281  s1, s2,
282  static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
283  alpha);
284  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_assign_kernel");
285  }
286  }
287 
288  template <typename NumericT, typename F>
290  {
291  typedef NumericT value_type;
292  value_type alpha = s;
293 
295  {
296  matrix_row_diagonal_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
297  static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
298  static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
299  static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)),
300  static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
301  alpha);
302  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_diagonal_assign_kernel");
303  }
304  else
305  {
306  matrix_col_diagonal_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
307  static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
308  static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
309  static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)),
310  static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
311  alpha);
312  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_diagonal_assign_kernel");
313  }
314  }
315 
316 
317  template <typename NumericT, typename F>
319  {
320  typedef NumericT value_type;
321 
322  // Step 1: assign zero matrix:
323  matrix_assign(mat, NumericT(0));
324 
325  // Step 2: Assign diagonal:
326  unsigned int options_alpha = 0;
327 
328  vcl_size_t mat_start = 0;
329  vcl_size_t mat_stride = 0;
330  vcl_size_t mat_size = viennacl::traits::size(vec);
332  {
333  vcl_size_t first_row_index = 0;
334  vcl_size_t first_col_index = 0;
335  if (k < 0)
336  first_row_index = vcl_size_t(-k);
337  else
338  first_col_index = vcl_size_t(k);
339  mat_start = (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
340  + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat);
342  }
343  else
344  {
345  vcl_size_t first_row_index = 0;
346  vcl_size_t first_col_index = 0;
347  if (k < 0)
348  first_row_index = vcl_size_t(-k);
349  else
350  first_col_index = vcl_size_t(k);
351  mat_start = viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
354  }
355 
356  av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
357  static_cast<unsigned int>(mat_start),
358  static_cast<unsigned int>(mat_stride),
359  static_cast<unsigned int>(mat_size),
360 
361  detail::cuda_arg<value_type>(NumericT(1)),
362  options_alpha,
363  detail::cuda_arg<value_type>(vec),
364  static_cast<unsigned int>(viennacl::traits::start(vec)),
365  static_cast<unsigned int>(viennacl::traits::stride(vec)) );
366  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
367  }
368 
369  template <typename NumericT, typename F>
371  {
372  typedef NumericT value_type;
373 
374  unsigned int options_alpha = 0;
375 
376  vcl_size_t mat_start = 0;
377  vcl_size_t mat_stride = 0;
379  {
380  vcl_size_t first_row_index = 0;
381  vcl_size_t first_col_index = 0;
382  if (k < 0)
383  first_row_index = vcl_size_t(-k);
384  else
385  first_col_index = vcl_size_t(k);
386  mat_start = (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
387  + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat);
389  }
390  else
391  {
392  vcl_size_t first_row_index = 0;
393  vcl_size_t first_col_index = 0;
394  if (k < 0)
395  first_row_index = vcl_size_t(-k);
396  else
397  first_col_index = vcl_size_t(k);
398  mat_start = viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
401  }
402 
403  av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec),
404  static_cast<unsigned int>(viennacl::traits::start(vec)),
405  static_cast<unsigned int>(viennacl::traits::stride(vec)),
406  static_cast<unsigned int>(viennacl::traits::size(vec)),
407 
408  detail::cuda_arg<value_type>(NumericT(1)),
409  options_alpha,
410  detail::cuda_arg<value_type>(mat),
411  static_cast<unsigned int>(mat_start),
412  static_cast<unsigned int>(mat_stride));
413  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
414  }
415 
416  template <typename NumericT, typename F>
417  void matrix_row(const matrix_base<NumericT, F> & mat, unsigned int i, vector_base<NumericT> & vec)
418  {
419  typedef NumericT value_type;
420 
421  unsigned int options_alpha = 0;
422 
423  vcl_size_t mat_start = 0;
424  vcl_size_t mat_stride = 0;
426  {
428  mat_stride = viennacl::traits::stride2(mat);
429  }
430  else
431  {
434  }
435 
436  av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec),
437  static_cast<unsigned int>(viennacl::traits::start(vec)),
438  static_cast<unsigned int>(viennacl::traits::stride(vec)),
439  static_cast<unsigned int>(viennacl::traits::size(vec)),
440 
441  detail::cuda_arg<value_type>(NumericT(1)),
442  options_alpha,
443  detail::cuda_arg<value_type>(mat),
444  static_cast<unsigned int>(mat_start),
445  static_cast<unsigned int>(mat_stride));
446  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
447  }
448 
449  template <typename NumericT, typename F>
450  void matrix_column(const matrix_base<NumericT, F> & mat, unsigned int j, vector_base<NumericT> & vec)
451  {
452  typedef NumericT value_type;
453 
454  unsigned int options_alpha = 0;
455 
456  vcl_size_t mat_start = 0;
457  vcl_size_t mat_stride = 0;
459  {
462  }
463  else
464  {
466  mat_stride = viennacl::traits::stride2(mat);
467  }
468 
469  av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec),
470  static_cast<unsigned int>(viennacl::traits::start(vec)),
471  static_cast<unsigned int>(viennacl::traits::stride(vec)),
472  static_cast<unsigned int>(viennacl::traits::size(vec)),
473 
474  detail::cuda_arg<value_type>(NumericT(1)),
475  options_alpha,
476  detail::cuda_arg<value_type>(mat),
477  static_cast<unsigned int>(mat_start),
478  static_cast<unsigned int>(mat_stride));
479  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
480  }
481 
482 
483  //
485  //
486 
487 
488  template <typename T, typename F, typename OP>
491  {
492  typedef T value_type;
493 
494  unsigned int op_type = 2; //0: product, 1: division, 2: power
496  op_type = 1;
498  op_type = 0;
499 
501  {
502  element_op_int_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
503  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
504  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
505  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
506  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
507 
508  detail::cuda_arg<value_type>(proxy.lhs()),
509  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
510  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
511  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
512 
513  detail::cuda_arg<value_type>(proxy.rhs()),
514  static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
515  static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
516  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
517 
518  op_type
519  );
520  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
521  }
522  else
523  {
524  element_op_int_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
525  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
526  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
527  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
528  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
529 
530  detail::cuda_arg<value_type>(proxy.lhs()),
531  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
532  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
533  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
534 
535  detail::cuda_arg<value_type>(proxy.rhs()),
536  static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
537  static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
538  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
539 
540  op_type
541  );
542  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
543  }
544  }
545 
546  template <typename F, typename OP>
549  {
550  typedef float value_type;
551 
552  unsigned int op_type = 2; //0: product, 1: division, 2: power
554  op_type = 1;
556  op_type = 0;
557 
559  {
560  element_op_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
561  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
562  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
563  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
564  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
565 
566  detail::cuda_arg<value_type>(proxy.lhs()),
567  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
568  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
569  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
570 
571  detail::cuda_arg<value_type>(proxy.rhs()),
572  static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
573  static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
574  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
575 
576  op_type
577  );
578  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
579  }
580  else
581  {
582  element_op_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
583  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
584  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
585  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
586  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
587 
588  detail::cuda_arg<value_type>(proxy.lhs()),
589  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
590  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
591  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
592 
593  detail::cuda_arg<value_type>(proxy.rhs()),
594  static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
595  static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
596  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
597 
598  op_type
599  );
600  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
601  }
602  }
603 
604  template <typename F, typename OP>
607  {
608  typedef double value_type;
609 
610  unsigned int op_type = 2; //0: product, 1: division, 2: power
612  op_type = 1;
614  op_type = 0;
615 
617  {
618  element_op_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
619  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
620  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
621  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
622  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
623 
624  detail::cuda_arg<value_type>(proxy.lhs()),
625  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
626  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
627  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
628 
629  detail::cuda_arg<value_type>(proxy.rhs()),
630  static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
631  static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
632  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
633 
634  op_type
635  );
636  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
637  }
638  else
639  {
640  element_op_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
641  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
642  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
643  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
644  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
645 
646  detail::cuda_arg<value_type>(proxy.lhs()),
647  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
648  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
649  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
650 
651  detail::cuda_arg<value_type>(proxy.rhs()),
652  static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
653  static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
654  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
655 
656  op_type
657  );
658  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
659  }
660  }
661 
662  //
664  //
665 
666  // Note: Due to CUDA vs C-proprocessor interference (concatenation seems to be broken in at least CUDA 4.2),
667  // we could not find a more 'automatic' way of generating the overloads below...
668 
669  // abs
670  template <typename T, typename F>
673  {
674  typedef T value_type;
675 
677  {
678  matrix_row_element_abs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
679  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
680  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
681  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
682  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
683 
684  detail::cuda_arg<value_type>(proxy.lhs()),
685  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
686  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
687  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
688  );
689  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_abs_kernel");
690  }
691  else
692  {
693  matrix_col_element_abs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
694  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
695  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
696  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
697  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
698 
699  detail::cuda_arg<value_type>(proxy.lhs()),
700  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
701  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
702  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
703  );
704  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_abs_kernel");
705  }
706  }
707 
708 
709  // acos
710  template <typename T, typename F>
713  {
714  typedef T value_type;
715 
717  {
718  matrix_row_element_acos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
719  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
720  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
721  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
722  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
723 
724  detail::cuda_arg<value_type>(proxy.lhs()),
725  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
726  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
727  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
728  );
729  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_acos_kernel");
730  }
731  else
732  {
733  matrix_col_element_acos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
734  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
735  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
736  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
737  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
738 
739  detail::cuda_arg<value_type>(proxy.lhs()),
740  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
741  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
742  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
743  );
744  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_acos_kernel");
745  }
746  }
747 
748 
749  // asin
750  template <typename T, typename F>
753  {
754  typedef T value_type;
755 
757  {
758  matrix_row_element_asin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
759  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
760  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
761  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
762  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
763 
764  detail::cuda_arg<value_type>(proxy.lhs()),
765  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
766  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
767  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
768  );
769  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_asin_kernel");
770  }
771  else
772  {
773  matrix_col_element_asin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
774  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
775  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
776  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
777  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
778 
779  detail::cuda_arg<value_type>(proxy.lhs()),
780  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
781  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
782  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
783  );
784  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sin_kernel");
785  }
786  }
787 
788 
789  // atan
790  template <typename T, typename F>
793  {
794  typedef T value_type;
795 
797  {
798  matrix_row_element_atan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
799  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
800  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
801  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
802  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
803 
804  detail::cuda_arg<value_type>(proxy.lhs()),
805  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
806  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
807  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
808  );
809  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_atan_kernel");
810  }
811  else
812  {
813  matrix_col_element_atan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
814  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
815  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
816  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
817  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
818 
819  detail::cuda_arg<value_type>(proxy.lhs()),
820  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
821  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
822  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
823  );
824  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_atan_kernel");
825  }
826  }
827 
828 
829  // ceil
830  template <typename T, typename F>
833  {
834  typedef T value_type;
835 
837  {
838  matrix_row_element_ceil_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
839  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
840  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
841  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
842  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
843 
844  detail::cuda_arg<value_type>(proxy.lhs()),
845  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
846  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
847  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
848  );
849  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_ceil_kernel");
850  }
851  else
852  {
853  matrix_col_element_ceil_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
854  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
855  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
856  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
857  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
858 
859  detail::cuda_arg<value_type>(proxy.lhs()),
860  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
861  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
862  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
863  );
864  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_ceil_kernel");
865  }
866  }
867 
868 
869  // cos
870  template <typename T, typename F>
873  {
874  typedef T value_type;
875 
877  {
878  matrix_row_element_cos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
879  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
880  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
881  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
882  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
883 
884  detail::cuda_arg<value_type>(proxy.lhs()),
885  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
886  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
887  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
888  );
889  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_cos_kernel");
890  }
891  else
892  {
893  matrix_col_element_cos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
894  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
895  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
896  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
897  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
898 
899  detail::cuda_arg<value_type>(proxy.lhs()),
900  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
901  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
902  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
903  );
904  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_cos_kernel");
905  }
906  }
907 
908 
909  // cosh
910  template <typename T, typename F>
913  {
914  typedef T value_type;
915 
917  {
918  matrix_row_element_cosh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
919  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
920  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
921  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
922  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
923 
924  detail::cuda_arg<value_type>(proxy.lhs()),
925  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
926  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
927  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
928  );
929  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_cosh_kernel");
930  }
931  else
932  {
933  matrix_col_element_cosh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
934  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
935  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
936  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
937  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
938 
939  detail::cuda_arg<value_type>(proxy.lhs()),
940  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
941  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
942  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
943  );
944  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_cosh_kernel");
945  }
946  }
947 
948 
949  // exp
950  template <typename T, typename F>
953  {
954  typedef T value_type;
955 
957  {
958  matrix_row_element_exp_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
959  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
960  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
961  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
962  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
963 
964  detail::cuda_arg<value_type>(proxy.lhs()),
965  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
966  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
967  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
968  );
969  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_exp_kernel");
970  }
971  else
972  {
973  matrix_col_element_exp_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
974  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
975  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
976  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
977  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
978 
979  detail::cuda_arg<value_type>(proxy.lhs()),
980  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
981  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
982  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
983  );
984  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_exp_kernel");
985  }
986  }
987 
988 
989  // fabs
990  template <typename T, typename F>
993  {
994  typedef T value_type;
995 
997  {
998  matrix_row_element_fabs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
999  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1000  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1001  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1002  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1003 
1004  detail::cuda_arg<value_type>(proxy.lhs()),
1005  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1006  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1007  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1008  );
1009  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_fabs_kernel");
1010  }
1011  else
1012  {
1013  matrix_col_element_fabs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1014  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1015  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1016  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1017  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1018 
1019  detail::cuda_arg<value_type>(proxy.lhs()),
1020  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1021  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1022  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1023  );
1024  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_fabs_kernel");
1025  }
1026  }
1027 
1028 
1029  // floor
1030  template <typename T, typename F>
1033  {
1034  typedef T value_type;
1035 
1037  {
1038  matrix_row_element_floor_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1039  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1040  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1041  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1042  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1043 
1044  detail::cuda_arg<value_type>(proxy.lhs()),
1045  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1046  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1047  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1048  );
1049  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_floor_kernel");
1050  }
1051  else
1052  {
1053  matrix_col_element_floor_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1054  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1055  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1056  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1057  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1058 
1059  detail::cuda_arg<value_type>(proxy.lhs()),
1060  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1061  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1062  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1063  );
1064  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_floor_kernel");
1065  }
1066  }
1067 
1068 
1069  // log
1070  template <typename T, typename F>
1073  {
1074  typedef T value_type;
1075 
1077  {
1078  matrix_row_element_log_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1079  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1080  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1081  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1082  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1083 
1084  detail::cuda_arg<value_type>(proxy.lhs()),
1085  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1086  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1087  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1088  );
1089  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_log_kernel");
1090  }
1091  else
1092  {
1093  matrix_col_element_log_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1094  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1095  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1096  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1097  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1098 
1099  detail::cuda_arg<value_type>(proxy.lhs()),
1100  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1101  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1102  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1103  );
1104  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_log_kernel");
1105  }
1106  }
1107 
1108 
1109  // log10
1110  template <typename T, typename F>
1113  {
1114  typedef T value_type;
1115 
1117  {
1118  matrix_row_element_log10_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1119  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1120  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1121  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1122  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1123 
1124  detail::cuda_arg<value_type>(proxy.lhs()),
1125  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1126  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1127  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1128  );
1129  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_log10_kernel");
1130  }
1131  else
1132  {
1133  matrix_col_element_log10_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1134  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1135  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1136  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1137  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1138 
1139  detail::cuda_arg<value_type>(proxy.lhs()),
1140  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1141  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1142  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1143  );
1144  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_log10_kernel");
1145  }
1146  }
1147 
1148 
1149  // sin
1150  template <typename T, typename F>
1153  {
1154  typedef T value_type;
1155 
1157  {
1158  matrix_row_element_sin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1159  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1160  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1161  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1162  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1163 
1164  detail::cuda_arg<value_type>(proxy.lhs()),
1165  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1166  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1167  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1168  );
1169  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sin_kernel");
1170  }
1171  else
1172  {
1173  matrix_col_element_sin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1174  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1175  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1176  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1177  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1178 
1179  detail::cuda_arg<value_type>(proxy.lhs()),
1180  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1181  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1182  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1183  );
1184  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sin_kernel");
1185  }
1186  }
1187 
1188 
1189  // sinh
1190  template <typename T, typename F>
1193  {
1194  typedef T value_type;
1195 
1197  {
1198  matrix_row_element_sinh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1199  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1200  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1201  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1202  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1203 
1204  detail::cuda_arg<value_type>(proxy.lhs()),
1205  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1206  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1207  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1208  );
1209  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sinh_kernel");
1210  }
1211  else
1212  {
1213  matrix_col_element_sinh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1214  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1215  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1216  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1217  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1218 
1219  detail::cuda_arg<value_type>(proxy.lhs()),
1220  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1221  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1222  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1223  );
1224  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sinh_kernel");
1225  }
1226  }
1227 
1228 
1229  // sqrt
1230  template <typename T, typename F>
1233  {
1234  typedef T value_type;
1235 
1237  {
1238  matrix_row_element_sqrt_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1239  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1240  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1241  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1242  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1243 
1244  detail::cuda_arg<value_type>(proxy.lhs()),
1245  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1246  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1247  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1248  );
1249  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sqrt_kernel");
1250  }
1251  else
1252  {
1253  matrix_col_element_sqrt_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1254  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1255  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1256  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1257  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1258 
1259  detail::cuda_arg<value_type>(proxy.lhs()),
1260  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1261  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1262  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1263  );
1264  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sqrt_kernel");
1265  }
1266  }
1267 
1268 
1269  // tan
1270  template <typename T, typename F>
1273  {
1274  typedef T value_type;
1275 
1277  {
1278  matrix_row_element_tan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1279  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1280  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1281  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1282  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1283 
1284  detail::cuda_arg<value_type>(proxy.lhs()),
1285  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1286  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1287  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1288  );
1289  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_tan_kernel");
1290  }
1291  else
1292  {
1293  matrix_col_element_tan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1294  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1295  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1296  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1297  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1298 
1299  detail::cuda_arg<value_type>(proxy.lhs()),
1300  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1301  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1302  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1303  );
1304  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_tan_kernel");
1305  }
1306  }
1307 
1308 
1309  // tanh
1310  template <typename T, typename F>
1313  {
1314  typedef T value_type;
1315 
1317  {
1318  matrix_row_element_tanh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1319  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1320  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1321  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1322  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1323 
1324  detail::cuda_arg<value_type>(proxy.lhs()),
1325  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1326  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1327  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1328  );
1329  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_tanh_kernel");
1330  }
1331  else
1332  {
1333  matrix_col_element_tanh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1334  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1335  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1336  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1337  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1338 
1339  detail::cuda_arg<value_type>(proxy.lhs()),
1340  static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
1341  static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
1342  static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
1343  );
1344  VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_tanh_kernel");
1345  }
1346  }
1347 
1348 
1349  //
1351  //
1352 
1353  // A * x
1354 
1363  template <typename NumericT, typename F>
1365  const vector_base<NumericT> & vec,
1366  vector_base<NumericT> & result)
1367  {
1368  typedef NumericT value_type;
1369 
1370  assert(viennacl::traits::handle(vec) != viennacl::traits::handle(result) && bool("No direct inplace matrix-vector product possible. Introduce a temporary!"));
1371 
1373  {
1374  vec_mul_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
1375  static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
1376  static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
1377  static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)),
1378  static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
1379 
1380  detail::cuda_arg<value_type>(vec),
1381  static_cast<unsigned int>(viennacl::traits::start(vec)),
1382  static_cast<unsigned int>(viennacl::traits::stride(vec)),
1383  static_cast<unsigned int>(viennacl::traits::size(vec)),
1384 
1385  detail::cuda_arg<value_type>(result),
1386  static_cast<unsigned int>(viennacl::traits::start(result)),
1387  static_cast<unsigned int>(viennacl::traits::stride(result)),
1388  static_cast<unsigned int>(viennacl::traits::size(result))
1389  );
1390  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_mul_row_kernel");
1391  }
1392  else
1393  {
1394  vec_mul_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
1395  static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
1396  static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
1397  static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)),
1398  static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
1399 
1400  detail::cuda_arg<value_type>(vec),
1401  static_cast<unsigned int>(viennacl::traits::start(vec)),
1402  static_cast<unsigned int>(viennacl::traits::stride(vec)),
1403  static_cast<unsigned int>(viennacl::traits::size(vec)),
1404 
1405  detail::cuda_arg<value_type>(result),
1406  static_cast<unsigned int>(viennacl::traits::start(result)),
1407  static_cast<unsigned int>(viennacl::traits::stride(result)),
1408  static_cast<unsigned int>(viennacl::traits::size(result))
1409  );
1410  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_mul_col_kernel");
1411  }
1412  }
1413 
1414 
1415  // trans(A) * x
1416 
1425  template <typename NumericT, typename F>
1427  const vector_base<NumericT> & vec,
1428  vector_base<NumericT> & result)
1429  {
1430  assert( (viennacl::traits::size1(mat_trans) == viennacl::traits::size(result)) && bool("Size check failed for transposed matrix-vector product: size1(A^T) == size(result)"));
1431  assert( (viennacl::traits::size2(mat_trans) == viennacl::traits::size(vec)) && bool("Size check failed for transposed matrix-vector product: size2(A^T) == size(x)")); //remember: mat is transposed!
1432 
1433  typedef NumericT value_type;
1434 
1435 
1436  // Inplace matrix-vector products like x = prod(A, x) are currently illegal: Introduce a temporary like y = prod(A, x); x = y; instead
1437  assert(viennacl::traits::handle(vec) != viennacl::traits::handle(result) && bool("No direct inplace transposed matrix-vector product possible. Introduce a temporary!"));
1438 
1440  {
1441  trans_vec_mul_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat_trans.lhs()),
1442  static_cast<unsigned int>(viennacl::traits::start1(mat_trans.lhs())), static_cast<unsigned int>(viennacl::traits::start2(mat_trans.lhs())),
1443  static_cast<unsigned int>(viennacl::traits::stride1(mat_trans.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(mat_trans.lhs())),
1444  static_cast<unsigned int>(viennacl::traits::size1(mat_trans.lhs())), static_cast<unsigned int>(viennacl::traits::size2(mat_trans.lhs())),
1445  static_cast<unsigned int>(viennacl::traits::internal_size1(mat_trans.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(mat_trans.lhs())),
1446 
1447  detail::cuda_arg<value_type>(vec),
1448  static_cast<unsigned int>(viennacl::traits::start(vec)),
1449  static_cast<unsigned int>(viennacl::traits::stride(vec)),
1450  static_cast<unsigned int>(viennacl::traits::size(vec)),
1451 
1452  detail::cuda_arg<value_type>(result),
1453  static_cast<unsigned int>(viennacl::traits::start(result)),
1454  static_cast<unsigned int>(viennacl::traits::stride(result)),
1455  static_cast<unsigned int>(viennacl::traits::size(result))
1456  );
1457  VIENNACL_CUDA_LAST_ERROR_CHECK("trans_vec_mul_row_kernel");
1458  }
1459  else
1460  {
1461  trans_vec_mul_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat_trans.lhs()),
1462  static_cast<unsigned int>(viennacl::traits::start1(mat_trans.lhs())), static_cast<unsigned int>(viennacl::traits::start2(mat_trans.lhs())),
1463  static_cast<unsigned int>(viennacl::traits::stride1(mat_trans.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(mat_trans.lhs())),
1464  static_cast<unsigned int>(viennacl::traits::size1(mat_trans.lhs())), static_cast<unsigned int>(viennacl::traits::size2(mat_trans.lhs())),
1465  static_cast<unsigned int>(viennacl::traits::internal_size1(mat_trans.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(mat_trans.lhs())),
1466 
1467  detail::cuda_arg<value_type>(vec),
1468  static_cast<unsigned int>(viennacl::traits::start(vec)),
1469  static_cast<unsigned int>(viennacl::traits::stride(vec)),
1470  static_cast<unsigned int>(viennacl::traits::size(vec)),
1471 
1472  detail::cuda_arg<value_type>(result),
1473  static_cast<unsigned int>(viennacl::traits::start(result)),
1474  static_cast<unsigned int>(viennacl::traits::stride(result)),
1475  static_cast<unsigned int>(viennacl::traits::size(result))
1476  );
1477  VIENNACL_CUDA_LAST_ERROR_CHECK("trans_vec_mul_col_kernel");
1478  }
1479  }
1480 
1481 
1482  //
1484  //
1485 
1486  namespace detail
1487  {
1488  // C = A * B and possibly transposed variants
1489  template <typename T1, typename T2, typename T3, typename ScalarType >
1490  void prod_slow_kernel(const T1 & A, bool transposed_A,
1491  const T2 & B, bool transposed_B,
1492  T3 & C,
1493  ScalarType alpha,
1494  ScalarType beta)
1495  {
1497 
1498  cpu_value_type converted_alpha = static_cast<cpu_value_type>(alpha);
1499  cpu_value_type converted_beta = static_cast<cpu_value_type>(beta);
1500 
1501  dim3 threads(16, 16);
1502  dim3 grid( (viennacl::traits::size1(C) - 1) / 16 + 1,
1503  (viennacl::traits::size2(C) - 1) / 16 + 1);
1504 
1505  bool row_major_A = viennacl::is_row_major<T1>::value;
1506  bool row_major_B = viennacl::is_row_major<T2>::value;
1507  bool row_major_C = viennacl::is_row_major<T3>::value;
1508 
1509 
1510  if (!row_major_C && !row_major_A && !row_major_B && !transposed_A && !transposed_B)
1511  {
1512  matrix_matrix_col_col_col_prod_AA_kernel<<<grid, threads>>>
1513  (converted_alpha,
1514  detail::cuda_arg<cpu_value_type>(A),
1515  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1516  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1517  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1518  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1519 
1520  detail::cuda_arg<cpu_value_type>(B),
1521  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1522  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1523  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1524  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1525 
1526  converted_beta,
1527  detail::cuda_arg<cpu_value_type>(C),
1528  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1529  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1530  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1531  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1532  }
1533  else if (!row_major_C && !row_major_A && !row_major_B && !transposed_A && transposed_B)
1534  {
1535  matrix_matrix_col_col_col_prod_AT_kernel<<<grid, threads>>>
1536  (converted_alpha,
1537  detail::cuda_arg<cpu_value_type>(A),
1538  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1539  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1540  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1541  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1542 
1543  detail::cuda_arg<cpu_value_type>(B),
1544  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1545  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1546  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1547  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1548 
1549  converted_beta,
1550  detail::cuda_arg<cpu_value_type>(C),
1551  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1552  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1553  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1554  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1555  }
1556  else if (!row_major_C && !row_major_A && !row_major_B && transposed_A && !transposed_B)
1557  {
1558  matrix_matrix_col_col_col_prod_TA_kernel<<<grid, threads>>>
1559  (converted_alpha,
1560  detail::cuda_arg<cpu_value_type>(A),
1561  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1562  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1563  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1564  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1565 
1566  detail::cuda_arg<cpu_value_type>(B),
1567  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1568  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1569  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1570  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1571 
1572  converted_beta,
1573  detail::cuda_arg<cpu_value_type>(C),
1574  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1575  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1576  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1577  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1578  }
1579  else if (!row_major_C && !row_major_A && !row_major_B && transposed_A && transposed_B)
1580  {
1581  matrix_matrix_col_col_col_prod_TT_kernel<<<grid, threads>>>
1582  (converted_alpha,
1583  detail::cuda_arg<cpu_value_type>(A),
1584  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1585  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1586  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1587  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1588 
1589  detail::cuda_arg<cpu_value_type>(B),
1590  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1591  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1592  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1593  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1594 
1595  converted_beta,
1596  detail::cuda_arg<cpu_value_type>(C),
1597  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1598  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1599  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1600  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1601  }
1603 
1604  else if (!row_major_C && !row_major_A && row_major_B && !transposed_A && !transposed_B)
1605  {
1606  matrix_matrix_col_col_row_prod_AA_kernel<<<grid, threads>>>
1607  (converted_alpha,
1608  detail::cuda_arg<cpu_value_type>(A),
1609  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1610  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1611  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1612  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1613 
1614  detail::cuda_arg<cpu_value_type>(B),
1615  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1616  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1617  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1618  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1619 
1620  converted_beta,
1621  detail::cuda_arg<cpu_value_type>(C),
1622  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1623  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1624  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1625  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1626  }
1627  else if (!row_major_C && !row_major_A && row_major_B && !transposed_A && transposed_B)
1628  {
1629  matrix_matrix_col_col_row_prod_AT_kernel<<<grid, threads>>>
1630  (converted_alpha,
1631  detail::cuda_arg<cpu_value_type>(A),
1632  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1633  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1634  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1635  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1636 
1637  detail::cuda_arg<cpu_value_type>(B),
1638  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1639  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1640  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1641  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1642 
1643  converted_beta,
1644  detail::cuda_arg<cpu_value_type>(C),
1645  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1646  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1647  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1648  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1649  }
1650  else if (!row_major_C && !row_major_A && row_major_B && transposed_A && !transposed_B)
1651  {
1652  matrix_matrix_col_col_row_prod_TA_kernel<<<grid, threads>>>
1653  (converted_alpha,
1654  detail::cuda_arg<cpu_value_type>(A),
1655  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1656  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1657  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1658  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1659 
1660  detail::cuda_arg<cpu_value_type>(B),
1661  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1662  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1663  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1664  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1665 
1666  converted_beta,
1667  detail::cuda_arg<cpu_value_type>(C),
1668  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1669  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1670  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1671  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1672  }
1673  else if (!row_major_C && !row_major_A && row_major_B && transposed_A && transposed_B)
1674  {
1675  matrix_matrix_col_col_row_prod_TT_kernel<<<grid, threads>>>
1676  (converted_alpha,
1677  detail::cuda_arg<cpu_value_type>(A),
1678  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1679  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1680  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1681  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1682 
1683  detail::cuda_arg<cpu_value_type>(B),
1684  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1685  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1686  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1687  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1688 
1689  converted_beta,
1690  detail::cuda_arg<cpu_value_type>(C),
1691  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1692  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1693  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1694  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1695  }
1697 
1698  else if (!row_major_C && row_major_A && !row_major_B && !transposed_A && !transposed_B)
1699  {
1700  matrix_matrix_col_row_col_prod_AA_kernel<<<grid, threads>>>
1701  (converted_alpha,
1702  detail::cuda_arg<cpu_value_type>(A),
1703  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1704  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1705  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1706  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1707 
1708  detail::cuda_arg<cpu_value_type>(B),
1709  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1710  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1711  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1712  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1713 
1714  converted_beta,
1715  detail::cuda_arg<cpu_value_type>(C),
1716  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1717  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1718  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1719  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1720  }
1721  else if (!row_major_C && row_major_A && !row_major_B && !transposed_A && transposed_B)
1722  {
1723  matrix_matrix_col_row_col_prod_AT_kernel<<<grid, threads>>>
1724  (converted_alpha,
1725  detail::cuda_arg<cpu_value_type>(A),
1726  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1727  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1728  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1729  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1730 
1731  detail::cuda_arg<cpu_value_type>(B),
1732  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1733  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1734  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1735  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1736 
1737  converted_beta,
1738  detail::cuda_arg<cpu_value_type>(C),
1739  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1740  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1741  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1742  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1743  }
1744  else if (!row_major_C && row_major_A && !row_major_B && transposed_A && !transposed_B)
1745  {
1746  matrix_matrix_col_row_col_prod_TA_kernel<<<grid, threads>>>
1747  (converted_alpha,
1748  detail::cuda_arg<cpu_value_type>(A),
1749  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1750  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1751  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1752  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1753 
1754  detail::cuda_arg<cpu_value_type>(B),
1755  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1756  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1757  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1758  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1759 
1760  converted_beta,
1761  detail::cuda_arg<cpu_value_type>(C),
1762  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1763  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1764  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1765  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1766  }
1767  else if (!row_major_C && row_major_A && !row_major_B && transposed_A && transposed_B)
1768  {
1769  matrix_matrix_col_row_col_prod_TT_kernel<<<grid, threads>>>
1770  (converted_alpha,
1771  detail::cuda_arg<cpu_value_type>(A),
1772  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1773  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1774  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1775  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1776 
1777  detail::cuda_arg<cpu_value_type>(B),
1778  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1779  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1780  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1781  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1782 
1783  converted_beta,
1784  detail::cuda_arg<cpu_value_type>(C),
1785  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1786  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1787  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1788  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1789  }
1791 
1792  else if (!row_major_C && row_major_A && row_major_B && !transposed_A && !transposed_B)
1793  {
1794  matrix_matrix_col_row_row_prod_AA_kernel<<<grid, threads>>>
1795  (converted_alpha,
1796  detail::cuda_arg<cpu_value_type>(A),
1797  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1798  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1799  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1800  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1801 
1802  detail::cuda_arg<cpu_value_type>(B),
1803  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1804  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1805  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1806  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1807 
1808  converted_beta,
1809  detail::cuda_arg<cpu_value_type>(C),
1810  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1811  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1812  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1813  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1814  }
1815  else if (!row_major_C && row_major_A && row_major_B && !transposed_A && transposed_B)
1816  {
1817  matrix_matrix_col_row_row_prod_AT_kernel<<<grid, threads>>>
1818  (converted_alpha,
1819  detail::cuda_arg<cpu_value_type>(A),
1820  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1821  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1822  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1823  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1824 
1825  detail::cuda_arg<cpu_value_type>(B),
1826  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1827  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1828  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1829  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1830 
1831  converted_beta,
1832  detail::cuda_arg<cpu_value_type>(C),
1833  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1834  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1835  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1836  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1837  }
1838  else if (!row_major_C && row_major_A && row_major_B && transposed_A && !transposed_B)
1839  {
1840  matrix_matrix_col_row_row_prod_TA_kernel<<<grid, threads>>>
1841  (converted_alpha,
1842  detail::cuda_arg<cpu_value_type>(A),
1843  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1844  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1845  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1846  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1847 
1848  detail::cuda_arg<cpu_value_type>(B),
1849  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1850  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1851  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1852  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1853 
1854  converted_beta,
1855  detail::cuda_arg<cpu_value_type>(C),
1856  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1857  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1858  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1859  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1860  }
1861  else if (!row_major_C && row_major_A && row_major_B && transposed_A && transposed_B)
1862  {
1863  matrix_matrix_col_row_row_prod_TT_kernel<<<grid, threads>>>
1864  (converted_alpha,
1865  detail::cuda_arg<cpu_value_type>(A),
1866  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1867  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1868  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1869  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1870 
1871  detail::cuda_arg<cpu_value_type>(B),
1872  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1873  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1874  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1875  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1876 
1877  converted_beta,
1878  detail::cuda_arg<cpu_value_type>(C),
1879  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1880  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1881  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1882  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1883  }
1885 
1886  else if (row_major_C && !row_major_A && !row_major_B && !transposed_A && !transposed_B)
1887  {
1888  matrix_matrix_row_col_col_prod_AA_kernel<<<grid, threads>>>
1889  (converted_alpha,
1890  detail::cuda_arg<cpu_value_type>(A),
1891  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1892  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1893  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1894  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1895 
1896  detail::cuda_arg<cpu_value_type>(B),
1897  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1898  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1899  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1900  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1901 
1902  converted_beta,
1903  detail::cuda_arg<cpu_value_type>(C),
1904  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1905  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1906  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1907  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1908  }
1909  else if (row_major_C && !row_major_A && !row_major_B && !transposed_A && transposed_B)
1910  {
1911  matrix_matrix_row_col_col_prod_AT_kernel<<<grid, threads>>>
1912  (converted_alpha,
1913  detail::cuda_arg<cpu_value_type>(A),
1914  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1915  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1916  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1917  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1918 
1919  detail::cuda_arg<cpu_value_type>(B),
1920  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1921  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1922  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1923  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1924 
1925  converted_beta,
1926  detail::cuda_arg<cpu_value_type>(C),
1927  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1928  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1929  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1930  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1931  }
1932  else if (row_major_C && !row_major_A && !row_major_B && transposed_A && !transposed_B)
1933  {
1934  matrix_matrix_row_col_col_prod_TA_kernel<<<grid, threads>>>
1935  (converted_alpha,
1936  detail::cuda_arg<cpu_value_type>(A),
1937  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1938  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1939  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1940  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1941 
1942  detail::cuda_arg<cpu_value_type>(B),
1943  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1944  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1945  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1946  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1947 
1948  converted_beta,
1949  detail::cuda_arg<cpu_value_type>(C),
1950  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1951  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1952  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1953  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1954  }
1955  else if (row_major_C && !row_major_A && !row_major_B && transposed_A && transposed_B)
1956  {
1957  matrix_matrix_row_col_col_prod_TT_kernel<<<grid, threads>>>
1958  (converted_alpha,
1959  detail::cuda_arg<cpu_value_type>(A),
1960  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1961  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1962  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1963  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1964 
1965  detail::cuda_arg<cpu_value_type>(B),
1966  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1967  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1968  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1969  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1970 
1971  converted_beta,
1972  detail::cuda_arg<cpu_value_type>(C),
1973  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1974  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
1975  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
1976  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
1977  }
1979 
1980  else if (row_major_C && !row_major_A && row_major_B && !transposed_A && !transposed_B)
1981  {
1982  matrix_matrix_row_col_row_prod_AA_kernel<<<grid, threads>>>
1983  (converted_alpha,
1984  detail::cuda_arg<cpu_value_type>(A),
1985  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
1986  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
1987  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
1988  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
1989 
1990  detail::cuda_arg<cpu_value_type>(B),
1991  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
1992  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
1993  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
1994  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
1995 
1996  converted_beta,
1997  detail::cuda_arg<cpu_value_type>(C),
1998  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
1999  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2000  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2001  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2002  }
2003  else if (row_major_C && !row_major_A && row_major_B && !transposed_A && transposed_B)
2004  {
2005  matrix_matrix_row_col_row_prod_AT_kernel<<<grid, threads>>>
2006  (converted_alpha,
2007  detail::cuda_arg<cpu_value_type>(A),
2008  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2009  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2010  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2011  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2012 
2013  detail::cuda_arg<cpu_value_type>(B),
2014  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2015  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2016  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2017  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2018 
2019  converted_beta,
2020  detail::cuda_arg<cpu_value_type>(C),
2021  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2022  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2023  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2024  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2025  }
2026  else if (row_major_C && !row_major_A && row_major_B && transposed_A && !transposed_B)
2027  {
2028  matrix_matrix_row_col_row_prod_TA_kernel<<<grid, threads>>>
2029  (converted_alpha,
2030  detail::cuda_arg<cpu_value_type>(A),
2031  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2032  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2033  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2034  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2035 
2036  detail::cuda_arg<cpu_value_type>(B),
2037  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2038  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2039  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2040  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2041 
2042  converted_beta,
2043  detail::cuda_arg<cpu_value_type>(C),
2044  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2045  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2046  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2047  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2048  }
2049  else if (row_major_C && !row_major_A && row_major_B && transposed_A && transposed_B)
2050  {
2051  matrix_matrix_row_col_row_prod_TT_kernel<<<grid, threads>>>
2052  (converted_alpha,
2053  detail::cuda_arg<cpu_value_type>(A),
2054  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2055  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2056  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2057  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2058 
2059  detail::cuda_arg<cpu_value_type>(B),
2060  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2061  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2062  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2063  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2064 
2065  converted_beta,
2066  detail::cuda_arg<cpu_value_type>(C),
2067  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2068  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2069  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2070  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2071  }
2073 
2074  else if (row_major_C && row_major_A && !row_major_B && !transposed_A && !transposed_B)
2075  {
2076  matrix_matrix_row_row_col_prod_AA_kernel<<<grid, threads>>>
2077  (converted_alpha,
2078  detail::cuda_arg<cpu_value_type>(A),
2079  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2080  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2081  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2082  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2083 
2084  detail::cuda_arg<cpu_value_type>(B),
2085  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2086  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2087  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2088  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2089 
2090  converted_beta,
2091  detail::cuda_arg<cpu_value_type>(C),
2092  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2093  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2094  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2095  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2096  }
2097  else if (row_major_C && row_major_A && !row_major_B && !transposed_A && transposed_B)
2098  {
2099  matrix_matrix_row_row_col_prod_AT_kernel<<<grid, threads>>>
2100  (converted_alpha,
2101  detail::cuda_arg<cpu_value_type>(A),
2102  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2103  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2104  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2105  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2106 
2107  detail::cuda_arg<cpu_value_type>(B),
2108  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2109  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2110  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2111  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2112 
2113  converted_beta,
2114  detail::cuda_arg<cpu_value_type>(C),
2115  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2116  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2117  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2118  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2119  }
2120  else if (row_major_C && row_major_A && !row_major_B && transposed_A && !transposed_B)
2121  {
2122  matrix_matrix_row_row_col_prod_TA_kernel<<<grid, threads>>>
2123  (converted_alpha,
2124  detail::cuda_arg<cpu_value_type>(A),
2125  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2126  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2127  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2128  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2129 
2130  detail::cuda_arg<cpu_value_type>(B),
2131  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2132  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2133  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2134  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2135 
2136  converted_beta,
2137  detail::cuda_arg<cpu_value_type>(C),
2138  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2139  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2140  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2141  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2142  }
2143  else if (row_major_C && row_major_A && !row_major_B && transposed_A && transposed_B)
2144  {
2145  matrix_matrix_row_row_col_prod_TT_kernel<<<grid, threads>>>
2146  (converted_alpha,
2147  detail::cuda_arg<cpu_value_type>(A),
2148  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2149  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2150  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2151  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2152 
2153  detail::cuda_arg<cpu_value_type>(B),
2154  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2155  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2156  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2157  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2158 
2159  converted_beta,
2160  detail::cuda_arg<cpu_value_type>(C),
2161  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2162  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2163  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2164  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2165  }
2166 
2167 
2169 
2170  else if (row_major_C && row_major_A && row_major_B && !transposed_A && !transposed_B)
2171  {
2172  matrix_matrix_row_row_row_prod_AA_kernel<<<grid, threads>>>
2173  (converted_alpha,
2174  detail::cuda_arg<cpu_value_type>(A),
2175  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2176  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2177  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2178  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2179 
2180  detail::cuda_arg<cpu_value_type>(B),
2181  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2182  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2183  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2184  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2185 
2186  converted_beta,
2187  detail::cuda_arg<cpu_value_type>(C),
2188  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2189  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2190  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2191  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2192  }
2193  else if (row_major_C && row_major_A && row_major_B && !transposed_A && transposed_B)
2194  {
2195  matrix_matrix_row_row_row_prod_AT_kernel<<<grid, threads>>>
2196  (converted_alpha,
2197  detail::cuda_arg<cpu_value_type>(A),
2198  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2199  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2200  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2201  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2202 
2203  detail::cuda_arg<cpu_value_type>(B),
2204  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2205  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2206  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2207  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2208 
2209  converted_beta,
2210  detail::cuda_arg<cpu_value_type>(C),
2211  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2212  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2213  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2214  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2215  }
2216  else if (row_major_C && row_major_A && row_major_B && transposed_A && !transposed_B)
2217  {
2218  matrix_matrix_row_row_row_prod_TA_kernel<<<grid, threads>>>
2219  (converted_alpha,
2220  detail::cuda_arg<cpu_value_type>(A),
2221  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2222  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2223  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2224  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2225 
2226  detail::cuda_arg<cpu_value_type>(B),
2227  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2228  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2229  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2230  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2231 
2232  converted_beta,
2233  detail::cuda_arg<cpu_value_type>(C),
2234  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2235  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2236  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2237  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2238  }
2239  else if (row_major_C && row_major_A && row_major_B && transposed_A && transposed_B)
2240  {
2241  matrix_matrix_row_row_row_prod_TT_kernel<<<grid, threads>>>
2242  (converted_alpha,
2243  detail::cuda_arg<cpu_value_type>(A),
2244  static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
2245  static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
2246  static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
2247  static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
2248 
2249  detail::cuda_arg<cpu_value_type>(B),
2250  static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
2251  static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
2252  static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
2253  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
2254 
2255  converted_beta,
2256  detail::cuda_arg<cpu_value_type>(C),
2257  static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)),
2258  static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)),
2259  static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)),
2260  static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
2261  }
2262 
2263  }
2264 
2265  // C = A * B, using fast kernel
2266  template <typename T1, typename T2, typename T3, typename ScalarType >
2267  void prod_fast_kernel(const T1 & A,
2268  const T2 & B,
2269  T3 & C,
2270  ScalarType alpha,
2271  ScalarType beta,
2272  std::string kernel_name)
2273  {
2275 
2276  cpu_value_type cl_alpha = static_cast<cpu_value_type>(alpha);
2277  cpu_value_type cl_beta = static_cast<cpu_value_type>(beta);
2278 
2279  /*viennacl::ocl::enqueue(k(cl_alpha,
2280  viennacl::traits::opencl_handle(A),
2281  cl_uint(viennacl::traits::start1(A)), cl_uint(viennacl::traits::start2(A)),
2282  cl_uint(viennacl::traits::stride1(A)), cl_uint(viennacl::traits::stride2(A)),
2283  cl_uint(viennacl::traits::size1(A)), cl_uint(viennacl::traits::size2(A)),
2284  cl_uint(viennacl::traits::internal_size1(A)), cl_uint(viennacl::traits::internal_size2(A)),
2285 
2286  viennacl::traits::opencl_handle(B),
2287  cl_uint(viennacl::traits::start1(B)), cl_uint(viennacl::traits::start2(B)),
2288  cl_uint(viennacl::traits::stride1(B)), cl_uint(viennacl::traits::stride2(B)),
2289  cl_uint(viennacl::traits::size1(B)), cl_uint(viennacl::traits::size2(B)),
2290  cl_uint(viennacl::traits::internal_size1(B)), cl_uint(viennacl::traits::internal_size2(B)),
2291 
2292  cl_beta,
2293  viennacl::traits::opencl_handle(C),
2294  cl_uint(viennacl::traits::start1(C)), cl_uint(viennacl::traits::start2(C)),
2295  cl_uint(viennacl::traits::stride1(C)), cl_uint(viennacl::traits::stride2(C)),
2296  cl_uint(viennacl::traits::size1(C)), cl_uint(viennacl::traits::size2(C)),
2297  cl_uint(viennacl::traits::internal_size1(C)), cl_uint(viennacl::traits::internal_size2(C))
2298  )
2299  );*/
2300 
2301  throw "not implemented yet";
2302  }
2303 
2304  template <typename T1, typename T2, typename T3, typename ScalarType >
2305  void prod(const T1 & A, bool transposed_A,
2306  const T2 & B, bool transposed_B,
2307  T3 & C,
2308  ScalarType alpha,
2309  ScalarType beta)
2310  {
2311  if ( (viennacl::traits::size1(A) < 64)
2312  || (viennacl::traits::size2(A) < 64)
2313  || (viennacl::traits::size1(B) < 64) ) //there is most likely not enough to compute, rendering kernel launch overhead considerable
2314  {
2315  prod_slow_kernel(A, transposed_A,
2316  B, transposed_B,
2317  C, alpha, beta);
2318  }
2319  /*else if ( (viennacl::traits::size1(A) % 64 == 0)
2320  && (viennacl::traits::size2(A) % 64 == 0)
2321  && (viennacl::traits::size1(B) % 64 == 0) ) // allows the use of the fast kernel only
2322  {
2323  prod_fast_kernel(A, B, C, alpha, beta);
2324  //prod_slow_kernel(A, B, C, slow_kernel_name);
2325  }*/
2326  else //TODO: use four kernels
2327  {
2328  prod_slow_kernel(A, transposed_A,
2329  B, transposed_B,
2330  C, alpha, beta);
2331  }
2332 
2333  }
2334  } // namespace detail
2335 
2336 
2342  template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
2344  const matrix_base<NumericT, F2> & B,
2346  ScalarType alpha,
2347  ScalarType beta)
2348  {
2349  assert( (viennacl::traits::size1(A) == viennacl::traits::size1(C)) && bool("Size mismatch in C = prod(A, B): size1(A) != size1(C)"));
2350  assert( (viennacl::traits::size2(A) == viennacl::traits::size1(B)) && bool("Size mismatch in C = prod(A, B): size2(A) != size1(B)"));
2351  assert( (viennacl::traits::size2(B) == viennacl::traits::size2(C)) && bool("Size mismatch in C = prod(A, B): size2(B) != size2(C)"));
2352 
2353  // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
2354  /*assert( (viennacl::traits::handle(C) != viennacl::traits::handle(A))
2355  && (viennacl::traits::handle(C) != viennacl::traits::handle(B))
2356  && bool("No direct inplace matrix-matrix product possible. Introduce a temporary!"));*/
2357 
2358 
2359  detail::prod(A, false,
2360  B, false,
2361  C, alpha, beta);
2362  }
2363 
2364 
2365 
2371  template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
2374  op_trans> & A,
2375  const matrix_base<NumericT, F2> & B,
2377  ScalarType alpha,
2378  ScalarType beta)
2379  {
2380  //std::cout << "size2(A): " << viennacl::traits::size2(A.lhs()) << std::endl;
2381  //std::cout << "size1(C): " << viennacl::traits::size1(C) << std::endl;
2382  assert( (viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C)) && bool("Size mismatch in C = prod(trans(A), B): size2(A) != size1(C)"));
2383  assert( (viennacl::traits::size1(A.lhs()) == viennacl::traits::size1(B)) && bool("Size mismatch in C = prod(trans(A), B): size1(A) != size1(B)"));
2384  assert( (viennacl::traits::size2(B) == viennacl::traits::size2(C)) && bool("Size mismatch in C = prod(trans(A), B): size2(B) != size2(C)"));
2385 
2386  // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
2387  assert( (viennacl::traits::handle(C) != viennacl::traits::handle(A.lhs()))
2389  && bool("No direct inplace matrix-matrix product possible. Introduce a temporary!"));
2390 
2391  detail::prod(A.lhs(), true,
2392  B, false,
2393  C, alpha, beta);
2394  }
2395 
2396 
2397 
2398 
2404  template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
2408  ScalarType alpha,
2409  ScalarType beta)
2410  {
2411  assert( (viennacl::traits::size1(A) == viennacl::traits::size1(C)) && bool("Size mismatch in C = prod(A, trans(B)): size1(A) != size1(C)"));
2412  assert( (viennacl::traits::size2(A) == viennacl::traits::size2(B.lhs())) && bool("Size mismatch in C = prod(A, trans(B)): size2(A) != size2(B)"));
2413  assert( (viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C)) && bool("Size mismatch in C = prod(A, trans(B)): size1(B) != size2(C)"));
2414 
2415  // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
2416  detail::prod(A, false,
2417  B.lhs(), true,
2418  C, alpha, beta);
2419  }
2420 
2421 
2422 
2428  template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
2432  ScalarType alpha,
2433  ScalarType beta)
2434  {
2435  assert(viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C) && bool("Size mismatch in C = prod(trans(A), trans(B)): size2(A) != size1(C)"));
2436  assert(viennacl::traits::size1(A.lhs()) == viennacl::traits::size2(B.lhs()) && bool("Size mismatch in C = prod(trans(A), trans(B)): size1(A) != size2(B)"));
2437  assert(viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C) && bool("Size mismatch in C = prod(trans(A), trans(B)): size1(B) != size2(C)"));
2438 
2439  // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
2440  assert( (viennacl::traits::handle(C) != viennacl::traits::handle(A.lhs()))
2442  && bool("No direct inplace matrix-matrix product possible. Introduce a temporary!"));
2443 
2444  detail::prod(A.lhs(), true,
2445  B.lhs(), true,
2446  C, alpha, beta);
2447  }
2448 
2449 
2450 
2451 
2452  //
2454  //
2455 
2456 
2469  template <typename NumericT, typename F, typename S1>
2471  S1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
2472  const vector_base<NumericT> & vec1,
2473  const vector_base<NumericT> & vec2)
2474  {
2475  assert( (viennacl::traits::size1(mat1) == viennacl::traits::size(vec1)) && bool("Size mismatch in scaled_rank_1_update: size1(A) != size(v1)"));
2476  assert( (viennacl::traits::size2(mat1) == viennacl::traits::size(vec2)) && bool("Size mismatch in scaled_rank_1_update: size2(A) != size(v2)"));
2477 
2478  typedef NumericT value_type;
2479 
2480  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
2481 
2482  value_type temporary_alpha = 0;
2484  temporary_alpha = alpha;
2485 
2487  {
2488  scaled_rank1_update_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
2489  static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
2490  static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
2491  static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
2492  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
2493 
2494  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
2495  options_alpha,
2496 
2497  detail::cuda_arg<value_type>(vec1),
2498  static_cast<unsigned int>(viennacl::traits::start(vec1)),
2499  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
2500  static_cast<unsigned int>(viennacl::traits::size(vec1)),
2501 
2502  detail::cuda_arg<value_type>(vec2),
2503  static_cast<unsigned int>(viennacl::traits::start(vec2)),
2504  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
2505  static_cast<unsigned int>(viennacl::traits::size(vec2))
2506  );
2507  VIENNACL_CUDA_LAST_ERROR_CHECK("scaled_rank1_update_row_kernel");
2508  }
2509  else
2510  {
2511  scaled_rank1_update_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
2512  static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
2513  static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
2514  static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
2515  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
2516 
2517  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
2518  options_alpha,
2519 
2520  detail::cuda_arg<value_type>(vec1),
2521  static_cast<unsigned int>(viennacl::traits::start(vec1)),
2522  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
2523  static_cast<unsigned int>(viennacl::traits::size(vec1)),
2524 
2525  detail::cuda_arg<value_type>(vec2),
2526  static_cast<unsigned int>(viennacl::traits::start(vec2)),
2527  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
2528  static_cast<unsigned int>(viennacl::traits::size(vec2))
2529  );
2530  VIENNACL_CUDA_LAST_ERROR_CHECK("scaled_rank1_update_col_kernel");
2531  }
2532  }
2533 
2534  } // namespace opencl
2535  } //namespace linalg
2536 } //namespace viennacl
2537 
2538 
2539 #endif
std::size_t vcl_size_t
Definition: forwards.h:58
unsigned int make_options(vcl_size_t length, bool reciprocal, bool flip_sign)
Definition: common.hpp:37
result_of::size_type< matrix_base< NumericT, F > >::type stride2(matrix_base< NumericT, F > const &s)
Definition: stride.hpp:68
Generic size and resize functionality for different vector and matrix types.
Common routines for CUDA execution.
Helper class for checking whether a matrix has a row-major layout.
Definition: forwards.h:399
viennacl::backend::mem_handle::cuda_handle_type & arg_reference(viennacl::scalar< T > &s, U)
Definition: common.hpp:127
Extracts the underlying OpenCL start index handle from a vector, a matrix, an expression etc...
Various little tools used here and there in ViennaCL.
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:216
void matrix_diagonal_assign(matrix_base< NumericT, F > &mat, NumericT s)
Definition: matrix_operations.hpp:289
A dense matrix class.
Definition: forwards.h:290
void prod(const T1 &A, bool transposed_A, const T2 &B, bool transposed_B, T3 &C, ScalarType alpha, ScalarType beta)
Definition: matrix_operations.hpp:2305
Expression template class for representing a tree of expressions which ultimately result in a matrix...
Definition: forwards.h:283
Implementations of row-major dense matrix related operations, including matrix-vector products...
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:46
void clear(VectorType &vec)
Generic routine for setting all entries of a vector to zero. This is the version for non-ViennaCL obj...
Definition: clear.hpp:57
This file provides the forward declarations for the main types used within ViennaCL.
result_of::size_type< T >::type start1(T const &obj)
Definition: start.hpp:64
Determines row and column increments for matrices and matrix proxies.
void prod_fast_kernel(const T1 &A, const T2 &B, T3 &C, ScalarType alpha, ScalarType beta, std::string kernel_name)
Definition: matrix_operations.hpp:2267
Implementations of column-major dense matrix related operations, including matrix-vector products...
void matrix_diag_from_vector(const vector_base< NumericT > &vec, int k, matrix_base< NumericT, F > &mat)
Definition: matrix_operations.hpp:318
void matrix_assign(matrix_base< NumericT, F > &mat, NumericT s, bool clear=false)
Definition: matrix_operations.hpp:257
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
Definition: size.hpp:245
#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)
Definition: common.hpp:27
result_of::size_type< matrix_base< NumericT, F > >::type stride1(matrix_base< NumericT, F > const &s)
Definition: stride.hpp:57
void ambm_m(matrix_base< NumericT, F > &mat1, matrix_base< NumericT, F > const &mat2, ScalarType1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, matrix_base< NumericT, F > const &mat3, ScalarType2 const &beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
Definition: matrix_operations.hpp:182
void scaled_rank_1_update(matrix_base< NumericT, F > &mat1, S1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, const vector_base< NumericT > &vec1, const vector_base< NumericT > &vec2)
The implementation of the operation mat += alpha * vec1 * vec2^T, i.e. a scaled rank 1 update...
Definition: matrix_operations.hpp:2470
void element_op(matrix_base< T, F > &A, matrix_expression< const matrix_base< T, F >, const matrix_base< T, F >, op_element_binary< OP > > const &proxy)
Definition: matrix_operations.hpp:489
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
Definition: size.hpp:144
result_of::size_type< T >::type start2(T const &obj)
Definition: start.hpp:83
Helper struct for checking whether a type is a host scalar type (e.g. float, double) ...
Definition: forwards.h:363
result_of::size_type< T >::type start(T const &obj)
Definition: start.hpp:43
Common base class for dense vectors, vector ranges, and vector slices.
Definition: forwards.h:205
Dense matrix-matrix product CUDA kernels reside here.
Helper metafunction for checking whether the provided type is viennacl::op_div (for division) ...
Definition: predicate.hpp:448
vcl_size_t internal_size2(matrix_base< NumericT, F > const &mat)
Helper routine for obtaining the internal number of entries per column of a ViennaCL matrix...
Definition: size.hpp:287
Proxy classes for vectors.
All the predicates used within ViennaCL. Checks for expressions to be vectors, etc.
void matrix_diag_to_vector(const matrix_base< NumericT, F > &mat, int k, vector_base< NumericT > &vec)
Definition: matrix_operations.hpp:370
T::ERROR_CANNOT_DEDUCE_CPU_SCALAR_TYPE_FOR_T type
Definition: result_of.hpp:276
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
void prod_slow_kernel(const T1 &A, bool transposed_A, const T2 &B, bool transposed_B, T3 &C, ScalarType alpha, ScalarType beta)
Definition: matrix_operations.hpp:1490
A tag class representing transposed matrices.
Definition: forwards.h:165
A tag class representing element-wise binary operations (like multiplication) on vectors or matrices...
Definition: forwards.h:86
vcl_size_t internal_size1(matrix_base< NumericT, F > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
Definition: size.hpp:279
Extracts the underlying OpenCL handle from a vector, a matrix, an expression etc. ...
viennacl::backend::mem_handle & handle(T &obj)
Returns the generic memory handle of an object. Non-const version.
Definition: handle.hpp:41
Helper metafunction for checking whether the provided type is viennacl::op_prod (for products/multipl...
Definition: predicate.hpp:418
void am(matrix_base< NumericT, F > &mat1, matrix_base< NumericT, F > const &mat2, ScalarType1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
Definition: matrix_operations.hpp:58
void matrix_row(const matrix_base< NumericT, F > &mat, unsigned int i, vector_base< NumericT > &vec)
Definition: matrix_operations.hpp:417
A tag class representing element-wise unary operations (like sin()) on vectors or matrices...
Definition: forwards.h:90
Implementation of the ViennaCL scalar class.
Implementations of vector operations using a plain single-threaded execution on CPU.
void ambm(matrix_base< NumericT, F > &mat1, matrix_base< NumericT, F > const &mat2, ScalarType1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, matrix_base< NumericT, F > const &mat3, ScalarType2 const &beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
Definition: matrix_operations.hpp:108
A collection of compile time type deductions.
void matrix_column(const matrix_base< NumericT, F > &mat, unsigned int j, vector_base< NumericT > &vec)
Definition: matrix_operations.hpp:450
void prod_impl(const matrix_base< NumericT, F > &mat, const vector_base< NumericT > &vec, vector_base< NumericT > &result)
Carries out matrix-vector multiplication.
Definition: matrix_operations.hpp:1364
Simple enable-if variant that uses the SFINAE pattern.