ViennaCL - The Vienna Computing Library 1.5.2
sparse_matrix_operations.hpp
1 #ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2014, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the PDF manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
25 #include "viennacl/forwards.h"
26 #include "viennacl/scalar.hpp"
27 #include "viennacl/vector.hpp"
28 #include "viennacl/tools/tools.hpp"
29  #include "viennacl/linalg/cuda/common.hpp"
30 
31  #include "viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp"
32 
33 namespace viennacl
34 {
35  namespace linalg
36  {
37  namespace cuda
38  {
39  //
40  // Compressed matrix
41  //
42 
43  namespace detail
44  {
45 
46  template <typename T>
47  __global__ void csr_row_info_extractor_kernel(
48  const unsigned int * row_indices,
49  const unsigned int * column_indices,
50  const T * elements,
51  T * result,
52  unsigned int size,
53  unsigned int option)
54  {
55  for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
56  row < size;
57  row += gridDim.x * blockDim.x)
58  {
59  T value = 0;
60  unsigned int row_end = row_indices[row+1];
61 
62  switch (option)
63  {
64  case 0: //inf-norm
65  for (unsigned int i = row_indices[row]; i < row_end; ++i)
66  value = max(value, fabs(elements[i]));
67  break;
68 
69  case 1: //1-norm
70  for (unsigned int i = row_indices[row]; i < row_end; ++i)
71  value += fabs(elements[i]);
72  break;
73 
74  case 2: //2-norm
75  for (unsigned int i = row_indices[row]; i < row_end; ++i)
76  value += elements[i] * elements[i];
77  value = sqrt(value);
78  break;
79 
80  case 3: //diagonal entry
81  for (unsigned int i = row_indices[row]; i < row_end; ++i)
82  {
83  if (column_indices[i] == row)
84  {
85  value = elements[i];
86  break;
87  }
88  }
89  break;
90 
91  default:
92  break;
93  }
94  result[row] = value;
95  }
96  }
97 
98 
99  template<typename ScalarType, unsigned int MAT_ALIGNMENT>
100  void row_info(compressed_matrix<ScalarType, MAT_ALIGNMENT> const & mat,
101  vector_base<ScalarType> & vec,
102  viennacl::linalg::detail::row_info_types info_selector)
103  {
104  csr_row_info_extractor_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
105  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
106  detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
107  detail::cuda_arg<ScalarType>(vec),
108  static_cast<unsigned int>(mat.size1()),
109  static_cast<unsigned int>(info_selector)
110  );
111  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_row_info_extractor_kernel");
112  }
113 
114  } //namespace detail
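 
  // Usage note (sketch, not part of the original header): csr_row_info_extractor_kernel and
  // the row_info() wrapper above fill the output vector with one value per matrix row, chosen
  // by the option/info_selector argument: 0 = row-wise inf-norm, 1 = row-wise 1-norm,
  // 2 = row-wise 2-norm, 3 = diagonal entry. User code typically reaches this through a
  // preconditioner rather than calling it directly, roughly:
  //
  //   #include "viennacl/compressed_matrix.hpp"
  //   #include "viennacl/linalg/row_scaling.hpp"
  //
  //   viennacl::compressed_matrix<float> A(n, n);              // filled via viennacl::copy()
  //   viennacl::linalg::row_scaling< viennacl::compressed_matrix<float> >
  //       precond(A, viennacl::linalg::row_scaling_tag(2));    // scale rows by their 2-norms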
115 
116 
117  template <typename T>
118  __global__ void compressed_matrix_vec_mul_kernel(
119  const unsigned int * row_indices,
120  const unsigned int * column_indices,
121  const T * elements,
122  const T * x,
123  unsigned int start_x,
124  unsigned int inc_x,
125  T * result,
126  unsigned int start_result,
127  unsigned int inc_result,
128  unsigned int size_result)
129  {
130  for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
131  row < size_result;
132  row += gridDim.x * blockDim.x)
133  {
134  T dot_prod = (T)0;
135  unsigned int row_end = row_indices[row+1];
136  for (unsigned int i = row_indices[row]; i < row_end; ++i)
137  dot_prod += elements[i] * x[column_indices[i] * inc_x + start_x];
138  result[row * inc_result + start_result] = dot_prod;
139  }
140  }
141 
142 
143 
144 
145 
154  template<class ScalarType, unsigned int ALIGNMENT>
155  void prod_impl(const viennacl::compressed_matrix<ScalarType, ALIGNMENT> & mat,
156  const viennacl::vector_base<ScalarType> & vec,
157  viennacl::vector_base<ScalarType> & result)
158  {
159  compressed_matrix_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
160  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
161  detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
162  detail::cuda_arg<ScalarType>(vec),
163  static_cast<unsigned int>(vec.start()),
164  static_cast<unsigned int>(vec.stride()),
165  detail::cuda_arg<ScalarType>(result),
166  static_cast<unsigned int>(result.start()),
167  static_cast<unsigned int>(result.stride()),
168  static_cast<unsigned int>(result.size())
169  );
170  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_kernel");
171  }
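 
  // Example (sketch, not part of the original header): prod_impl() above is the CUDA backend
  // behind viennacl::linalg::prod() for compressed_matrix. Typical user code looks like:
  //
  //   #include "viennacl/compressed_matrix.hpp"
  //   #include "viennacl/vector.hpp"
  //   #include "viennacl/linalg/prod.hpp"
  //
  //   std::vector< std::map<unsigned int, float> > cpu_A(n);   // host-side assembly
  //   viennacl::compressed_matrix<float> A(n, n);
  //   viennacl::vector<float> x(n), y(n);
  //   viennacl::copy(cpu_A, A);                                // host -> device
  //   y = viennacl::linalg::prod(A, x);                        // dispatches to this prod_impl()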
172 
177  template <typename LayoutT>
178  struct mat_mult_matrix_index
179  {
180  static __device__ unsigned int apply(unsigned int i, unsigned int j,
181  unsigned int row_start, unsigned int row_inc,
182  unsigned int col_start, unsigned int col_inc,
183  unsigned int internal_rows, unsigned int internal_cols)
184  {
185  return (row_start + i * row_inc) * internal_cols + col_start + j * col_inc;
186  }
187  };
188 
190  template <>
191  struct mat_mult_matrix_index<viennacl::column_major>
192  {
193  static __device__ unsigned int apply(unsigned int i, unsigned int j,
194  unsigned int row_start, unsigned int row_inc,
195  unsigned int col_start, unsigned int col_inc,
196  unsigned int internal_rows, unsigned int internal_cols)
197  {
198  return (row_start + i * row_inc) + (col_start + j * col_inc) * internal_rows;
199  }
200  };
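 
  // Illustration (not part of the original header): mat_mult_matrix_index maps a logical
  // entry (i,j) of a strided sub-matrix onto a linear buffer offset. With row_start = 0,
  // col_start = 0, row_inc = col_inc = 1 and internal_cols = 8, the row-major version places
  // element (2,3) at offset 2*8 + 3 = 19, while the column-major specialization yields
  // 2 + 3*internal_rows.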
204  template <typename DMatIndexT, typename ResultIndexT, typename T>
205  __global__ void compressed_matrix_d_mat_mul_kernel(
206  const unsigned int * sp_mat_row_indices,
207  const unsigned int * sp_mat_col_indices,
208  const T * sp_mat_elements,
209  const T * d_mat,
210  unsigned int d_mat_row_start,
211  unsigned int d_mat_col_start,
212  unsigned int d_mat_row_inc,
213  unsigned int d_mat_col_inc,
214  unsigned int d_mat_row_size,
215  unsigned int d_mat_col_size,
216  unsigned int d_mat_internal_rows,
217  unsigned int d_mat_internal_cols,
218  T * result,
219  unsigned int result_row_start,
220  unsigned int result_col_start,
221  unsigned int result_row_inc,
222  unsigned int result_col_inc,
223  unsigned int result_row_size,
224  unsigned int result_col_size,
225  unsigned int result_internal_rows,
226  unsigned int result_internal_cols) {
227 
228  for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x) {
229 
230  unsigned int row_start = sp_mat_row_indices[row];
231  unsigned int row_end = sp_mat_row_indices[row+1];
232 
233  for ( unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x) {
234 
235  T r = 0;
236 
237  for (unsigned int k = row_start; k < row_end; k++) {
238 
239  unsigned int j = sp_mat_col_indices[k];
240  T x = sp_mat_elements[k];
241  T y = d_mat[ DMatIndexT::apply(j, col,
242  d_mat_row_start, d_mat_row_inc,
243  d_mat_col_start, d_mat_col_inc,
244  d_mat_internal_rows, d_mat_internal_cols) ];
245 
246  r += x * y;
247  }
248 
249  result [ ResultIndexT::apply(row, col,
250  result_row_start, result_row_inc,
251  result_col_start, result_col_inc,
252  result_internal_rows, result_internal_cols) ] = r;
253  }
254 
255  }
256 
257  }
258 
259 
268  template< typename TYPE, unsigned int ALIGNMENT, typename F1, typename F2>
269  void prod_impl(const viennacl::compressed_matrix<TYPE, ALIGNMENT> & sp_mat,
270  const viennacl::matrix_base<TYPE, F1> & d_mat,
271  viennacl::matrix_base<TYPE, F2> & result) {
272  compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<128, 128>>>
273  (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
274  detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
275  detail::cuda_arg<TYPE>(sp_mat.handle().cuda_handle()),
276 
277  detail::cuda_arg<TYPE>(d_mat),
278  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
279  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
280  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
281  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
282 
283  detail::cuda_arg<TYPE>(result),
284  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
285  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
286  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
287  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
288  );
289  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
290  }
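 
  // Example (sketch, not part of the original header): sparse-times-dense product C = A * B
  // with A a compressed_matrix and B, C dense matrices, again reached via
  // viennacl::linalg::prod():
  //
  //   viennacl::compressed_matrix<float> A(m, k);
  //   viennacl::matrix<float> B(k, n), C(m, n);
  //   C = viennacl::linalg::prod(A, B);   // launches compressed_matrix_d_mat_mul_kernel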
291 
292 
293  template <typename DMatIndexT, typename ResultIndexT, typename T>
294  __global__ void compressed_matrix_d_tr_mat_mul_kernel(
295  const unsigned int * sp_mat_row_indices,
296  const unsigned int * sp_mat_col_indices,
297  const T * sp_mat_elements,
298  const T * d_mat,
299  unsigned int d_mat_row_start,
300  unsigned int d_mat_col_start,
301  unsigned int d_mat_row_inc,
302  unsigned int d_mat_col_inc,
303  unsigned int d_mat_row_size,
304  unsigned int d_mat_col_size,
305  unsigned int d_mat_internal_rows,
306  unsigned int d_mat_internal_cols,
307  T * result,
308  unsigned int result_row_start,
309  unsigned int result_col_start,
310  unsigned int result_row_inc,
311  unsigned int result_col_inc,
312  unsigned int result_row_size,
313  unsigned int result_col_size,
314  unsigned int result_internal_rows,
315  unsigned int result_internal_cols) {
316 
317  for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x) {
318 
319  unsigned int row_start = sp_mat_row_indices[row];
320  unsigned int row_end = sp_mat_row_indices[row+1];
321 
322  for ( unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x) {
323 
324  T r = 0;
325 
326  for (unsigned int k = row_start; k < row_end; k++) {
327 
328  unsigned int j = sp_mat_col_indices[k];
329  T x = sp_mat_elements[k];
330  T y = d_mat[ DMatIndexT::apply(col, j,
331  d_mat_row_start, d_mat_row_inc,
332  d_mat_col_start, d_mat_col_inc,
333  d_mat_internal_rows, d_mat_internal_cols) ];
334 
335  r += x * y;
336  }
337 
338  result [ ResultIndexT::apply(row, col,
339  result_row_start, result_row_inc,
340  result_col_start, result_col_inc,
341  result_internal_rows, result_internal_cols) ] = r;
342  }
343  }
344 
345  }
346 
356  template< typename TYPE, unsigned int ALIGNMENT, typename F1, typename F2>
357  void prod_impl(const viennacl::compressed_matrix<TYPE, ALIGNMENT> & sp_mat,
358  const viennacl::matrix_expression< const viennacl::matrix_base<TYPE, F1>,
359  const viennacl::matrix_base<TYPE, F1>,
360  viennacl::op_trans > & d_mat,
361  viennacl::matrix_base<TYPE, F2> & result) {
362 
363  compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<128, 128>>>
364  (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
365  detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
366  detail::cuda_arg<TYPE>(sp_mat.handle().cuda_handle()),
367 
368  detail::cuda_arg<TYPE>(d_mat.lhs()),
369  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
370  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
371  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
372  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
373 
374  detail::cuda_arg<TYPE>(result),
375  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
376  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
377  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
378  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
379  );
380  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
381  }
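 
  // Usage note (sketch, not part of the original header): the variant above computes
  // C = A * B^T without materializing the transpose; it is selected by passing a transposed
  // dense matrix to the product, roughly:
  //
  //   C = viennacl::linalg::prod(A, viennacl::trans(B));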
382 
383 
384  //
385  // triangular solves for compressed_matrix
386  //
387 
388  template <typename T>
389  __global__ void compressed_matrix_diagonal_kernel(
390  const unsigned int * row_indices,
391  const unsigned int * column_indices,
392  const T * elements,
393  T * result,
394  unsigned int size)
395  {
396  for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
397  row < size;
398  row += gridDim.x * blockDim.x)
399  {
400  T diag = (T)0;
401  unsigned int row_end = row_indices[row+1];
402  for (unsigned int i = row_indices[row]; i < row_end; ++i)
403  {
404  unsigned int col_index = column_indices[i];
405  if (col_index == row)
406  {
407  diag = elements[i];
408  break;
409  }
410  }
411  result[row] = diag;
412  }
413  }
414 
415 
421  template<typename SparseMatrixType, class ScalarType>
422  typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
423  inplace_solve(const SparseMatrixType & mat,
424  viennacl::vector_base<ScalarType> & vec,
425  viennacl::linalg::unit_lower_tag)
426  {
427  csr_unit_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
428  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
429  detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
430  detail::cuda_arg<ScalarType>(vec),
431  static_cast<unsigned int>(mat.size1())
432  );
433  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_forward_kernel");
434  }
435 
436 
442  template<typename SparseMatrixType, class ScalarType>
443  typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
444  inplace_solve(const SparseMatrixType & mat,
445  viennacl::vector_base<ScalarType> & vec,
446  viennacl::linalg::lower_tag)
447  {
448  csr_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
449  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
450  detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
451  detail::cuda_arg<ScalarType>(vec),
452  static_cast<unsigned int>(mat.size1())
453  );
454  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_forward_kernel");
455  }
456 
457 
458 
464  template<typename SparseMatrixType, class ScalarType>
465  typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
466  inplace_solve(const SparseMatrixType & mat,
467  viennacl::vector_base<ScalarType> & vec,
468  viennacl::linalg::unit_upper_tag)
469  {
470  csr_unit_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
471  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
472  detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
473  detail::cuda_arg<ScalarType>(vec),
474  static_cast<unsigned int>(mat.size1())
475  );
476  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_backward_kernel");
477  }
478 
479 
485  template<typename SparseMatrixType, class ScalarType>
486  typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
487  inplace_solve(const SparseMatrixType & mat,
488  viennacl::vector_base<ScalarType> & vec,
489  viennacl::linalg::upper_tag)
490  {
491  csr_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
492  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
493  detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
494  detail::cuda_arg<ScalarType>(vec),
495  static_cast<unsigned int>(mat.size1())
496  );
497  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_backward_kernel");
498  }
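 
  // Example (sketch, not part of the original header): the four inplace_solve() overloads
  // above perform forward/backward substitution with a CSR matrix, with or without an
  // implicit unit diagonal. They are used by the ILU preconditioners and can be called
  // directly, roughly as:
  //
  //   viennacl::compressed_matrix<float> A(n, n);   // triangular CSR matrix
  //   viennacl::vector<float> x(n);                 // right-hand side, overwritten in place
  //   viennacl::linalg::inplace_solve(A, x, viennacl::linalg::lower_tag());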
499 
500 
501 
502  // transposed
503 
509  template<typename SparseMatrixType, class ScalarType>
510  typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
511  inplace_solve(matrix_expression<const SparseMatrixType, const SparseMatrixType, op_trans> const & mat,
512  viennacl::vector_base<ScalarType> & vec,
513  viennacl::linalg::unit_lower_tag)
514  {
515  csr_trans_unit_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
516  detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
517  detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()),
518  detail::cuda_arg<ScalarType>(vec),
519  static_cast<unsigned int>(mat.lhs().size1())
520  );
521  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_forward_kernel");
522  }
523 
524 
530  template<typename SparseMatrixType, class ScalarType>
531  typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
532  inplace_solve(matrix_expression<const SparseMatrixType, const SparseMatrixType, op_trans> const & mat,
533  viennacl::vector_base<ScalarType> & vec,
534  viennacl::linalg::lower_tag)
535  {
536  viennacl::vector<ScalarType> diagonal(vec.size());
537 
538  compressed_matrix_diagonal_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
539  detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
540  detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()),
541  detail::cuda_arg<ScalarType>(diagonal),
542  static_cast<unsigned int>(mat.size1())
543  );
544 
545  csr_trans_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
546  detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
547  detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()),
548  detail::cuda_arg<ScalarType>(diagonal),
549  detail::cuda_arg<ScalarType>(vec),
550  static_cast<unsigned int>(mat.lhs().size1())
551  );
552  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_forward_kernel");
553  }
554 
555 
561  template<typename SparseMatrixType, class ScalarType>
562  typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
563  inplace_solve(matrix_expression<const SparseMatrixType, const SparseMatrixType, op_trans> const & mat,
564  viennacl::vector_base<ScalarType> & vec,
565  viennacl::linalg::unit_upper_tag)
566  {
567  csr_trans_unit_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
568  detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
569  detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()),
570  detail::cuda_arg<ScalarType>(vec),
571  static_cast<unsigned int>(mat.lhs().size1())
572  );
573  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_backward_kernel");
574  }
575 
576 
582  template<typename SparseMatrixType, class ScalarType>
583  typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
584  inplace_solve(matrix_expression<const SparseMatrixType, const SparseMatrixType, op_trans> const & mat,
585  viennacl::vector_base<ScalarType> & vec,
586  viennacl::linalg::upper_tag)
587  {
588  viennacl::vector<ScalarType> diagonal(vec.size());
589 
590  compressed_matrix_diagonal_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
591  detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
592  detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()),
593  detail::cuda_arg<ScalarType>(diagonal),
594  static_cast<unsigned int>(mat.size1())
595  );
596 
597  csr_trans_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
598  detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
599  detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()),
600  detail::cuda_arg<ScalarType>(diagonal),
601  detail::cuda_arg<ScalarType>(vec),
602  static_cast<unsigned int>(mat.lhs().size1())
603  );
604  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_backward_kernel");
605  }
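 
  // Usage note (sketch, not part of the original header): the transposed overloads above solve
  // with A^T while the data stays in untransposed CSR storage; the non-unit variants first
  // extract the diagonal with compressed_matrix_diagonal_kernel. Roughly:
  //
  //   viennacl::linalg::inplace_solve(viennacl::trans(A), x, viennacl::linalg::upper_tag());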
606 
607  namespace detail
608  {
609  //
610  // block solves
611  //
612  template<typename ScalarType, unsigned int MAT_ALIGNMENT>
613  void block_inplace_solve(const matrix_expression<const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
614  const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
615  op_trans> & L,
616  viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
617  vector_base<ScalarType> const & /* L_diagonal */, //ignored
618  vector_base<ScalarType> & vec,
619  viennacl::linalg::unit_lower_tag)
620  {
621  csr_block_trans_unit_lu_forward<<<num_blocks, 128>>>(detail::cuda_arg<unsigned int>(L.lhs().handle1().cuda_handle()),
622  detail::cuda_arg<unsigned int>(L.lhs().handle2().cuda_handle()),
623  detail::cuda_arg<ScalarType>(L.lhs().handle().cuda_handle()),
624  detail::cuda_arg<unsigned int>(block_indices.cuda_handle()),
625  detail::cuda_arg<ScalarType>(vec),
626  static_cast<unsigned int>(L.lhs().size1())
627  );
628  }
629 
630 
631  template<typename ScalarType, unsigned int MAT_ALIGNMENT>
632  void block_inplace_solve(const matrix_expression<const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
633  const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
634  op_trans> & U,
635  viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
636  vector_base<ScalarType> const & U_diagonal,
637  vector_base<ScalarType> & vec,
638  viennacl::linalg::upper_tag)
639  {
640  csr_block_trans_lu_backward<<<num_blocks, 128>>>(detail::cuda_arg<unsigned int>(U.lhs().handle1().cuda_handle()),
641  detail::cuda_arg<unsigned int>(U.lhs().handle2().cuda_handle()),
642  detail::cuda_arg<ScalarType>(U.lhs().handle().cuda_handle()),
643  detail::cuda_arg<ScalarType>(U_diagonal.handle().cuda_handle()),
644  detail::cuda_arg<unsigned int>(block_indices.cuda_handle()),
645  detail::cuda_arg<ScalarType>(vec),
646  static_cast<unsigned int>(U.lhs().size1())
647  );
648  }
649 
650 
651  }
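 
  // Usage note (sketch, not part of the original header): block_inplace_solve() runs one
  // independent triangular solve per block (one CUDA block each) and is used by the
  // block-ILU preconditioners, e.g. roughly:
  //
  //   #include "viennacl/linalg/ilu.hpp"
  //   viennacl::linalg::block_ilu_precond< viennacl::compressed_matrix<float>,
  //                                        viennacl::linalg::ilu0_tag >
  //       precond(A, viennacl::linalg::ilu0_tag());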
652 
653 
654  //
655  // Compressed Compressed Matrix
656  //
657 
658  template <typename T>
659  __global__ void compressed_compressed_matrix_vec_mul_kernel(
660  const unsigned int * row_jumper,
661  const unsigned int * row_indices,
662  const unsigned int * column_indices,
663  const T * elements,
664  unsigned int nonzero_rows,
665  const T * x,
666  unsigned int start_x,
667  unsigned int inc_x,
668  T * result,
669  unsigned int start_result,
670  unsigned int inc_result,
671  unsigned int size_result)
672  {
673  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
674  i < size_result;
675  i += gridDim.x * blockDim.x)
676  {
677  result[i * inc_result + start_result] = 0;
678  }
679 
680  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
681  i < nonzero_rows;
682  i += gridDim.x * blockDim.x)
683  {
684  T dot_prod = (T)0;
685  unsigned int row_end = row_jumper[i+1];
686  for (unsigned int j = row_jumper[i]; j < row_end; ++j)
687  dot_prod += elements[j] * x[column_indices[j] * inc_x + start_x];
688  result[row_indices[i] * inc_result + start_result] = dot_prod;
689  }
690  }
691 
692 
701  template<class ScalarType>
702  void prod_impl(const viennacl::compressed_compressed_matrix<ScalarType> & mat,
703  const viennacl::vector_base<ScalarType> & vec,
704  viennacl::vector_base<ScalarType> & result)
705  {
706  compressed_compressed_matrix_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
707  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
708  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
709  detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
710  static_cast<unsigned int>(mat.nnz1()),
711  detail::cuda_arg<ScalarType>(vec),
712  static_cast<unsigned int>(vec.start()),
713  static_cast<unsigned int>(vec.stride()),
714  detail::cuda_arg<ScalarType>(result),
715  static_cast<unsigned int>(result.start()),
716  static_cast<unsigned int>(result.stride()),
717  static_cast<unsigned int>(result.size())
718  );
719  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_compressed_matrix_vec_mul_kernel");
720  }
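 
  // Example (sketch, not part of the original header): compressed_compressed_matrix stores
  // only the non-empty rows (handle3() holds their row indices), which is why the kernel
  // first zeros the full result vector. Usage mirrors the CSR case:
  //
  //   viennacl::compressed_compressed_matrix<float> A(n, n);
  //   viennacl::vector<float> x(n), y(n);
  //   y = viennacl::linalg::prod(A, x);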
721 
722  //
723  // Coordinate Matrix
724  //
725 
726 
727  namespace detail
728  {
729 
730  template <typename T>
731  __global__ void coo_row_info_extractor( const unsigned int * coords, //(row_index, column_index)
732  const T * elements,
733  const unsigned int * group_boundaries,
734  T * result,
735  unsigned int option)
736  {
737  __shared__ unsigned int shared_rows[128];
738  __shared__ T inter_results[128];
739 
740  uint2 tmp;
741  T val;
742  unsigned int last_index = blockDim.x - 1;
743  unsigned int group_start = group_boundaries[blockIdx.x];
744  unsigned int group_end = group_boundaries[blockIdx.x + 1];
745  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
746 
747  unsigned int local_index = 0;
748 
749  for (unsigned int k = 0; k < k_end; ++k)
750  {
751  local_index = group_start + k * blockDim.x + threadIdx.x;
752 
753  tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
754  val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0;
755 
756  //check for carry from previous loop run:
757  if (threadIdx.x == 0 && k > 0)
758  {
759  if (tmp.x == shared_rows[last_index])
760  {
761  switch (option)
762  {
763  case 0: //inf-norm
764  case 3: //diagonal entry
765  val = max(val, fabs(inter_results[last_index]));
766  break;
767 
768  case 1: //1-norm
769  val = fabs(val) + inter_results[last_index];
770  break;
771 
772  case 2: //2-norm
773  val = sqrt(val * val + inter_results[last_index]);
774  break;
775 
776  default:
777  break;
778  }
779  }
780  else
781  {
782  switch (option)
783  {
784  case 0: //inf-norm
785  case 1: //1-norm
786  case 3: //diagonal entry
787  result[shared_rows[last_index]] = inter_results[last_index];
788  break;
789 
790  case 2: //2-norm
791  result[shared_rows[last_index]] = sqrt(inter_results[last_index]);
792  default:
793  break;
794  }
795  }
796  }
797 
798  //segmented parallel reduction begin
799  __syncthreads();
800  shared_rows[threadIdx.x] = tmp.x;
801  switch (option)
802  {
803  case 0:
804  case 3:
805  inter_results[threadIdx.x] = val;
806  break;
807  case 1:
808  inter_results[threadIdx.x] = fabs(val);
809  break;
810  case 2:
811  inter_results[threadIdx.x] = val * val;
812  default:
813  break;
814  }
815  T left = 0;
816  __syncthreads();
817 
818  for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
819  {
820  left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
821  __syncthreads();
822  switch (option)
823  {
824  case 0: //inf-norm
825  case 3: //diagonal entry
826  inter_results[threadIdx.x] = max(inter_results[threadIdx.x], left);
827  break;
828 
829  case 1: //1-norm
830  inter_results[threadIdx.x] += left;
831  break;
832 
833  case 2: //2-norm
834  inter_results[threadIdx.x] += left;
835  break;
836 
837  default:
838  break;
839  }
840  __syncthreads();
841  }
842  //segmented parallel reduction end
843 
844  if (threadIdx.x != last_index &&
845  shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1] &&
846  inter_results[threadIdx.x] != 0)
847  {
848  result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
849  }
850 
851  __syncthreads();
852  } //for k
853 
854  if (threadIdx.x == last_index && inter_results[last_index] != 0)
855  result[tmp.x] = (option == 2) ? sqrt(inter_results[last_index]) : inter_results[last_index];
856  }
857 
858  template<typename ScalarType, unsigned int MAT_ALIGNMENT>
859  void row_info(coordinate_matrix<ScalarType, MAT_ALIGNMENT> const & mat,
860  vector_base<ScalarType> & vec,
861  viennacl::linalg::detail::row_info_types info_selector)
862  {
863  coo_row_info_extractor<<<64, 128>>>(detail::cuda_arg<unsigned int>(mat.handle12().cuda_handle()),
864  detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
865  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
866  detail::cuda_arg<ScalarType>(vec),
867  static_cast<unsigned int>(info_selector)
868  );
869  VIENNACL_CUDA_LAST_ERROR_CHECK("coo_row_info_extractor");
870  }
871 
872  } //namespace detail
873 
874 
875  template <typename T>
876  __global__ void coordinate_matrix_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index)
877  const T * elements,
878  const unsigned int * group_boundaries,
879  const T * x,
880  unsigned int start_x,
881  unsigned int inc_x,
882  T * result,
883  unsigned int start_result,
884  unsigned int inc_result
885  )
886  {
887  __shared__ unsigned int shared_rows[128];
888  __shared__ T inter_results[128];
889 
890  uint2 tmp;
891  T val;
892  unsigned int group_start = group_boundaries[blockIdx.x];
893  unsigned int group_end = group_boundaries[blockIdx.x + 1];
894  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
895 
896  unsigned int local_index = 0;
897 
898  for (unsigned int k = 0; k < k_end; ++k)
899  {
900  local_index = group_start + k * blockDim.x + threadIdx.x;
901 
902  tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
903  val = (local_index < group_end) ? elements[local_index] * x[tmp.y * inc_x + start_x] : 0;
904 
905  //check for carry from previous loop run:
906  if (threadIdx.x == 0 && k > 0)
907  {
908  if (tmp.x == shared_rows[blockDim.x-1])
909  val += inter_results[blockDim.x-1];
910  else
911  result[shared_rows[blockDim.x-1] * inc_result + start_result] = inter_results[blockDim.x-1];
912  }
913 
914  //segmented parallel reduction begin
915  __syncthreads();
916  shared_rows[threadIdx.x] = tmp.x;
917  inter_results[threadIdx.x] = val;
918  T left = 0;
919  __syncthreads();
920 
921  for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
922  {
923  left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
924  __syncthreads();
925  inter_results[threadIdx.x] += left;
926  __syncthreads();
927  }
928  //segmented parallel reduction end
929 
930  if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
931  shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
932  {
933  result[tmp.x * inc_result + start_result] = inter_results[threadIdx.x];
934  }
935 
936  __syncthreads();
937  } //for k
938 
939  if (local_index + 1 == group_end)
940  result[tmp.x * inc_result + start_result] = inter_results[threadIdx.x];
941  }
942 
943 
952  template<class ScalarType, unsigned int ALIGNMENT>
953  void prod_impl(const viennacl::coordinate_matrix<ScalarType, ALIGNMENT> & mat,
954  const viennacl::vector_base<ScalarType> & vec,
955  viennacl::vector_base<ScalarType> & result)
956  {
957  result.clear();
958 
959  coordinate_matrix_vec_mul_kernel<<<64, 128>>>(detail::cuda_arg<unsigned int>(mat.handle12().cuda_handle()),
960  detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
961  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
962  detail::cuda_arg<ScalarType>(vec),
963  static_cast<unsigned int>(vec.start()),
964  static_cast<unsigned int>(vec.stride()),
965  detail::cuda_arg<ScalarType>(result),
966  static_cast<unsigned int>(result.start()),
967  static_cast<unsigned int>(result.stride())
968  );
969  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_vec_mul_kernel");
970  }
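 
  // Example (sketch, not part of the original header): coordinate_matrix stores (row, column)
  // pairs plus a group partition (handle3()) for the segmented reduction above. Usage:
  //
  //   viennacl::coordinate_matrix<float> A(n, n);   // filled via viennacl::copy()
  //   viennacl::vector<float> x(n), y(n);
  //   y = viennacl::linalg::prod(A, x);             // note: prod_impl() clears the result first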
971 
972 
973 
974 
975  template <typename DMatIndexT, typename ResultIndexT, typename ScalarType, typename NumericT>
976  __global__ void coordinate_matrix_d_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
977  const ScalarType * elements,
978  const unsigned int * group_boundaries,
979  const NumericT * d_mat,
980  unsigned int d_mat_row_start,
981  unsigned int d_mat_col_start,
982  unsigned int d_mat_row_inc,
983  unsigned int d_mat_col_inc,
984  unsigned int d_mat_row_size,
985  unsigned int d_mat_col_size,
986  unsigned int d_mat_internal_rows,
987  unsigned int d_mat_internal_cols,
988  NumericT * result,
989  unsigned int result_row_start,
990  unsigned int result_col_start,
991  unsigned int result_row_inc,
992  unsigned int result_col_inc,
993  unsigned int result_row_size,
994  unsigned int result_col_size,
995  unsigned int result_internal_rows,
996  unsigned int result_internal_cols)
997  {
998  __shared__ unsigned int shared_rows[128];
999  __shared__ NumericT inter_results[128];
1000 
1001  uint2 tmp;
1002  NumericT val;
1003  unsigned int group_start = group_boundaries[blockIdx.x];
1004  unsigned int group_end = group_boundaries[blockIdx.x + 1];
1005  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
1006 
1007  unsigned int local_index = 0;
1008 
1009  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
1010  {
1011  for (unsigned int k = 0; k < k_end; ++k)
1012  {
1013  local_index = group_start + k * blockDim.x + threadIdx.x;
1014 
1015  tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
1016  val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(tmp.y, result_col,
1017  d_mat_row_start, d_mat_row_inc,
1018  d_mat_col_start, d_mat_col_inc,
1019  d_mat_internal_rows, d_mat_internal_cols) ] : 0;
1020 
1021  //check for carry from previous loop run:
1022  if (threadIdx.x == 0 && k > 0)
1023  {
1024  if (tmp.x == shared_rows[blockDim.x-1])
1025  val += inter_results[blockDim.x-1];
1026  else
1027  result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
1028  result_row_start, result_row_inc,
1029  result_col_start, result_col_inc,
1030  result_internal_rows, result_internal_cols)] = inter_results[blockDim.x-1];
1031  }
1032 
1033  //segmented parallel reduction begin
1034  __syncthreads();
1035  shared_rows[threadIdx.x] = tmp.x;
1036  inter_results[threadIdx.x] = val;
1037  NumericT left = 0;
1038  __syncthreads();
1039 
1040  for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
1041  {
1042  left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
1043  __syncthreads();
1044  inter_results[threadIdx.x] += left;
1045  __syncthreads();
1046  }
1047  //segmented parallel reduction end
1048 
1049  if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
1050  shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
1051  {
1052  result[ResultIndexT::apply(tmp.x, result_col,
1053  result_row_start, result_row_inc,
1054  result_col_start, result_col_inc,
1055  result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
1056  }
1057 
1058  __syncthreads();
1059  } //for k
1060 
1061  if (local_index + 1 == group_end)
1062  result[ResultIndexT::apply(tmp.x, result_col,
1063  result_row_start, result_row_inc,
1064  result_col_start, result_col_inc,
1065  result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
1066  }
1067  }
1068 
1069 
1078  template<typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2>
1079  void prod_impl(const viennacl::coordinate_matrix<NumericT, ALIGNMENT> & sp_mat,
1080  const viennacl::matrix_base<NumericT, F1> & d_mat,
1081  viennacl::matrix_base<NumericT, F2> & result) {
1082 
1083  coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<64, 128>>>
1084  (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
1085  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1086  detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),
1087 
1088  detail::cuda_arg<NumericT>(d_mat),
1089  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1090  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
1091  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
1092  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
1093 
1094  detail::cuda_arg<NumericT>(result),
1095  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1096  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1097  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1098  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1099  );
1100 
1101  }
1102 
1103  template <typename DMatIndexT, typename ResultIndexT, typename ScalarType, typename NumericT>
1104  __global__ void coordinate_matrix_d_tr_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
1105  const ScalarType * elements,
1106  const unsigned int * group_boundaries,
1107  const NumericT * d_mat,
1108  unsigned int d_mat_row_start,
1109  unsigned int d_mat_col_start,
1110  unsigned int d_mat_row_inc,
1111  unsigned int d_mat_col_inc,
1112  unsigned int d_mat_row_size,
1113  unsigned int d_mat_col_size,
1114  unsigned int d_mat_internal_rows,
1115  unsigned int d_mat_internal_cols,
1116  NumericT * result,
1117  unsigned int result_row_start,
1118  unsigned int result_col_start,
1119  unsigned int result_row_inc,
1120  unsigned int result_col_inc,
1121  unsigned int result_row_size,
1122  unsigned int result_col_size,
1123  unsigned int result_internal_rows,
1124  unsigned int result_internal_cols)
1125  {
1126  __shared__ unsigned int shared_rows[128];
1127  __shared__ NumericT inter_results[128];
1128 
1129  uint2 tmp;
1130  NumericT val;
1131  unsigned int group_start = group_boundaries[blockIdx.x];
1132  unsigned int group_end = group_boundaries[blockIdx.x + 1];
1133  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
1134 
1135  unsigned int local_index = 0;
1136 
1137  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
1138  {
1139  for (unsigned int k = 0; k < k_end; ++k)
1140  {
1141  local_index = group_start + k * blockDim.x + threadIdx.x;
1142 
1143  tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
1144  val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(result_col, tmp.y,
1145  d_mat_row_start, d_mat_row_inc,
1146  d_mat_col_start, d_mat_col_inc,
1147  d_mat_internal_rows, d_mat_internal_cols)] : 0;
1148 
1149  //check for carry from previous loop run:
1150  if (threadIdx.x == 0 && k > 0)
1151  {
1152  if (tmp.x == shared_rows[blockDim.x-1])
1153  val += inter_results[blockDim.x-1];
1154  else
1155  result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
1156  result_row_start, result_row_inc,
1157  result_col_start, result_col_inc,
1158  result_internal_rows, result_internal_cols) ] = inter_results[blockDim.x-1];
1159  }
1160 
1161  //segmented parallel reduction begin
1162  __syncthreads();
1163  shared_rows[threadIdx.x] = tmp.x;
1164  inter_results[threadIdx.x] = val;
1165  NumericT left = 0;
1166  __syncthreads();
1167 
1168  for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
1169  {
1170  left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
1171  __syncthreads();
1172  inter_results[threadIdx.x] += left;
1173  __syncthreads();
1174  }
1175  //segmented parallel reduction end
1176 
1177  if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
1178  shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
1179  {
1180  result[ ResultIndexT::apply(tmp.x, result_col,
1181  result_row_start, result_row_inc,
1182  result_col_start, result_col_inc,
1183  result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
1184  }
1185 
1186  __syncthreads();
1187  } //for k
1188 
1189  if (local_index + 1 == group_end)
1190  result[ ResultIndexT::apply(tmp.x, result_col,
1191  result_row_start, result_row_inc,
1192  result_col_start, result_col_inc,
1193  result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
1194  }
1195  }
1196 
1205  template<class ScalarType, unsigned int ALIGNMENT, class NumericT, typename F1, typename F2>
1206  void prod_impl(const viennacl::coordinate_matrix<ScalarType, ALIGNMENT> & sp_mat,
1207  const viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
1208  const viennacl::matrix_base<NumericT, F1>,
1209  viennacl::op_trans > & d_mat,
1210  viennacl::matrix_base<NumericT, F2> & result) {
1211 
1212  coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<64, 128>>>
1213  (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
1214  detail::cuda_arg<ScalarType>(sp_mat.handle().cuda_handle()),
1215  detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),
1216 
1217  detail::cuda_arg<NumericT>(d_mat.lhs()),
1218  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1219  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1220  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1221  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1222 
1223  detail::cuda_arg<NumericT>(result),
1224  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1225  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1226  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1227  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1228  );
1229 
1230  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
1231  }
1232 
1233 
1234  //
1235  // ELL Matrix
1236  //
1237 
1238  template <typename T>
1239  __global__ void ell_matrix_vec_mul_kernel(const unsigned int * coords,
1240  const T * elements,
1241  const T * x,
1242  unsigned int start_x,
1243  unsigned int inc_x,
1244  T * result,
1245  unsigned int start_result,
1246  unsigned int inc_result,
1247  unsigned int row_num,
1248  unsigned int col_num,
1249  unsigned int internal_row_num,
1250  unsigned int items_per_row,
1251  unsigned int aligned_items_per_row
1252  )
1253  {
1254  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1255  unsigned int glb_sz = gridDim.x * blockDim.x;
1256 
1257  for(unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
1258  {
1259  T sum = 0;
1260 
1261  unsigned int offset = row_id;
1262  for(unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
1263  {
1264  T val = elements[offset];
1265 
1266  if(val != (T)0)
1267  {
1268  int col = coords[offset];
1269  sum += (x[col * inc_x + start_x] * val);
1270  }
1271  }
1272 
1273  result[row_id * inc_result + start_result] = sum;
1274  }
1275  }
1276 
1277 
1286  template<class ScalarType, unsigned int ALIGNMENT>
1287  void prod_impl(const viennacl::ell_matrix<ScalarType, ALIGNMENT> & mat,
1288  const viennacl::vector_base<ScalarType> & vec,
1289  viennacl::vector_base<ScalarType> & result)
1290  {
1291  ell_matrix_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
1292  detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
1293  detail::cuda_arg<ScalarType>(vec),
1294  static_cast<unsigned int>(vec.start()),
1295  static_cast<unsigned int>(vec.stride()),
1296  detail::cuda_arg<ScalarType>(result),
1297  static_cast<unsigned int>(result.start()),
1298  static_cast<unsigned int>(result.stride()),
1299  static_cast<unsigned int>(mat.size1()),
1300  static_cast<unsigned int>(mat.size2()),
1301  static_cast<unsigned int>(mat.internal_size1()),
1302  static_cast<unsigned int>(mat.maxnnz()),
1303  static_cast<unsigned int>(mat.internal_maxnnz())
1304  );
1305  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_vec_mul_kernel");
1306  }
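 
  // Example (sketch, not part of the original header): ell_matrix pads every row to maxnnz()
  // entries and stores them column-wise (offset += internal_row_num), giving coalesced loads
  // for rows of similar length. Usage:
  //
  //   viennacl::ell_matrix<float> A;                // filled via viennacl::copy()
  //   viennacl::vector<float> x(n), y(n);
  //   y = viennacl::linalg::prod(A, x);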
1307 
1308  template <typename DMatIndexT, typename ResultIndexT, typename ScalarType, typename NumericT >
1309  __global__ void ell_matrix_d_mat_mul_kernel(const unsigned int * sp_mat_coords,
1310  const ScalarType * sp_mat_elements,
1311  unsigned int sp_mat_row_num,
1312  unsigned int sp_mat_col_num,
1313  unsigned int sp_mat_internal_row_num,
1314  unsigned int sp_mat_items_per_row,
1315  unsigned int sp_mat_aligned_items_per_row,
1316  const NumericT * d_mat,
1317  unsigned int d_mat_row_start,
1318  unsigned int d_mat_col_start,
1319  unsigned int d_mat_row_inc,
1320  unsigned int d_mat_col_inc,
1321  unsigned int d_mat_row_size,
1322  unsigned int d_mat_col_size,
1323  unsigned int d_mat_internal_rows,
1324  unsigned int d_mat_internal_cols,
1325  NumericT * result,
1326  unsigned int result_row_start,
1327  unsigned int result_col_start,
1328  unsigned int result_row_inc,
1329  unsigned int result_col_inc,
1330  unsigned int result_row_size,
1331  unsigned int result_col_size,
1332  unsigned int result_internal_rows,
1333  unsigned int result_internal_cols) {
1334 
1335 
1336  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1337  unsigned int glb_sz = gridDim.x * blockDim.x;
1338  for( unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_col_size); rc += glb_sz) {
1339  unsigned int row = rc % sp_mat_row_num;
1340  unsigned int col = rc / sp_mat_row_num;
1341 
1342  unsigned int offset = row;
1343  NumericT r = (NumericT)0;
1344 
1345  for(unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num) {
1346 
1347  unsigned int j = sp_mat_coords[offset];
1348  NumericT x = static_cast<NumericT>(sp_mat_elements[offset]);
1349 
1350  if(x != (NumericT)0) {
1351 
1352  NumericT y = d_mat[ DMatIndexT::apply(j, col,
1353  d_mat_row_start, d_mat_row_inc,
1354  d_mat_col_start, d_mat_col_inc,
1355  d_mat_internal_rows, d_mat_internal_cols) ];
1356 
1357  r += x*y;
1358  }
1359  }
1360  result [ ResultIndexT::apply(row, col,
1361  result_row_start, result_row_inc,
1362  result_col_start, result_col_inc,
1363  result_internal_rows, result_internal_cols) ] = r;
1364  }
1365 
1366  }
1367 
1377  template<class ScalarType, unsigned int ALIGNMENT, class NumericT, typename F1, typename F2 >
1378  void prod_impl(const viennacl::ell_matrix<ScalarType, ALIGNMENT> & sp_mat,
1379  const viennacl::matrix_base<NumericT, F1> & d_mat,
1380  viennacl::matrix_base<NumericT, F2> & result) {
1381 
1382  ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<128, 128>>>
1383  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
1384  detail::cuda_arg<ScalarType>(sp_mat.handle().cuda_handle()),
1385  static_cast<unsigned int>(sp_mat.size1()),
1386  static_cast<unsigned int>(sp_mat.size2()),
1387  static_cast<unsigned int>(sp_mat.internal_size1()),
1388  static_cast<unsigned int>(sp_mat.maxnnz()),
1389  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1390  detail::cuda_arg<NumericT>(d_mat),
1391  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1392  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
1393  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
1394  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
1395 
1396  detail::cuda_arg<NumericT>(result),
1397  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1398  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1399  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1400  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1401  );
1402  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
1403  }
1404 
1405  template <typename DMatIndexT, typename ResultIndexT, typename ScalarType, typename NumericT >
1406  __global__ void ell_matrix_d_tr_mat_mul_kernel(const unsigned int * sp_mat_coords,
1407  const ScalarType * sp_mat_elements,
1408  unsigned int sp_mat_row_num,
1409  unsigned int sp_mat_col_num,
1410  unsigned int sp_mat_internal_row_num,
1411  unsigned int sp_mat_items_per_row,
1412  unsigned int sp_mat_aligned_items_per_row,
1413  const NumericT * d_mat,
1414  unsigned int d_mat_row_start,
1415  unsigned int d_mat_col_start,
1416  unsigned int d_mat_row_inc,
1417  unsigned int d_mat_col_inc,
1418  unsigned int d_mat_row_size,
1419  unsigned int d_mat_col_size,
1420  unsigned int d_mat_internal_rows,
1421  unsigned int d_mat_internal_cols,
1422  NumericT * result,
1423  unsigned int result_row_start,
1424  unsigned int result_col_start,
1425  unsigned int result_row_inc,
1426  unsigned int result_col_inc,
1427  unsigned int result_row_size,
1428  unsigned int result_col_size,
1429  unsigned int result_internal_rows,
1430  unsigned int result_internal_cols) {
1431 
1432 
1433  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1434  unsigned int glb_sz = gridDim.x * blockDim.x;
1435  for( unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_row_size); rc += glb_sz) {
1436  unsigned int row = rc % sp_mat_row_num;
1437  unsigned int col = rc / sp_mat_row_num;
1438 
1439  unsigned int offset = row;
1440  NumericT r = (NumericT)0;
1441 
1442  for(unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num) {
1443 
1444  unsigned int j = sp_mat_coords[offset];
1445  NumericT x = static_cast<NumericT>(sp_mat_elements[offset]);
1446 
1447  if(x != (NumericT)0) {
1448 
1449  NumericT y = d_mat[ DMatIndexT::apply(col, j,
1450  d_mat_row_start, d_mat_row_inc,
1451  d_mat_col_start, d_mat_col_inc,
1452  d_mat_internal_rows, d_mat_internal_cols) ];
1453 
1454  r += x*y;
1455  }
1456  }
1457  result [ ResultIndexT::apply(row, col,
1458  result_row_start, result_row_inc,
1459  result_col_start, result_col_inc,
1460  result_internal_rows, result_internal_cols) ] = r;
1461  }
1462 
1463  }
1464 
1474  template<class ScalarType, unsigned int ALIGNMENT, class NumericT, typename F1, typename F2 >
1475  void prod_impl(const viennacl::ell_matrix<ScalarType, ALIGNMENT> & sp_mat,
1476  const viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
1477  const viennacl::matrix_base<NumericT, F1>,
1478  viennacl::op_trans > & d_mat,
1479  viennacl::matrix_base<NumericT, F2> & result) {
1480 
1481  ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<128, 128>>>
1482  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
1483  detail::cuda_arg<ScalarType>(sp_mat.handle().cuda_handle()),
1484  static_cast<unsigned int>(sp_mat.size1()),
1485  static_cast<unsigned int>(sp_mat.size2()),
1486  static_cast<unsigned int>(sp_mat.internal_size1()),
1487  static_cast<unsigned int>(sp_mat.maxnnz()),
1488  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1489 
1490  detail::cuda_arg<NumericT>(d_mat.lhs()),
1491  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1492  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1493  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1494  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1495 
1496  detail::cuda_arg<NumericT>(result),
1497  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1498  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1499  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1500  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1501  );
1502 
1503  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
1504  }
1505 
1506  //
1507  // Hybrid Matrix
1508  //
1509 
1510 
1511  template <typename T>
1512  __global__ void hyb_matrix_vec_mul_kernel(const unsigned int * ell_coords,
1513  const T * ell_elements,
1514  const unsigned int * csr_rows,
1515  const unsigned int * csr_cols,
1516  const T * csr_elements,
1517  const T * x,
1518  unsigned int start_x,
1519  unsigned int inc_x,
1520  T * result,
1521  unsigned int start_result,
1522  unsigned int inc_result,
1523  unsigned int row_num,
1524  unsigned int internal_row_num,
1525  unsigned int items_per_row,
1526  unsigned int aligned_items_per_row
1527  )
1528  {
1529  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1530  unsigned int glb_sz = gridDim.x * blockDim.x;
1531 
1532  for(unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
1533  {
1534  T sum = 0;
1535 
1536  unsigned int offset = row_id;
1537  for(unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
1538  {
1539  T val = ell_elements[offset];
1540 
1541 
1542  if(val != 0.0f)
1543  {
1544  int col = ell_coords[offset];
1545  sum += (x[col * inc_x + start_x] * val);
1546  }
1547  }
1548 
1549  unsigned int col_begin = csr_rows[row_id];
1550  unsigned int col_end = csr_rows[row_id + 1];
1551 
1552  for(unsigned int item_id = col_begin; item_id < col_end; item_id++)
1553  {
1554  sum += (x[csr_cols[item_id] * inc_x + start_x] * csr_elements[item_id]);
1555  }
1556 
1557  result[row_id * inc_result + start_result] = sum;
1558  }
1559  }
1560 
1561 
1562 
1571  template<class ScalarType, unsigned int ALIGNMENT>
1572  void prod_impl(const viennacl::hyb_matrix<ScalarType, ALIGNMENT> & mat,
1573  const viennacl::vector_base<ScalarType> & vec,
1574  viennacl::vector_base<ScalarType> & result)
1575  {
1576  hyb_matrix_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
1577  detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
1578  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
1579  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
1580  detail::cuda_arg<ScalarType>(mat.handle5().cuda_handle()),
1581  detail::cuda_arg<ScalarType>(vec),
1582  static_cast<unsigned int>(vec.start()),
1583  static_cast<unsigned int>(vec.stride()),
1584  detail::cuda_arg<ScalarType>(result),
1585  static_cast<unsigned int>(result.start()),
1586  static_cast<unsigned int>(result.stride()),
1587  static_cast<unsigned int>(mat.size1()),
1588  static_cast<unsigned int>(mat.internal_size1()),
1589  static_cast<unsigned int>(mat.ell_nnz()),
1590  static_cast<unsigned int>(mat.internal_ellnnz())
1591  );
1592  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_vec_mul_kernel");
1593  }
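 
  // Example (sketch, not part of the original header): hyb_matrix combines an ELL part
  // (handle2() and the main element handle) for the regular entries with a CSR part
  // (handles 3, 4, 5) for the overflow entries, as the kernel above shows. Usage:
  //
  //   viennacl::hyb_matrix<float> A;                // filled via viennacl::copy()
  //   viennacl::vector<float> x(n), y(n);
  //   y = viennacl::linalg::prod(A, x);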
1594 
1595 
1596 
1597  template <typename DMatIndexT, typename ResultIndexT, typename NumericT>
1598  __global__ void hyb_matrix_d_mat_mul_kernel(const unsigned int * ell_coords,
1599  const NumericT * ell_elements,
1600  const unsigned int * csr_rows,
1601  const unsigned int * csr_cols,
1602  const NumericT * csr_elements,
1603  unsigned int row_num,
1604  unsigned int internal_row_num,
1605  unsigned int items_per_row,
1606  unsigned int aligned_items_per_row,
1607  const NumericT * d_mat,
1608  unsigned int d_mat_row_start,
1609  unsigned int d_mat_col_start,
1610  unsigned int d_mat_row_inc,
1611  unsigned int d_mat_col_inc,
1612  unsigned int d_mat_row_size,
1613  unsigned int d_mat_col_size,
1614  unsigned int d_mat_internal_rows,
1615  unsigned int d_mat_internal_cols,
1616  NumericT * result,
1617  unsigned int result_row_start,
1618  unsigned int result_col_start,
1619  unsigned int result_row_inc,
1620  unsigned int result_col_inc,
1621  unsigned int result_row_size,
1622  unsigned int result_col_size,
1623  unsigned int result_internal_rows,
1624  unsigned int result_internal_cols)
1625  {
1626  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1627  unsigned int glb_sz = gridDim.x * blockDim.x;
1628 
1629  for(unsigned int result_col = 0; result_col < result_col_size; ++result_col)
1630  {
1631  for(unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
1632  {
1633  NumericT sum = 0;
1634 
1635  unsigned int offset = row_id;
1636  for(unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
1637  {
1638  NumericT val = ell_elements[offset];
1639 
1640  if(val != 0.0f)
1641  {
1642  sum += d_mat[DMatIndexT::apply(ell_coords[offset], result_col,
1643  d_mat_row_start, d_mat_row_inc,
1644  d_mat_col_start, d_mat_col_inc,
1645  d_mat_internal_rows, d_mat_internal_cols)] * val;
1646  }
1647  }
1648 
1649  unsigned int col_begin = csr_rows[row_id];
1650  unsigned int col_end = csr_rows[row_id + 1];
1651 
1652  for(unsigned int item_id = col_begin; item_id < col_end; item_id++)
1653  {
1654  sum += d_mat[DMatIndexT::apply(csr_cols[item_id], result_col,
1655  d_mat_row_start, d_mat_row_inc,
1656  d_mat_col_start, d_mat_col_inc,
1657  d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];
1658  }
1659 
1660  result[ResultIndexT::apply(row_id, result_col,
1661  result_row_start, result_row_inc,
1662  result_col_start, result_col_inc,
1663  result_internal_rows, result_internal_cols)] = sum;
1664  }
1665  }
1666  }
1667 
1668 
1669 
1678  template<typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2>
1679  void prod_impl(const viennacl::hyb_matrix<NumericT, ALIGNMENT> & mat,
1680  const viennacl::matrix_base<NumericT, F1> & d_mat,
1681  viennacl::matrix_base<NumericT, F2> & result)
1682  {
1683  hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<256, 128>>>(
1684  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
1685  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
1686  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
1687  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
1688  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
1689  static_cast<unsigned int>(mat.size1()),
1690  static_cast<unsigned int>(mat.internal_size1()),
1691  static_cast<unsigned int>(mat.ell_nnz()),
1692  static_cast<unsigned int>(mat.internal_ellnnz()),
1693 
1694  detail::cuda_arg<NumericT>(d_mat),
1695  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1696  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
1697  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
1698  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
1699 
1700  detail::cuda_arg<NumericT>(result),
1701  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1702  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1703  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1704  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1705  );
1706  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
1707  }
1708 
1709 
1710 
1711  template <typename DMatIndexT, typename ResultIndexT, typename NumericT>
1712  __global__ void hyb_matrix_d_tr_mat_mul_kernel(const unsigned int * ell_coords,
1713  const NumericT * ell_elements,
1714  const unsigned int * csr_rows,
1715  const unsigned int * csr_cols,
1716  const NumericT * csr_elements,
1717  unsigned int row_num,
1718  unsigned int internal_row_num,
1719  unsigned int items_per_row,
1720  unsigned int aligned_items_per_row,
1721  const NumericT * d_mat,
1722  unsigned int d_mat_row_start,
1723  unsigned int d_mat_col_start,
1724  unsigned int d_mat_row_inc,
1725  unsigned int d_mat_col_inc,
1726  unsigned int d_mat_row_size,
1727  unsigned int d_mat_col_size,
1728  unsigned int d_mat_internal_rows,
1729  unsigned int d_mat_internal_cols,
1730  NumericT * result,
1731  unsigned int result_row_start,
1732  unsigned int result_col_start,
1733  unsigned int result_row_inc,
1734  unsigned int result_col_inc,
1735  unsigned int result_row_size,
1736  unsigned int result_col_size,
1737  unsigned int result_internal_rows,
1738  unsigned int result_internal_cols)
1739  {
1740  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1741  unsigned int glb_sz = gridDim.x * blockDim.x;
1742 
1743  for(unsigned int result_col = 0; result_col < result_col_size; ++result_col)
1744  {
1745  for(unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
1746  {
1747  NumericT sum = 0;
1748 
1749  unsigned int offset = row_id;   // ELL part: item k of this row is stored at row_id + k * internal_row_num (column-wise layout)
1750  for(unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
1751  {
1752  NumericT val = ell_elements[offset];
1753 
1754  if(val != 0.0f)   // zero entries are ELL padding and can be skipped
1755  {
1756  sum += d_mat[DMatIndexT::apply(result_col, ell_coords[offset],
1757  d_mat_row_start, d_mat_row_inc,
1758  d_mat_col_start, d_mat_col_inc,
1759  d_mat_internal_rows, d_mat_internal_cols)] * val;
1760  }
1761  }
1762 
1763  unsigned int col_begin = csr_rows[row_id];   // CSR part: remaining entries of this row
1764  unsigned int col_end = csr_rows[row_id + 1];
1765 
1766  for(unsigned int item_id = col_begin; item_id < col_end; item_id++)
1767  {
1768  sum += d_mat[DMatIndexT::apply(result_col, csr_cols[item_id],
1769  d_mat_row_start, d_mat_row_inc,
1770  d_mat_col_start, d_mat_col_inc,
1771  d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];
1772  }
1773 
1774  result[ResultIndexT::apply(row_id, result_col,
1775  result_row_start, result_row_inc,
1776  result_col_start, result_col_inc,
1777  result_internal_rows, result_internal_cols)] = sum;
1778  }
1779  }
1780  }
1781 
1782 
1783 
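  // Host-side dispatcher for result = prod(mat, trans(d_mat)): same launch as the non-transposed
  // case, except that the dense factor is taken from d_mat.lhs() and the transposed-access kernel
  // above is used.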
1792  template<typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2>
1793  void prod_impl(const viennacl::hyb_matrix<NumericT, ALIGNMENT> & mat,
1794  const viennacl::matrix_expression<const viennacl::matrix_base<NumericT, F1>,
1795  const viennacl::matrix_base<NumericT, F1>,
1796  viennacl::op_trans > & d_mat,
1797  viennacl::matrix_base<NumericT, F2> & result)
1798  {
1799  hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<256, 128>>>(
1800  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
1801  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
1802  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
1803  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
1804  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
1805  static_cast<unsigned int>(mat.size1()),
1806  static_cast<unsigned int>(mat.internal_size1()),
1807  static_cast<unsigned int>(mat.ell_nnz()),
1808  static_cast<unsigned int>(mat.internal_ellnnz()),
1809 
1810  detail::cuda_arg<NumericT>(d_mat.lhs()),
1811  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1812  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1813  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1814  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1815 
1816  detail::cuda_arg<NumericT>(result),
1817  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1818  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1819  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1820  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1821  );
1822  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
1823  }
1824 
1825 
1826  } // namespace cuda
1827  } //namespace linalg
1828 } //namespace viennacl
1829 
1830 
1831 #endif