1 #ifndef VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP
2 #define VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP
43 unsigned int A_start1,
unsigned int A_start2,
44 unsigned int A_inc1,
unsigned int A_inc2,
45 unsigned int A_size1,
unsigned int A_size2,
46 unsigned int A_internal_size1,
unsigned int A_internal_size2,
51 unsigned int B_start1,
unsigned int B_start2,
52 unsigned int B_inc1,
unsigned int B_inc2,
53 unsigned int B_size1,
unsigned int B_size2,
54 unsigned int B_internal_size1,
unsigned int B_internal_size2,
63 for (
unsigned int row_cnt = 0; row_cnt < A_size1; ++row_cnt)
65 unsigned int row = A_size1 - 1 - row_cnt;
73 if (row_major_B && transpose_B)
74 B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (row * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
75 : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
76 else if (row_major_B && !transpose_B)
77 B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
78 : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
79 else if (!row_major_B && transpose_B)
80 B[(blockIdx.x * B_inc1 + B_start1) + (row * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
81 : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
83 B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
84 : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
90 if (row_major_B && transpose_B)
91 temp = B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (row * B_inc2 + B_start2)];
92 else if (row_major_B && !transpose_B)
93 temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)];
94 else if (!row_major_B && transpose_B)
95 temp = B[(blockIdx.x * B_inc1 + B_start1) + (row * B_inc2 + B_start2) * B_internal_size1];
97 temp = B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1];
100 for (
unsigned int elim = threadIdx.x; elim < row; elim += blockDim.x)
102 if (row_major_A && transpose_A)
103 entry_A = A[(row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)];
104 else if (row_major_A && !transpose_A)
105 entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
106 else if (!row_major_A && transpose_A)
107 entry_A = A[(row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1];
109 entry_A = A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
111 if (row_major_B && transpose_B)
112 B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (elim * B_inc2 + B_start2)] -= temp * entry_A;
113 else if (row_major_B && !transpose_B)
114 B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A;
115 else if (!row_major_B && transpose_B)
116 B[(blockIdx.x * B_inc1 + B_start1) + (elim * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
118 B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
126 template <
typename T>
129 unsigned int A_start1,
unsigned int A_start2,
130 unsigned int A_inc1,
unsigned int A_inc2,
131 unsigned int A_size1,
unsigned int A_size2,
132 unsigned int A_internal_size1,
unsigned int A_internal_size2,
137 unsigned int B_start1,
unsigned int B_start2,
138 unsigned int B_inc1,
unsigned int B_inc2,
139 unsigned int B_size1,
unsigned int B_size2,
140 unsigned int B_internal_size1,
unsigned int B_internal_size2,
149 for (
unsigned int row = 0;
row < A_size1; ++
row)
156 if (threadIdx.x == 0)
158 if (row_major_B && transpose_B)
159 B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (
row * B_inc2 + B_start2)] /= (row_major_A) ? A[(
row * A_inc1 + A_start1) * A_internal_size2 + (
row * A_inc2 + A_start2)]
160 : A[(
row * A_inc1 + A_start1) + (
row * A_inc2 + A_start2)*A_internal_size1];
161 else if (row_major_B && !transpose_B)
162 B[(
row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(
row * A_inc1 + A_start1) * A_internal_size2 + (
row * A_inc2 + A_start2)]
163 : A[(
row * A_inc1 + A_start1) + (
row * A_inc2 + A_start2)*A_internal_size1];
164 else if (!row_major_B && transpose_B)
165 B[(blockIdx.x * B_inc1 + B_start1) + (
row * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(
row * A_inc1 + A_start1) * A_internal_size2 + (
row * A_inc2 + A_start2)]
166 : A[(
row * A_inc1 + A_start1) + (
row * A_inc2 + A_start2)*A_internal_size1];
168 B[(
row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(
row * A_inc1 + A_start1) * A_internal_size2 + (
row * A_inc2 + A_start2)]
169 : A[(
row * A_inc1 + A_start1) + (
row * A_inc2 + A_start2)*A_internal_size1];
175 if (row_major_B && transpose_B)
176 temp = B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (
row * B_inc2 + B_start2)];
177 else if (row_major_B && !transpose_B)
178 temp = B[(
row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)];
179 else if (!row_major_B && transpose_B)
180 temp = B[(blockIdx.x * B_inc1 + B_start1) + (
row * B_inc2 + B_start2) * B_internal_size1];
182 temp = B[(
row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1];
185 for (
unsigned int elim =
row + threadIdx.x + 1; elim < A_size1; elim += blockDim.x)
187 if (row_major_A && transpose_A)
188 entry_A = A[(
row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)];
189 else if (row_major_A && !transpose_A)
190 entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (
row * A_inc2 + A_start2)];
191 else if (!row_major_A && transpose_A)
192 entry_A = A[(
row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1];
194 entry_A = A[(elim * A_inc1 + A_start1) + (
row * A_inc2 + A_start2) * A_internal_size1];
196 if (row_major_B && transpose_B)
197 B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (elim * B_inc2 + B_start2)] -= temp * entry_A;
198 else if (row_major_B && !transpose_B)
199 B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A;
200 else if (!row_major_B && transpose_B)
201 B[(blockIdx.x * B_inc1 + B_start1) + (elim * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
203 B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
216 template <
typename T>
222 template <
typename T>
228 template <
typename M1,
typename M2,
typename SolverTag>
230 M2 & B,
bool transpose_B,
231 SolverTag
const & tag)
236 dim3 grid( transpose_B ? B.size1() : B.size2() );
240 matrix_matrix_upper_solve_kernel<<<grid,threads>>>(detail::cuda_arg<value_type>(A),
248 detail::cuda_arg<value_type>(B),
261 matrix_matrix_lower_solve_kernel<<<grid,threads>>>(detail::cuda_arg<value_type>(A),
269 detail::cuda_arg<value_type>(B),
296 template <
typename NumericT,
typename F1,
typename F2,
typename SOLVERTAG>
309 template <
typename NumericT,
typename F1,
typename F2,
typename SOLVERTAG>
325 template <
typename NumericT,
typename F1,
typename F2,
typename SOLVERTAG>
340 template <
typename NumericT,
typename F1,
typename F2,
typename SOLVERTAG>
355 template <
typename T>
358 unsigned int A_start1,
unsigned int A_start2,
359 unsigned int A_inc1,
unsigned int A_inc2,
360 unsigned int A_size1,
unsigned int A_size2,
361 unsigned int A_internal_size1,
unsigned int A_internal_size2,
363 unsigned int v_start,
367 unsigned int options)
370 unsigned int unit_diagonal_flag = (options & (1 << 0));
371 unsigned int transposed_access_A = (options & (1 << 1));
372 unsigned int is_lower_solve = (options & (1 << 2));
374 for (
unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed)
376 row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1);
377 if (!unit_diagonal_flag)
380 if (threadIdx.x == 0)
381 v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
386 temp = v[row * v_inc + v_start];
388 for (
int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x);
389 elim < (is_lower_solve ? A_size1 : row);
391 v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2))
392 : ((elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2))];
397 template <
typename T>
400 unsigned int A_start1,
unsigned int A_start2,
401 unsigned int A_inc1,
unsigned int A_inc2,
402 unsigned int A_size1,
unsigned int A_size2,
403 unsigned int A_internal_size1,
unsigned int A_internal_size2,
405 unsigned int v_start,
408 unsigned int options)
411 unsigned int unit_diagonal_flag = (options & (1 << 0));
412 unsigned int transposed_access_A = (options & (1 << 1));
413 unsigned int is_lower_solve = (options & (1 << 2));
415 for (
unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed)
417 row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1);
418 if (!unit_diagonal_flag)
421 if (threadIdx.x == 0)
422 v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
427 temp = v[row * v_inc + v_start];
429 for (
int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x);
430 elim < (is_lower_solve ? A_size1 : row);
432 v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1)
433 : ((elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1)];
445 template <
typename MatrixType,
typename VectorType>
448 unsigned int options)
454 triangular_substitute_inplace_row_kernel<<<1, 128>>>(detail::cuda_arg<value_type>(mat),
459 detail::cuda_arg<value_type>(vec),
468 triangular_substitute_inplace_col_kernel<<<1, 128>>>(detail::cuda_arg<value_type>(mat),
473 detail::cuda_arg<value_type>(vec),
489 template <
typename NumericT,
typename F,
typename SOLVERTAG>
507 template <
typename NumericT,
typename F,
typename SOLVERTAG>
result_of::size_type< matrix_base< NumericT, F > >::type stride2(matrix_base< NumericT, F > const &s)
Definition: stride.hpp:68
__global__ void triangular_substitute_inplace_col_kernel(T const *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, T *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, unsigned int options)
Definition: direct_solve.hpp:398
__global__ void matrix_matrix_upper_solve_kernel(const T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, bool row_major_A, bool transpose_A, T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_size1, unsigned int B_size2, unsigned int B_internal_size1, unsigned int B_internal_size2, bool row_major_B, bool transpose_B, bool unit_diagonal)
Definition: direct_solve.hpp:41
Common routines for CUDA execution.
Helper class for checking whether a matrix has a row-major layout.
Definition: forwards.h:399
Implementation of the dense matrix class.
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:216
A tag class representing a lower triangular matrix.
Definition: forwards.h:703
A dense matrix class.
Definition: forwards.h:290
Expression template class for representing a tree of expressions which ultimately result in a matrix...
Definition: forwards.h:283
void inplace_solve(const matrix_base< NumericT, F1 > &A, matrix_base< NumericT, F2 > &B, SOLVERTAG tag)
Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notat...
Definition: direct_solve.hpp:297
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:46
This file provides the forward declarations for the main types used within ViennaCL.
result_of::size_type< T >::type start1(T const &obj)
Definition: start.hpp:64
__global__ void matrix_matrix_lower_solve_kernel(const T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, bool row_major_A, bool transpose_A, T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_size1, unsigned int B_size2, unsigned int B_internal_size1, unsigned int B_internal_size2, bool row_major_B, bool transpose_B, bool unit_diagonal)
Definition: direct_solve.hpp:127
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
Definition: size.hpp:245
result_of::size_type< matrix_base< NumericT, F > >::type stride1(matrix_base< NumericT, F > const &s)
Definition: stride.hpp:57
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
Definition: size.hpp:144
result_of::size_type< T >::type start2(T const &obj)
Definition: start.hpp:83
bool is_unit_solve(T const &tag)
Definition: direct_solve.hpp:217
void inplace_solve_vector_impl(MatrixType const &mat, VectorType &vec, unsigned int options)
Definition: direct_solve.hpp:446
__global__ void triangular_substitute_inplace_row_kernel(T const *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, T *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, unsigned int options)
Definition: direct_solve.hpp:356
A tag class representing an upper triangular matrix.
Definition: forwards.h:708
result_of::size_type< T >::type start(T const &obj)
Definition: start.hpp:43
Common base class for dense vectors, vector ranges, and vector slices.
Definition: forwards.h:205
void inplace_solve_impl(M1 const &A, bool transpose_A, M2 &B, bool transpose_B, SolverTag const &tag)
Definition: direct_solve.hpp:229
vcl_size_t internal_size2(matrix_base< NumericT, F > const &mat)
Helper routine for obtaining the internal number of entries per column of a ViennaCL matrix...
Definition: size.hpp:287
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
Definition: matrix.hpp:910
T::ERROR_CANNOT_DEDUCE_CPU_SCALAR_TYPE_FOR_T type
Definition: result_of.hpp:276
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
A tag class representing a lower triangular matrix with unit diagonal.
Definition: forwards.h:713
A tag class representing transposed matrices.
Definition: forwards.h:165
unsigned int get_option_for_solver_tag(viennacl::linalg::upper_tag)
Definition: direct_solve.hpp:440
bool is_upper_solve(T const &tag)
Definition: direct_solve.hpp:223
vcl_size_t internal_size1(matrix_base< NumericT, F > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
Definition: size.hpp:279
A tag class representing an upper triangular matrix with unit diagonal.
Definition: forwards.h:718