1 #ifndef VIENNACL_LINALG_HOST_BASED_SSE_KERNELS_HPP_
2 #define VIENNACL_LINALG_HOST_BASED_SSE_KERNELS_HPP_
27 #ifdef VIENNACL_WITH_OPENMP
49 template <
typename ScalarType>
60 template <
typename ScalarType>
65 if(A[i+j][j]!=ScalarType(0))
72 template <
typename ScalarType>
78 ScalarType norm=
_nrm2(&A[row][row+from],to-from);
80 if(norm != ScalarType(0))
85 if(std::abs(A[row][row+from]-ScalarType(1))>std::abs(A[row][row+from]+ScalarType(1)))
89 A[
row][row+from]+=ScalarType(1);
96 ScalarType s=
_dotc(to-from,&A[row][row+from],&A[j][row+from]);
97 s=-s/A[
row][row+from];
98 _axpy(&A[row][row+from],&A[j][row+from],to-from,s);
111 ss[i]=-ss[i]/A[row][row+from];
113 _axpy(ss,&A[row+i][row],width,A[row][row+i]);
126 template<
typename ScalarType>
133 ScalarType *ss=
new ScalarType[bandwidth+belowDiagonal];
137 for(;k<n-belowDiagonal;k++)
144 for(
vcl_size_t bulgeStart=k+1;bulgeStart<n-belowDiagonal;bulgeStart+=belowDiagonal)
146 eliminateHermitian(A,bulgeStart+i,belowDiagonal,std::min(n-bulgeStart-i,belowDiagonal*2-i),std::min(n-bulgeStart-i,bandwidth+belowDiagonal),ss);
158 template<
typename ScalarType>
161 ScalarType* norms=
new ScalarType[block_size];
162 ScalarType* ss=
new ScalarType[n];
164 for (
vcl_size_t k=0;k<n-block_size;k+=block_size)
166 for(
vcl_size_t bi=0;bi<std::min(block_size,n-k-block_size);bi++)
170 norms[bi]=
_nrm2(&A[k+bi][k+bi+block_size],n-k-bi-block_size);
172 if(norms[bi]!=ScalarType(0))
177 if(std::abs(A[k+bi][k+bi+block_size]-ScalarType(1))>std::abs(A[k+bi][k+bi+block_size]+ScalarType(1)))
178 norms[bi]=-norms[bi];
180 A[k+bi][i]/=norms[bi];
181 A[k+bi][k+bi+block_size]+=ScalarType(1);
186 ScalarType s=
_dotc(n-k-bi-block_size,&A[k+bi][k+bi+block_size],&A[j][k+bi+block_size]);
187 s=-s/A[k+bi][k+bi+block_size];
188 _axpy(&A[k+bi][k+bi+block_size],&A[j][k+bi+block_size],n-k-bi-block_size,s);
196 #ifdef VIENNACL_WITH_OPENMP
197 #pragma omp parallel for
198 for(
int j=k+block_size;j<(int)n;j++)
203 for(
vcl_size_t bi=0;bi<std::min(block_size,n-k-block_size);bi++)
205 if(norms[bi]!=ScalarType(0))
207 ScalarType s=
_dotc(n-k-bi-block_size,&A[k+bi][k+bi+block_size],&A[j][k+bi+block_size]);
208 s=-s/A[k+bi][k+bi+block_size];
209 _axpy(&A[k+bi][k+bi+block_size],&A[j][k+bi+block_size],n-k-bi-block_size,s);
220 #ifdef VIENNACL_WITH_OPENMP
221 #pragma omp parallel for
222 for(
int section=0;section<(int)num_threads;section++)
224 for(
vcl_size_t section=0;section<num_threads;section++)
228 vcl_size_t end =((n-k)*(section+1))/num_threads+k;
230 for(
vcl_size_t bi=0;bi<std::min(block_size,n-k-block_size);bi++)
232 if(norms[bi]!=ScalarType(0))
239 ss[i]=-ss[i]/A[k+bi][k+bi+block_size];
241 _axpy(ss+start,&A[i][start],length,A[k+bi][i]);
265 template<
typename ScalarType>
269 std::cerr <<
"ViennaCL: Warning in inplace_tred2(): Matrix is not hermitian (or real symmetric)" << std::endl;
275 if(bandwidth*bandwidth*num_threads<n*4 || 2*block_size+1>bandwidth)
294 template <
typename ScalarType>
304 for(
vcl_size_t j=0; j<std::min(m,n); j+=block_size)
306 block_size=std::min(std::min(m-j,n-j),block_size);
317 if(std::abs(A[i][j+bi])>std::abs(A[p][j+bi]))
324 ScalarType t=A[p][k];
338 ScalarType elimVal=A[j+bi][j+bi];
339 if(elimVal==ScalarType(0))
344 if(A[
row][j+bi_]!=ScalarType(0))
345 _axpy(&(A[j+bi_][j+block_size]),&(A[
row][j+block_size]),n-j-block_size,-A[
row][j+bi_]);
350 ScalarType multiplier=A[
row][j+bi]/elimVal;
351 for(
vcl_size_t col=j+bi;col<j+block_size;col++)
352 A[
row][col]-=multiplier*A[j+bi][col];
353 A[
row][j+bi]=multiplier;
371 if(A[
row][j+bi]!=ScalarType(0))
372 _axpy(&(A[j+bi][j+block_size]),&(A[
row][j+block_size]),n-j-block_size,-A[
row][j+bi]);
386 #ifdef VIENNACL_OPENMP
387 #pragma omp parallel for
388 for(
int row=j+block_size;
row<(int)m;
row++)
393 if(A[
row][j+bi]!=ScalarType(0))
394 _axpy(&(A[j+bi][j+block_size]),&(A[
row][j+block_size]),n-j-block_size,-A[
row][j+bi]);
406 template <
typename ScalarType>
409 std::vector<ScalarType> betas(std::min(m,n));
410 ScalarType* norms=
new ScalarType[block_size];
412 for(
vcl_size_t k=0; k<std::min(m,n); k+=block_size)
414 block_size=std::min(std::min(m-k,n-k),block_size);
420 norms[bi]=
_nrm2(&A[k+bi][k+bi],m-k-bi);
422 if(norms[bi]!=ScalarType(0))
426 if(std::abs(A[k+bi][k+bi]-ScalarType(1))>std::abs(A[k+bi][k+bi]+ScalarType(1)))
429 A[k+bi][i]/=norms[bi];
430 A[k+bi][k+bi]+=ScalarType(1);
435 ScalarType s=
_dotc(m-k-bi,&A[k+bi][k+bi],&A[j][k+bi]);
436 s = -s/A[k+bi][k+bi];
437 _axpy(&A[k+bi][k+bi],&A[j][k+bi],m-k-bi,s);
441 betas[k+bi]=-norms[bi];
445 #ifdef VIENNACL_OPENMP
446 #pragma omp parallel for
447 for(
int j=k+block_size; j<(int)n; j++)
454 if(norms[bi]!=ScalarType(0))
456 ScalarType s=
_dotc(m-k-bi,&A[k+bi][k+bi],&A[j][k+bi]);
457 s = -s/A[k+bi][k+bi];
458 _axpy(&A[k+bi][k+bi],A[j]+k+bi,m-k-bi,s);
467 ScalarType beta=A[j][j];
486 template <
typename ScalarType>
489 std::vector<ScalarType> betas(std::min(m,n));
490 ScalarType* norms=
new ScalarType[block_size];
491 ScalarType* ss=
new ScalarType[n];
494 ScalarType** block_cols=
new ScalarType*[block_size];
496 block_cols[i]=
new ScalarType[m];
498 for(
vcl_size_t k=0; k<std::min(m,n); k+=block_size)
500 block_size=std::min(std::min(m-k,n-k),block_size);
505 block_cols[bi][i]=A[k+i][k+bi];
511 norms[bi]=
_nrm2(&block_cols[bi][bi],m-k-bi);
513 if(norms[bi]!=ScalarType(0))
517 if(std::abs(block_cols[bi][bi]-ScalarType(1))>std::abs(block_cols[bi][bi]+ScalarType(1)))
520 block_cols[bi][i]/=norms[bi];
521 block_cols[bi][bi]+=ScalarType(1);
526 ScalarType s=
_dotc(m-k-bi,&block_cols[bi][bi],&block_cols[j][bi]);
527 s = -s/block_cols[bi][bi];
528 _axpy(&block_cols[bi][bi],&block_cols[j][bi],m-k-bi,s);
532 betas[k+bi]=-norms[bi];
538 A[k+i][k+bi]=block_cols[bi][i];
541 #ifdef VIENNACL_OPENMP
542 #pragma omp parallel for
543 for(
int section=0;section<(int)num_threads;section++)
545 for(
vcl_size_t section=0;section<num_threads;section++)
548 vcl_size_t start=((n-k-block_size)*(section+0))/num_threads+k+block_size;
549 vcl_size_t end =((n-k-block_size)*(section+1))/num_threads+k+block_size;
553 if(norms[bi]!=ScalarType(0))
558 _axpy(&A[i][start],ss+start,length,A[i][k+bi]);
560 ss[i]=-ss[i]/A[k+bi][k+bi];
562 _axpy(ss+start,&A[i][start],length,A[i][k+bi]);
571 ScalarType beta=A[j][j];
580 delete [] block_cols[i];
581 delete [] block_cols;
T conjIfComplex(T x)
Definition: sse_blas.hpp:69
std::size_t vcl_size_t
Definition: forwards.h:58
optimized BLAS functions using SSE2 and SSE3 intrinsic functions
std::vector< ScalarType > inplace_qr_row_major(ScalarType **A, vcl_size_t m, vcl_size_t n, vcl_size_t block_size=8, vcl_size_t num_threads=1)
Inplace qr factorization of an m x n dense row-major matrix, returning the householder normalization ...
Definition: sse_kernels.hpp:487
std::vector< ScalarType > inplace_qr_col_major(ScalarType **A, vcl_size_t m, vcl_size_t n, vcl_size_t block_size=8)
Inplace qr factorization of an m x n dense column-major matrix, returning the householder normalizati...
Definition: sse_kernels.hpp:407
void eliminateHermitian(ScalarType **A, vcl_size_t row, vcl_size_t from, vcl_size_t to, vcl_size_t width, ScalarType *ss)
Definition: sse_kernels.hpp:73
T _nrm2(const T *, vcl_size_t)
Definition: sse_blas.hpp:117
void reduceHermitianToBandedMatrix(ScalarType **A, vcl_size_t n, vcl_size_t block_size, vcl_size_t num_threads)
Definition: sse_kernels.hpp:159
result_of::size_type< T >::type start(T const &obj)
Definition: start.hpp:43
bool lu_factorize_row_major(ScalarType **A, vcl_size_t m, vcl_size_t n, vcl_size_t *piv=NULL, vcl_size_t block_size=8)
Inplace lu factorization of an m x n dense row-major matrix with optional partial pivoting...
Definition: sse_kernels.hpp:295
void _axpy(const T *, T *, vcl_size_t, T)
Definition: sse_blas.hpp:73
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
Definition: matrix.hpp:910
bool isHermitian(ScalarType **const A, vcl_size_t n)
Definition: sse_kernels.hpp:50
T _dotc(vcl_size_t, const T *, const T *)
Definition: sse_blas.hpp:89
vcl_size_t getHermitianBandwidth(ScalarType **const A, vcl_size_t n)
Definition: sse_kernels.hpp:61
void tridiagonalizeHermitianBandedMatrix(ScalarType **A, vcl_size_t n, vcl_size_t bandwidth)
Definition: sse_kernels.hpp:127
void inplace_tred2(ScalarType **A, vcl_size_t n, vcl_size_t block_size=1, vcl_size_t num_threads=1)
Inplace reduction of a dense n x n row-major or column-major hermitian (or real symmetric) matrix to ...
Definition: sse_kernels.hpp:266