ViennaCL - The Vienna Computing Library  1.5.1
sse_blas.hpp
1 #ifndef VIENNACL_LINALG_HOST_BASED_SSE_BLAS_HPP_
2 #define VIENNACL_LINALG_HOST_BASED_SSE_BLAS_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2014, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the PDF manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
27 //complex BLAS functions are included but unused in this version of ViennaCL
28 #if defined VIENNACL_WITH_COMPLEX
29 #include <complex>
30 #endif
31 
32 //defining VIENNACL_WITH_SSE3 adds a slight optimization for complex multiplication using SSE3
33 #if defined VIENNACL_WITH_SSE3
34 #include <pmmintrin.h>
35 #elif defined VIENNACL_WITH_SSE2
36 #include <emmintrin.h>
37 #endif
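//(Illustrative note, not part of the original header: these code paths are selected purely
// by preprocessor symbols, so a hypothetical build enabling the SSE2 kernels, and optionally
// the SSE3 complex-multiply variant, might pass flags along the lines of
//   g++ -msse2 -DVIENNACL_WITH_SSE2 ...
//   g++ -msse3 -DVIENNACL_WITH_SSE2 -DVIENNACL_WITH_SSE3 ...
// The exact build-system integration is an assumption.)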
38 
39 #include <cstddef>
40 #include <cmath>
41 
42 namespace viennacl
43 {
44  namespace linalg
45  {
46 
47  namespace host_based
48  {
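 //The templates below are scalar reference implementations. When VIENNACL_WITH_SSE2 is
 //defined, explicit specializations further down replace them with SSE kernels for float and
 //double (and, when VIENNACL_WITH_COMPLEX is also defined, for std::complex<float> and
 //std::complex<double>).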
49  //saxpy, daxpy, caxpy, zaxpy
50  template <class T> inline void _axpy(const T*, T*, vcl_size_t, T);
51 
52  //sdot, ddot, cdotu, zdotu
53  template <class T> inline T _dot (vcl_size_t, const T*, const T*);
54 
55  //sdot, ddot, cdotc, zdotc
56  template <class T> inline T _dotc(vcl_size_t, const T*, const T*);
57 
58  //sswap, dswap, cswap, zswap
59  template <class T> inline void _swap(vcl_size_t, T*, T*);
60 
61  //scopy, dcopy, ccopy, zcopy
62  template <class T> inline void _copy(vcl_size_t, T*, T*);
63 
64  //snrm2, dnrm2, euclidean norm of complex vectors
65  template <class T> inline T _nrm2(const T*, vcl_size_t);
66 
67  namespace detail
68  {
69  template <class T> inline T conjIfComplex(T x){return x;}
70  }
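 //conjIfComplex() lets _dotc share one implementation: for real types it is the identity,
 //and the std::complex specializations below (guarded by VIENNACL_WITH_COMPLEX) return the
 //complex conjugate instead.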
71 
72  template <class T>
73  inline void _axpy(const T* x, T* y, vcl_size_t n, T a)
74  {
75  for(vcl_size_t i=0;i<n;i++)
76  y[i]+=a*x[i];
77  }
78 
79  template <class T>
80  inline T _dot(vcl_size_t n, const T* x, const T* y)
81  {
82  T sum(0);
83  for(vcl_size_t i=0;i<n;i++)
84  sum+=x[i]*y[i];
85  return sum;
86  }
87 
88  template <class T>
89  inline T _dotc(vcl_size_t n, const T* x, const T* y)
90  {
91  T sum(0);
92  for(vcl_size_t i=0;i<n;i++)
93  sum+=detail::conjIfComplex(x[i])*y[i];
94  return sum;
95  }
96 
97  template <class T>
98  inline void _swap(vcl_size_t n, T* sx, T* sy)
99  {
100  T t;
101  for(vcl_size_t i=0;i<n;i++)
102  {
103  t=sx[i];
104  sx[i]=sy[i];
105  sy[i]=t;
106  }
107  }
108 
109  template <class T>
110  inline void _copy(vcl_size_t n, T* cx, T* cy)
111  {
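 //note: cy is the source and cx the destination (cx[i] = cy[i])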
112  for(vcl_size_t i=0;i<n;i++)
113  cx[i]=cy[i];
114  }
115 
116  template <class T>
117  inline T _nrm2(const T* x, vcl_size_t n)
118  {
119  //based on http://www.netlib.org/blas/snrm2.f, but works with std::complex
120 
121  if(n<1)
122  return T(0);
123  if(n==1)
124  return std::abs(x[0]);
125  T scale(0);
126  T scaledSquareSum(1);
127  for(vcl_size_t i=0;i<n;i++){
128  if(x[i]!=T(0)){
129  T absXi=std::abs(x[i]);
130  if(std::abs(x[i])>std::abs(scale)){
131  T temp=scale/absXi;
132  scaledSquareSum=T(1)+scaledSquareSum*temp*temp;
133  scale=absXi;
134  }
135  else{
136  T temp=absXi/scale;
137  scaledSquareSum+=temp*temp;
138  }
139  }
140  }
141  return scale*std::sqrt(scaledSquareSum);
142  }
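 //The loop above maintains the invariant
 //  sqrt(sum_i |x_i|^2) == scale * sqrt(scaledSquareSum)
 //with scale equal to the largest magnitude seen so far, so the intermediate squares never
 //overflow or underflow the way a naive sum of squares could.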
143 
144  #if defined VIENNACL_WITH_COMPLEX
145 
146  namespace detail
147  {
148  template <> inline std::complex<double> conjIfComplex(std::complex<double> x){return conj(x);}
149  template <> inline std::complex<float > conjIfComplex(std::complex<float > x){return conj(x);}
150  }
151 
152  template <>
153  inline std::complex<double> _nrm2(const std::complex<double>* x, vcl_size_t n)
154  {
155  //based on http://www.netlib.org/blas/snrm2.f
156 
157  if(n<1)
158  return std::complex<double>(0);
159  if(n==1)
160  return std::abs(x[0]);
161  double scale=0.0;
162  double scaledSquareSum=1.0;
163  for(vcl_size_t i=0;i<n;i++){
164  if(x[i].real()!=0.0){
165  double absXi=std::abs(x[i].real());
166  if(absXi>scale){
167  double temp=scale/absXi;
168  scaledSquareSum=1.0+scaledSquareSum*temp*temp;
169  scale=absXi;
170  }
171  else{
172  double temp=absXi/scale;
173  scaledSquareSum+=temp*temp;
174  }
175  }
176  if(x[i].imag()!=0.0){
177  double absXi=std::abs(x[i].imag());
178  if(absXi>scale){
179  double temp=scale/absXi;
180  scaledSquareSum=1.0+scaledSquareSum*temp*temp;
181  scale=absXi;
182  }
183  else{
184  double temp=absXi/scale;
185  scaledSquareSum+=temp*temp;
186  }
187  }
188  }
189  return std::complex<double>(scale*std::sqrt(scaledSquareSum));
190  }
191 
192  template <>
193  inline std::complex<float> _nrm2(const std::complex<float>* x, vcl_size_t n)
194  {
195  //based on http://www.netlib.org/blas/snrm2.f
196 
197  if(n<1)
198  return std::complex<float>(0);
199  if(n==1)
200  return std::abs(x[0]);
201  float scale=0.0f;
202  float scaledSquareSum=1.0f;
203  for(vcl_size_t i=0;i<n;i++){
204  if(x[i].real()!=0.0){
205  float absXi=std::abs(x[i].real());
206  if(absXi>scale){
207  float temp=scale/absXi;
208  scaledSquareSum=1.0f+scaledSquareSum*temp*temp;
209  scale=absXi;
210  }
211  else{
212  float temp=absXi/scale;
213  scaledSquareSum+=temp*temp;
214  }
215  }
216  if(x[i].imag()!=0.0){
217  float absXi=std::abs(x[i].imag());
218  if(absXi>scale){
219  float temp=scale/absXi;
220  scaledSquareSum=1.0f+scaledSquareSum*temp*temp;
221  scale=absXi;
222  }
223  else{
224  float temp=absXi/scale;
225  scaledSquareSum+=temp*temp;
226  }
227  }
228  }
229  return std::complex<float>(scale*std::sqrt(scaledSquareSum));
230  }
231 
232  #endif //defined VIENNACL_WITH_COMPLEX
233 
234  #if defined VIENNACL_WITH_SSE2
235 
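 //The SSE kernels below share a common structure: a guard falls back to plain scalar code
 //when the vectors are short or when the pointers do not permit 16-byte-aligned SSE loads on
 //both arrays; a scalar prologue advances the pointers to a 16-byte boundary where needed; a
 //vectorized main loop processes the aligned middle section; and a scalar epilogue handles
 //the remaining elements.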
236  //saxpy
237  template <>
238  inline void _axpy<float>(const float* x, float* y, vcl_size_t n, float a)
239  {
240 
241  //if the array is short or if either array is unaligned, perform the non-SSE code
242  if(n<16||((vcl_size_t)x)%16!=((vcl_size_t)y)%16||((vcl_size_t)x)%sizeof(float)!=0)
243  for(vcl_size_t i=0;i<n;i++)
244  y[i]+=a*x[i];
245  else
246  {
247  //process unaligned section of arrays
248  while(((vcl_size_t)x)%16)
249  {
250  if(n<=0)
251  return;
252  y[0]+=a*x[0];
253  x++;
254  y++;
255  n--;
256  }
257 
258  __m128 sum0;
259  __m128 sum1;
260  __m128 reg0,reg1,reg2,reg3;
261  __m128 areg=_mm_set1_ps(a);
262  __m128 prod;
263 
264  //add floats 8 at a time
265  while(n>=8){
266 
267  //read floats into XMM registers (8 from each array)
268  reg0=_mm_load_ps(x+0);
269  reg1=_mm_load_ps(x+4);
270  reg2=_mm_load_ps(y+0);
271  reg3=_mm_load_ps(y+4);
272 
273  //add floats
274  prod=_mm_mul_ps(reg0,areg);
275  sum0=_mm_add_ps(prod,reg2);
276  prod=_mm_mul_ps(reg1,areg);
277  sum1=_mm_add_ps(prod,reg3);
278 
279  //put float sums into y
280  _mm_store_ps(y+0,sum0);
281  _mm_store_ps(y+4,sum1);
282 
283  x+=8;
284  y+=8;
285  n-=8;
286  }
287 
288  //add beyond the last multiple of 8
289  for(vcl_size_t i=0;i<n;i++)
290  y[i]+=a*x[i];
291  }
292  }
293 
294  //daxpy
295  template <>
296  inline void _axpy<double>(const double* x, double* y, vcl_size_t n, double a)
297  {
298  //if the array is short or if either array is unaligned, perform the non-SSE code
299  if(n<16||((vcl_size_t)x)%16!=((vcl_size_t)y)%16||((vcl_size_t)x)%sizeof(double)!=0)
300  for(vcl_size_t i=0;i<n;i++)
301  y[i]+=a*x[i];
302 
303  else
304  {
305  //process unaligned section of arrays
306  while(((vcl_size_t)x)%16)
307  {
308  if(n<=0)
309  return;
310  y[0]+=a*x[0];
311  x++;
312  y++;
313  n--;
314  }
315 
316  __m128d sum0;
317  __m128d sum1;
318  __m128d reg0,reg1,reg2,reg3;
319  __m128d areg=_mm_set1_pd(a);
320  __m128d prod;
321 
322  //add doubles 4 at a time
323  while(n>=4){
324 
325  //read doubles into XMM registers (4 from each array)
326  reg0=_mm_load_pd(x+0);
327  reg1=_mm_load_pd(x+2);
328  reg2=_mm_load_pd(y+0);
329  reg3=_mm_load_pd(y+2);
330 
331  //add doubles
332  prod=_mm_mul_pd(reg0,areg);
333  sum0=_mm_add_pd(prod,reg2);
334  prod=_mm_mul_pd(reg1,areg);
335  sum1=_mm_add_pd(prod,reg3);
336 
337  //put double sums into y
338  _mm_store_pd(y+0,sum0);
339  _mm_store_pd(y+2,sum1);
340 
341  x+=4;
342  y+=4;
343  n-=4;
344  }
345 
346  //add beyond the last multiple of 4
347  for(vcl_size_t i=0;i<n;i++)
348  y[i]+=a*x[i];
349  }
350  }
351 
352  //sdot
353  template <>
354  inline float _dot<float>(vcl_size_t n, const float* x, const float* y)
355  {
356 
357  //if the array is short or if either array is unaligned, perform the non-SSE code
358  if(n<16||((vcl_size_t)x)%16!=((vcl_size_t)y)%16||((vcl_size_t)x)%sizeof(float)!=0)
359  {
360  float sum=0;
361  for(vcl_size_t i=0;i<n;i++)
362  sum+=x[i]*y[i];
363  return sum;
364  }
365  else
366  {
367 
368  //process unaligned section of array
369  float sum=0;
370  while(((vcl_size_t)x)%16)
371  {
372  if(n<=0)
373  return sum;
374  sum+=x[0]*y[0];
375  y++;
376  x++;
377  n--;
378  }
379 
380  __m128 sumReg=_mm_setzero_ps();
381  __m128 reg0,reg1,reg2,reg3;
382 
383  //add floats 8 at a time
384  while(n>=8)
385  {
386  //read floats into XMM registers (8 from each array)
387  reg0=_mm_load_ps(x+0);
388  reg1=_mm_load_ps(x+4);
389  reg2=_mm_load_ps(y+0);
390  reg3=_mm_load_ps(y+4);
391 
392  //multiply floats together
393  reg0=_mm_mul_ps(reg0,reg2);
394  reg1=_mm_mul_ps(reg1,reg3);
395 
396  //add to sums
397  sumReg=_mm_add_ps(sumReg,reg0);
398  sumReg=_mm_add_ps(sumReg,reg1);
399 
400  x+=8;
401  y+=8;
402  n-=8;
403  }
404 
405  //add beyond where the inner loop stopped
406  for(vcl_size_t i=0;i<n;i++)
407  sum+=x[i]*y[i];
408 
409  //move the sums from the xmm registers to aligned memory on the stack
410  float sums[8];
411  float* pSums=(float*)((((vcl_size_t)sums)&(~15))+16);
412  _mm_store_ps(pSums,sumReg);
413 
414  return sum+pSums[0]+pSums[1]+pSums[2]+pSums[3];
415  }
416  }
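 //Note on the reduction above: sumReg holds four partial sums. Because a local array is not
 //guaranteed to be 16-byte aligned, sums[8] is over-allocated and pSums is rounded up to the
 //next 16-byte boundary inside it before the _mm_store_ps; the stored lanes are then added to
 //the scalar tail sum. The double and complex dot products below use the same trick.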
417 
418  //ddot
419  template <>
420  inline double _dot(vcl_size_t n, const double* x, const double* y)
421  {
422  //if the array is short or if either array is unaligned, perform the non-SSE code
423  if(n<16||((vcl_size_t)x)%16!=((vcl_size_t)y)%16||((vcl_size_t)x)%sizeof(double)!=0)
424  {
425  double sum=0;
426  for(vcl_size_t i=0;i<n;i++)
427  sum+=x[i]*y[i];
428  return sum;
429  }
430  else
431  {
432  //process unaligned section of array
433  double sum=0;
434  while(((vcl_size_t)x)%16)
435  {
436  if(n<=0)
437  return sum;
438  sum+=x[0]*y[0];
439  y++;
440  x++;
441  n--;
442  }
443 
444  __m128d sum0=_mm_setzero_pd();
445  __m128d sum1=_mm_setzero_pd();
446  __m128d reg0,reg1,reg2,reg3;
447 
448  //add doubles 4 at a time
449  while(n>=4)
450  {
451  //read doubles into XMM registers (4 from each array)
452  reg0=_mm_load_pd(x+0);
453  reg1=_mm_load_pd(x+2);
454  reg2=_mm_load_pd(y+0);
455  reg3=_mm_load_pd(y+2);
456 
457  //multiply doubles together
458  reg0=_mm_mul_pd(reg0,reg2);
459  reg1=_mm_mul_pd(reg1,reg3);
460 
461  //add to sums
462  sum0=_mm_add_pd(sum0,reg0);
463  sum1=_mm_add_pd(sum1,reg1);
464 
465  x+=4;
466  y+=4;
467  n-=4;
468  }
469 
470  //add beyond where the inner loop stopped
471  for(vcl_size_t i=0;i<n;i++)
472  sum+=x[i]*y[i];
473 
474  //move the sums from the xmm registers to aligned memory on the stack
475  double sums[4];
476  double* pSums=(double*)((((vcl_size_t)sums)&(~15))+16);
477  sum0=_mm_add_pd(sum0,sum1);
478  _mm_store_pd(pSums,sum0);
479 
480  return sum+pSums[0]+pSums[1];
481  }
482  }
483 
484  //conjugated dot products are the same as non-conjugated dot products for real numbers
485  template <> inline float _dotc<float >(vcl_size_t n, const float *x, const float *y){return _dot(n,x,y);}
486  template <> inline double _dotc<double>(vcl_size_t n, const double *x, const double *y){return _dot(n,x,y);}
487 
488  #if defined VIENNACL_WITH_COMPLEX
489 
490  //caxpy
491  template <>
492  inline void _axpy<std::complex<float> >(const std::complex<float>* x, std::complex<float>* y, vcl_size_t n, std::complex<float> a)
493  {
494  //if the array is short or if either array is unaligned, perform the non-SSE code
495  if(n<16||((vcl_size_t)x)%16!=((vcl_size_t)y)%16||((vcl_size_t)x)%sizeof(std::complex<float>)!=0)
496  for(vcl_size_t i=0;i<n;i++)
497  y[i]+=a*x[i];
498 
499  else
500  {
501  //process unaligned section of arrays
502  while(((vcl_size_t)x)%16)
503  {
504  if(n<=0)
505  return;
506  y[0]+=a*x[0];
507  x++;
508  y++;
509  n--;
510  }
511 
512  __m128 reg0,reg1,reg2,reg3,reg4;
513  __m128 areg0=_mm_set_ps(a.imag(),a.real(),a.imag(),a.real());
514  __m128 areg1=_mm_set_ps(a.real(),a.imag(),a.real(),a.imag());
515  #ifndef VIENNACL_WITH_SSE3
516  __m128 nreg=_mm_set_ps(1.0f,-1.0f,1.0f,-1.0f);
517  #endif
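 //With x stored as (real,imag) pairs, areg0 holds (a.re, a.im, a.re, a.im) and areg1 holds
 //(a.im, a.re, a.im, a.re). Duplicating the real parts of x into one register and the
 //imaginary parts into another, multiplying by areg0 and areg1 respectively, and combining
 //with addsub (or the nreg sign mask on plain SSE2) yields
 //(a.re*x.re - a.im*x.im, a.re*x.im + a.im*x.re), i.e. the complex product a*x in each slot.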
518 
519  //add complex floats 4 at a time
520  while(n>=4)
521  {
522  //read floats into XMM registers (8 from each array)
523  reg0=_mm_load_ps((float*)(x+0));
524  reg1=_mm_load_ps((float*)(x+2));
525  reg2=_mm_load_ps((float*)(y+0));
526  reg3=_mm_load_ps((float*)(y+2));
527 
528  //do complex multiplication and addition
529  #ifndef VIENNACL_WITH_SSE3
530  reg4=_mm_shuffle_ps(reg0,reg0,0xA0);
531  reg0=_mm_shuffle_ps(reg0,reg0,0xF5);
532  reg4=_mm_mul_ps(reg4,areg0);
533  reg0=_mm_mul_ps(reg0,areg1);
534  reg0=_mm_mul_ps(reg0,nreg);
535  reg0=_mm_add_ps(reg4,reg0);
536  reg0=_mm_add_ps(reg0,reg2);
537  reg4=_mm_shuffle_ps(reg1,reg1,0xA0);
538  reg1=_mm_shuffle_ps(reg1,reg1,0xF5);
539  reg4=_mm_mul_ps(reg4,areg0);
540  reg1=_mm_mul_ps(reg1,areg1);
541  reg1=_mm_mul_ps(reg1,nreg);
542  reg1=_mm_add_ps(reg4,reg1);
543  reg1=_mm_add_ps(reg1,reg3);
544  #else
545  reg4=_mm_moveldup_ps(reg0);
546  reg0=_mm_movehdup_ps(reg0);
547  reg4=_mm_mul_ps(reg4,areg0);
548  reg0=_mm_mul_ps(reg0,areg1);
549  reg0=_mm_addsub_ps(reg4,reg0);
550  reg0=_mm_add_ps(reg0,reg2);
551  reg4=_mm_moveldup_ps(reg1);
552  reg1=_mm_movehdup_ps(reg1);
553  reg4=_mm_mul_ps(reg4,areg0);
554  reg1=_mm_mul_ps(reg1,areg1);
555  reg1=_mm_addsub_ps(reg4,reg1);
556  reg1=_mm_add_ps(reg1,reg3);
557  #endif
558  //put results into y
559  _mm_store_ps((float*)(y+0),reg0);
560  _mm_store_ps((float*)(y+2),reg1);
561 
562  x+=4;
563  y+=4;
564  n-=4;
565  }
566 
567  //add beyond the last multiple of 4
568  for(vcl_size_t i=0;i<n;i++)
569  y[i]+=a*x[i];
570  }
571  }
572 
573  //zaxpy
574  template <>
575  inline void _axpy<std::complex<double> >(const std::complex<double>* x, std::complex<double>* y, vcl_size_t n, std::complex<double> a)
576  {
577  //if the array is short or if either array is unaligned, perform the non-SSE code
578  if(n<16||((vcl_size_t)x)%16||((vcl_size_t)y)%16)
579  for(vcl_size_t i=0;i<n;i++)
580  y[i]+=a*x[i];
581 
582  else
583  {
584  __m128d reg0,reg1,reg2,reg3,reg4;
585  __m128d areg0=_mm_set_pd(a.imag(),a.real());
586  __m128d areg1=_mm_set_pd(a.real(),a.imag());
587  #ifndef VIENNACL_WITH_SSE3
588  __m128d nreg=_mm_set_pd(1.0,-1.0);
589  #endif
590 
591  //add complex doubles 2 at a time
592  while(n>=2)
593  {
594  //read doubles into XMM registers (4 from each array)
595  reg0=_mm_load_pd((double*)(x+0));
596  reg1=_mm_load_pd((double*)(x+1));
597  reg2=_mm_load_pd((double*)(y+0));
598  reg3=_mm_load_pd((double*)(y+1));
599 
600  //do complex multiplication and addition
601  #ifndef VIENNACL_WITH_SSE3
602  reg4=_mm_shuffle_pd(reg0,reg0,0x0);
603  reg0=_mm_shuffle_pd(reg0,reg0,0x3);
604  reg4=_mm_mul_pd(reg4,areg0);
605  reg0=_mm_mul_pd(reg0,areg1);
606  reg0=_mm_mul_pd(reg0,nreg);
607  reg0=_mm_add_pd(reg4,reg0);
608  reg0=_mm_add_pd(reg0,reg2);
609  reg4=_mm_shuffle_pd(reg1,reg1,0x0);
610  reg1=_mm_shuffle_pd(reg1,reg1,0x3);
611  reg4=_mm_mul_pd(reg4,areg0);
612  reg1=_mm_mul_pd(reg1,areg1);
613  reg1=_mm_mul_pd(reg1,nreg);
614  reg1=_mm_add_pd(reg4,reg1);
615  reg1=_mm_add_pd(reg1,reg3);
616  #else
617  reg4=_mm_shuffle_pd(reg0,reg0,0x0);
618  reg0=_mm_shuffle_pd(reg0,reg0,0x3);
619  reg4=_mm_mul_pd(reg4,areg0);
620  reg0=_mm_mul_pd(reg0,areg1);
621  reg0=_mm_addsub_pd(reg4,reg0);
622  reg0=_mm_add_pd(reg0,reg2);
623  reg4=_mm_shuffle_pd(reg1,reg1,0x0);
624  reg1=_mm_shuffle_pd(reg1,reg1,0x3);
625  reg4=_mm_mul_pd(reg4,areg0);
626  reg1=_mm_mul_pd(reg1,areg1);
627  reg1=_mm_addsub_pd(reg4,reg1);
628  reg1=_mm_add_pd(reg1,reg3);
629  #endif
630  //put results into y
631  _mm_store_pd((double*)(y+0),reg0);
632  _mm_store_pd((double*)(y+1),reg1);
633 
634  x+=2;
635  y+=2;
636  n-=2;
637  }
638 
639  //add beyond the last multiple of 2
640  if(n)
641  y[0]+=a*x[0];
642  }
643  }
644 
645  //cdotu
646  template <>
647  inline std::complex<float> _dot<std::complex<float> >(vcl_size_t n, const std::complex<float>* x, const std::complex<float>* y)
648  {
649  //if the array is short or if either array is unaligned, perform the non-SSE code
650  if(n<16||((vcl_size_t)x)%16!=((vcl_size_t)y)%16||((vcl_size_t)x)%sizeof(std::complex<float>)!=0)
651  {
652  std::complex<float> sum(0);
653  for(vcl_size_t i=0;i<n;i++)
654  sum+=x[i]*y[i];
655  return sum;
656  }
657  else
658  {
659  //process unaligned section of arrays
660  std::complex<float> sum(0);
661  while(((vcl_size_t)x)%16)
662  {
663  if(n<=0)
664  return sum;
665  sum+=x[0]*y[0];
666  y++;
667  x++;
668  n--;
669  }
670 
671  __m128 sumReg=_mm_setzero_ps();
672  __m128 reg0,reg1,reg2,reg3,reg4;
673  #ifndef VIENNACL_WITH_SSE3
674  __m128 nreg=_mm_set_ps(1.0f,-1.0f,1.0f,-1.0f);
675  #endif
676 
677  //add complex floats 4 at a time
678  while(n>=4)
679  {
680  //read floats into XMM registers (8 from each array)
681  reg0=_mm_load_ps((float*)(x+0));
682  reg1=_mm_load_ps((float*)(x+2));
683  reg2=_mm_load_ps((float*)(y+0));
684  reg3=_mm_load_ps((float*)(y+2));
685 
686  //multiply complex floats together
687  #ifndef VIENNACL_WITH_SSE3
688  reg4=_mm_shuffle_ps(reg2,reg2,0xA0);
689  reg2=_mm_shuffle_ps(reg2,reg2,0xF5);
690  reg4=_mm_mul_ps(reg4,reg0);
691  reg2=_mm_mul_ps(reg2,reg0);
692  reg2=_mm_shuffle_ps(reg2,reg2,0xB1);
693  reg2=_mm_mul_ps(reg2,nreg);
694  reg0=_mm_add_ps(reg4,reg2);
695  reg4=_mm_shuffle_ps(reg3,reg3,0xA0);
696  reg3=_mm_shuffle_ps(reg3,reg3,0xF5);
697  reg4=_mm_mul_ps(reg4,reg1);
698  reg3=_mm_mul_ps(reg3,reg1);
699  reg3=_mm_shuffle_ps(reg3,reg3,0xB1);
700  reg3=_mm_mul_ps(reg3,nreg);
701  reg1=_mm_add_ps(reg4,reg3);
702  #else
703  reg4=_mm_moveldup_ps(reg2);
704  reg2=_mm_movehdup_ps(reg2);
705  reg4=_mm_mul_ps(reg4,reg0);
706  reg2=_mm_mul_ps(reg2,reg0);
707  reg2=_mm_shuffle_ps(reg2,reg2,0xB1);
708  reg0=_mm_addsub_ps(reg4,reg2);
709  reg4=_mm_moveldup_ps(reg3);
710  reg3=_mm_movehdup_ps(reg3);
711  reg4=_mm_mul_ps(reg4,reg1);
712  reg3=_mm_mul_ps(reg3,reg1);
713  reg3=_mm_shuffle_ps(reg3,reg3,0xB1);
714  reg1=_mm_addsub_ps(reg4,reg3);
715  #endif
716 
717  //add to sum
718  sumReg=_mm_add_ps(sumReg,reg0);
719  sumReg=_mm_add_ps(sumReg,reg1);
720 
721  x+=4;
722  y+=4;
723  n-=4;
724  }
725 
726  //add beyond where the inner loop stopped
727  for(vcl_size_t i=0;i<n;i++)
728  sum+=x[i]*y[i];
729 
730  //move the sums from the xmm registers to aligned memory on the stack
731  std::complex<float> sums[4];
732  std::complex<float>* pSums=(std::complex<float>*)((((vcl_size_t)sums)&(~15))+16);
733  pSums[0]=std::complex<float>(0);
734  pSums[1]=std::complex<float>(0);
735  _mm_store_ps((float*)pSums,sumReg);
736 
737  return sum+pSums[0]+pSums[1];
738  }
739  }
740 
741  //zdotu
742  template <>
743  inline std::complex<double> _dot<std::complex<double> >(vcl_size_t n, const std::complex<double>* x, const std::complex<double>* y)
744  {
745  //if the array is short or if either array is unaligned, perform the non-SSE code
746  if(n<16||((vcl_size_t)x)%16||((vcl_size_t)y)%16)
747  {
748  std::complex<double> sum(0);
749  for(vcl_size_t i=0;i<n;i++)
750  sum+=x[i]*y[i];
751  return sum;
752  }
753  else
754  {
755  __m128d sumReg=_mm_setzero_pd();
756  __m128d reg0,reg1,reg2,reg3,reg4;
757  #ifndef VIENNACL_WITH_SSE3
758  __m128d nreg=_mm_set_pd(1.0,-1.0);
759  #endif
760 
761  //add complex doubles 2 at a time
762  while(n>=2)
763  {
764  //read doubles into XMM registers (4 from each array)
765  reg0=_mm_load_pd((double*)(x+0));
766  reg1=_mm_load_pd((double*)(x+1));
767  reg2=_mm_load_pd((double*)(y+0));
768  reg3=_mm_load_pd((double*)(y+1));
769 
770  //multiply complex doubles together
771  #ifndef VIENNACL_WITH_SSE3
772  reg4=_mm_shuffle_pd(reg2,reg2,0x0);
773  reg2=_mm_shuffle_pd(reg2,reg2,0x3);
774  reg4=_mm_mul_pd(reg4,reg0);
775  reg2=_mm_mul_pd(reg2,reg0);
776  reg2=_mm_shuffle_pd(reg2,reg2,0x1);
777  reg2=_mm_mul_pd(reg2,nreg);
778  reg0=_mm_add_pd(reg4,reg2);
779  reg4=_mm_shuffle_pd(reg3,reg3,0x0);
780  reg3=_mm_shuffle_pd(reg3,reg3,0x3);
781  reg4=_mm_mul_pd(reg4,reg1);
782  reg3=_mm_mul_pd(reg3,reg1);
783  reg3=_mm_shuffle_pd(reg3,reg3,0x1);
784  reg3=_mm_mul_pd(reg3,nreg);
785  reg1=_mm_add_pd(reg4,reg3);
786  #else
787  reg4=_mm_shuffle_pd(reg2,reg2,0x0);
788  reg2=_mm_shuffle_pd(reg2,reg2,0x3);
789  reg4=_mm_mul_pd(reg4,reg0);
790  reg2=_mm_mul_pd(reg2,reg0);
791  reg2=_mm_shuffle_pd(reg2,reg2,0x1);
792  reg0=_mm_addsub_pd(reg4,reg2);
793  reg4=_mm_shuffle_pd(reg3,reg3,0x0);
794  reg3=_mm_shuffle_pd(reg3,reg3,0x3);
795  reg4=_mm_mul_pd(reg4,reg1);
796  reg3=_mm_mul_pd(reg3,reg1);
797  reg3=_mm_shuffle_pd(reg3,reg3,0x1);
798  reg1=_mm_addsub_pd(reg4,reg3);
799  #endif
800 
801  //add to sum
802  sumReg=_mm_add_pd(sumReg,reg0);
803  sumReg=_mm_add_pd(sumReg,reg1);
804 
805  x+=2;
806  y+=2;
807  n-=2;
808  }
809 
810  //add beyond where the inner loop stopped
811  std::complex<double> sum(0);
812  if(n)
813  sum=x[0]*y[0];
814 
815  //move the sums from the xmm registers to aligned memory on the stack
816  std::complex<double> sums[2];
817  std::complex<double>* pSums=(std::complex<double>*)((((vcl_size_t)sums)&(~15))+16);
818  pSums[0]=std::complex<double>(0);
819  _mm_store_pd((double*)pSums,sumReg);
820 
821  return sum+pSums[0];
822  }
823  }
824 
825  //cdotc
826  template <>
827  inline std::complex<float> _dotc<std::complex<float> >(vcl_size_t n, const std::complex<float>* x, const std::complex<float>* y)
828  {
829  //if the array is short or if either array is unaligned, perform the non-SSE code
830  if(n<16||((vcl_size_t)x)%16!=((vcl_size_t)y)%16||((vcl_size_t)x)%sizeof(std::complex<float>)!=0)
831  {
832  std::complex<float> sum(0);
833  for(vcl_size_t i=0;i<n;i++)
834  sum+=conj(x[i])*y[i];
835  return sum;
836  }
837  else
838  {
839  //process unaligned section of arrays
840  std::complex<float> sum(0);
841  while(((vcl_size_t)x)%16)
842  {
843  if(n<=0)
844  return sum;
845  sum+=conj(x[0])*y[0];
846  y++;
847  x++;
848  n--;
849  }
850 
851  __m128 sumReg=_mm_setzero_ps();
852  __m128 reg0,reg1,reg2,reg3,reg4;
853  #ifndef VIENNACL_WITH_SSE3
854  __m128 nreg=_mm_set_ps(1.0f,-1.0f,1.0f,-1.0f);
855  #endif
856 
857  //add complex floats 4 at a time
858  while(n>=4)
859  {
860  //read floats into XMM registers (8 from each array)
861  reg0=_mm_load_ps((float*)(x+0));
862  reg1=_mm_load_ps((float*)(x+2));
863  reg2=_mm_load_ps((float*)(y+0));
864  reg3=_mm_load_ps((float*)(y+2));
865 
866  //multiply complex floats together
867  #ifndef VIENNACL_WITH_SSE3
868  reg4=_mm_shuffle_ps(reg2,reg2,0xA0);
869  reg2=_mm_shuffle_ps(reg2,reg2,0xF5);
870  reg4=_mm_mul_ps(reg4,reg0);
871  reg2=_mm_mul_ps(reg2,reg0);
872  reg4=_mm_shuffle_ps(reg4,reg4,0xB1);
873  reg4=_mm_mul_ps(reg4,nreg);
874  reg0=_mm_add_ps(reg4,reg2);
875  reg4=_mm_shuffle_ps(reg3,reg3,0xA0);
876  reg3=_mm_shuffle_ps(reg3,reg3,0xF5);
877  reg4=_mm_mul_ps(reg4,reg1);
878  reg3=_mm_mul_ps(reg3,reg1);
879  reg4=_mm_shuffle_ps(reg4,reg4,0xB1);
880  reg4=_mm_mul_ps(reg4,nreg);
881  reg1=_mm_add_ps(reg4,reg3);
882  #else
883  reg4=_mm_moveldup_ps(reg2);
884  reg2=_mm_movehdup_ps(reg2);
885  reg4=_mm_mul_ps(reg4,reg0);
886  reg2=_mm_mul_ps(reg2,reg0);
887  reg4=_mm_shuffle_ps(reg4,reg4,0xB1);
888  reg0=_mm_addsub_ps(reg2,reg4);
889  reg4=_mm_moveldup_ps(reg3);
890  reg3=_mm_movehdup_ps(reg3);
891  reg4=_mm_mul_ps(reg4,reg1);
892  reg3=_mm_mul_ps(reg3,reg1);
893  reg4=_mm_shuffle_ps(reg4,reg4,0xB1);
894  reg1=_mm_addsub_ps(reg3,reg4);
895  #endif
896 
897  //add to sum
898  sumReg=_mm_add_ps(sumReg,reg0);
899  sumReg=_mm_add_ps(sumReg,reg1);
900 
901  x+=4;
902  y+=4;
903  n-=4;
904  }
905 
906  //add beyond where the inner loop stopped
907  for(vcl_size_t i=0;i<n;i++)
908  sum+=conj(x[i])*y[i];
909 
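 //In this conjugated product the multiply/addsub arrangement above leaves the real and
 //imaginary parts swapped within each complex slot of sumReg, which is why the shuffle below
 //(mask 0xB1) swaps them back before the store. The double-precision zdotc below fixes up its
 //accumulator the same way with _mm_shuffle_pd.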
910  //move the sums from the xmm registers to aligned memory on the stack
911  std::complex<float> sums[4];
912  std::complex<float>* pSums=(std::complex<float>*)((((vcl_size_t)sums)&(~15))+16);
913  sumReg=_mm_shuffle_ps(sumReg,sumReg,0xB1);//swap real and imag
914  _mm_store_ps((float*)pSums,sumReg);
915 
916  return sum+pSums[0]+pSums[1];
917  }
918  }
919 
920  //zdotc
921  template <>
922  inline std::complex<double> _dotc<std::complex<double> >(vcl_size_t n, const std::complex<double>* x, const std::complex<double>* y)
923  {
924  //if the array is short or if either array is unaligned, perform the non-SSE code
925  if(n<16||((vcl_size_t)x)%16||((vcl_size_t)y)%16)
926  {
927  std::complex<double> sum(0);
928  for(vcl_size_t i=0;i<n;i++)
929  sum+=conj(x[i])*y[i];
930  return sum;
931  }
932  else
933  {
934  __m128d sumReg=_mm_setzero_pd();
935  __m128d reg0,reg1,reg2,reg3,reg4;
936  #ifndef VIENNACL_WITH_SSE3
937  __m128d nreg=_mm_set_pd(1.0,-1.0);
938  #endif
939 
940  //add complex doubles 2 at a time
941  while(n>=2)
942  {
943  //read doubles into XMM registers (4 from each array)
944  reg0=_mm_load_pd((double*)(x+0));
945  reg1=_mm_load_pd((double*)(x+1));
946  reg2=_mm_load_pd((double*)(y+0));
947  reg3=_mm_load_pd((double*)(y+1));
948 
949  //multiply complex doubles together
950  #ifndef VIENNACL_WITH_SSE3
951  reg4=_mm_shuffle_pd(reg2,reg2,0x0);
952  reg2=_mm_shuffle_pd(reg2,reg2,0x3);
953  reg4=_mm_mul_pd(reg4,reg0);
954  reg2=_mm_mul_pd(reg2,reg0);
955  reg4=_mm_shuffle_pd(reg4,reg4,0x1);
956  reg4=_mm_mul_pd(reg4,nreg);
957  reg0=_mm_add_pd(reg4,reg2);
958  reg4=_mm_shuffle_pd(reg3,reg3,0x0);
959  reg3=_mm_shuffle_pd(reg3,reg3,0x3);
960  reg4=_mm_mul_pd(reg4,reg1);
961  reg3=_mm_mul_pd(reg3,reg1);
962  reg4=_mm_shuffle_pd(reg4,reg4,0x1);
963  reg4=_mm_mul_pd(reg4,nreg);
964  reg1=_mm_add_pd(reg4,reg3);
965  #else
966  reg4=_mm_shuffle_pd(reg2,reg2,0x0);
967  reg2=_mm_shuffle_pd(reg2,reg2,0x3);
968  reg4=_mm_mul_pd(reg4,reg0);
969  reg2=_mm_mul_pd(reg2,reg0);
970  reg4=_mm_shuffle_pd(reg4,reg4,0x1);
971  reg0=_mm_addsub_pd(reg2,reg4);
972  reg4=_mm_shuffle_pd(reg3,reg3,0x0);
973  reg3=_mm_shuffle_pd(reg3,reg3,0x3);
974  reg4=_mm_mul_pd(reg4,reg1);
975  reg3=_mm_mul_pd(reg3,reg1);
976  reg4=_mm_shuffle_pd(reg4,reg4,0x1);
977  reg1=_mm_addsub_pd(reg3,reg4);
978 
979  #endif
980 
981  //add to sum
982  sumReg=_mm_add_pd(sumReg,reg0);
983  sumReg=_mm_add_pd(sumReg,reg1);
984 
985  x+=2;
986  y+=2;
987  n-=2;
988  }
989 
990  //add beyond where the inner loop stopped
991  std::complex<double> sum(0);
992  if(n)
993  sum=conj(x[0])*y[0];
994 
995  //move the sums from the xmm registers to aligned memory on the stack
996  std::complex<double> sums[2];
997  std::complex<double>* pSums=(std::complex<double>*)((((vcl_size_t)sums)&(~15))+16);
998  sumReg=_mm_shuffle_pd(sumReg,sumReg,0x1);//swap real and imag
999  _mm_store_pd((double*)pSums,sumReg);
1000 
1001  return sum+pSums[0];
1002  }
1003  }
1004 
1005  #endif //defined VIENNACL_WITH_COMPLEX
1006 
1007  #endif //defined VIENNACL_WITH_SSE2
1008 
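 //Usage sketch (illustrative only; these kernels are internal helpers that ViennaCL's
 //host-based backend calls with raw pointers):
 //  std::vector<float> x(1000, 1.0f), y(1000, 2.0f);
 //  viennacl::linalg::host_based::_axpy(&x[0], &y[0], x.size(), 3.0f); // y += 3*x
 //  float d = viennacl::linalg::host_based::_dot(x.size(), &x[0], &y[0]);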
1009  } //namespace host_based
1010  } //namespace linalg
1011 } //namespace viennacl
1012 
1013 #endif