/* GDAL — gdalsse_priv.h (SSE2 helper) */
1 /******************************************************************************
2  * $Id: gdalsse_priv.h 28877 2015-04-08 23:11:36Z rouault $
3  *
4  * Project: GDAL
5  * Purpose: SSE2 helper
6  * Author: Even Rouault <even dot rouault at spatialys dot com>
7  *
8  ******************************************************************************
9  * Copyright (c) 2014, Even Rouault <even dot rouault at spatialys dot com>
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a
12  * copy of this software and associated documentation files (the "Software"),
13  * to deal in the Software without restriction, including without limitation
14  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
15  * and/or sell copies of the Software, and to permit persons to whom the
16  * Software is furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included
19  * in all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
22  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
24  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27  * DEALINGS IN THE SOFTWARE.
28  ****************************************************************************/
29 
30 #ifndef GDALSSE_PRIV_H_INCLUDED
31 #define GDALSSE_PRIV_H_INCLUDED
32 
33 /* We restrict to 64bit processors because they are guaranteed to have SSE2 */
34 /* Could possibly be used too on 32bit, but we would need to check at runtime */
35 #if (defined(__x86_64) || defined(_M_X64)) && !defined(USE_SSE2_EMULATION)
36 
37 /* Requires SSE2 */
38 #include <emmintrin.h>
39 #include <string.h>
40 
/**
 * Wrapper around one SSE2 register (__m128d) holding two packed doubles.
 *
 * Provides load helpers that widen 8/16-bit integer and float inputs to
 * double, basic arithmetic operators, and horizontal addition.  All loads
 * from possibly-unaligned memory go through memcpy or unaligned intrinsics
 * to avoid undefined behaviour.
 */
class XMMReg2Double
{
  public:
    __m128d xmm;

    XMMReg2Double() {}
    /* Low lane = val, high lane = 0.0 (semantics of _mm_load_sd). */
    XMMReg2Double(double val) { xmm = _mm_load_sd(&val); }
    XMMReg2Double(const XMMReg2Double& other) : xmm(other.xmm) {}

    /* Returns a register with both lanes set to 0.0. */
    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    /* Loads two doubles from (possibly unaligned) ptr. */
    static inline XMMReg2Double Load2Val(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    /* Loads two floats and widens them to double. */
    static inline XMMReg2Double Load2Val(const float* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    /* Loads two doubles from 16-byte aligned ptr. */
    static inline XMMReg2Double Load2ValAligned(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    /* Loads two unsigned bytes and widens them to double. */
    static inline XMMReg2Double Load2Val(const unsigned char* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    /* Loads two signed 16-bit values and widens them to double. */
    static inline XMMReg2Double Load2Val(const short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    /* Loads two unsigned 16-bit values and widens them to double. */
    static inline XMMReg2Double Load2Val(const unsigned short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    inline void nsLoad2Val(const double* ptr)
    {
        xmm = _mm_loadu_pd(ptr);
    }

    inline void nsLoad2ValAligned(const double* pval)
    {
        xmm = _mm_load_pd(pval);
    }

    inline void nsLoad2Val(const float* pval)
    {
        __m128 temp1 = _mm_load_ss(pval);
        __m128 temp2 = _mm_load_ss(pval + 1);
        /* Pack the two scalars into lanes 0 and 1, then widen to double. */
        temp1 = _mm_shuffle_ps(temp1, temp2, _MM_SHUFFLE(1,0,1,0));
        temp1 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3,3,2,0));
        xmm = _mm_cvtps_pd(temp1);
    }

    inline void nsLoad2Val(const unsigned char* ptr)
    {
        /* memcpy rather than *(unsigned short*)ptr: ptr may be misaligned
           and the cast would violate strict aliasing (undefined behaviour). */
        unsigned short s;
        memcpy(&s, ptr, 2);
        __m128i xmm_i = _mm_cvtsi32_si128(s);
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const short* ptr)
    {
        int i;
        memcpy(&i, ptr, 4);
        __m128i xmm_i = _mm_cvtsi32_si128(i);
        xmm_i = _mm_unpacklo_epi16(xmm_i,xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
        xmm_i = _mm_srai_epi32(xmm_i, 16);       /* sign-extend: 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const unsigned short* ptr)
    {
        int i;
        memcpy(&i, ptr, 4);
        __m128i xmm_i = _mm_cvtsi32_si128(i);
        xmm_i = _mm_unpacklo_epi16(xmm_i,xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
        xmm_i = _mm_srli_epi32(xmm_i, 16);       /* zero-extend: 0|0|0|0|b|b|a|a --> 0|0|0|0|0|b|0|a */
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    /* Loads four unsigned bytes: ptr[0..1] into low, ptr[2..3] into high. */
    static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        /* memcpy rather than *(int*)ptr: avoids misaligned / type-punned
           read (undefined behaviour). */
        int i;
        memcpy(&i, ptr, 4);
        __m128i xmm_i = _mm_cvtsi32_si128(i);
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
        low.xmm = _mm_cvtepi32_pd(xmm_i);
        high.xmm = _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i,_MM_SHUFFLE(3,2,3,2)));
    }

    static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        __m128 temp1 = _mm_loadu_ps(ptr);
        __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3,2,3,2));
        low.xmm = _mm_cvtps_pd(temp1);
        high.xmm = _mm_cvtps_pd(temp2);
    }

    /* Sets both lanes to 0.0. */
    inline void Zeroize()
    {
        xmm = _mm_setzero_pd();
    }

    inline const XMMReg2Double& operator= (const XMMReg2Double& other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline const XMMReg2Double& operator+= (const XMMReg2Double& other)
    {
        xmm = _mm_add_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double operator+ (const XMMReg2Double& other)
    {
        XMMReg2Double ret;
        ret.xmm = _mm_add_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator- (const XMMReg2Double& other)
    {
        XMMReg2Double ret;
        ret.xmm = _mm_sub_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator* (const XMMReg2Double& other)
    {
        XMMReg2Double ret;
        ret.xmm = _mm_mul_pd(xmm, other.xmm);
        return ret;
    }

    inline const XMMReg2Double& operator*= (const XMMReg2Double& other)
    {
        xmm = _mm_mul_pd(xmm, other.xmm);
        return *this;
    }

    /* Horizontal add: both lanes become low + high. */
    inline void AddLowAndHigh()
    {
        __m128d xmm2;
        xmm2 = _mm_shuffle_pd(xmm,xmm,_MM_SHUFFLE2(0,1)); /* transfer high word into low word of xmm2 */
        xmm = _mm_add_pd(xmm, xmm2);
    }

    /* Stores both lanes to (possibly unaligned) pval[0..1]. */
    inline void Store2Double(double* pval)
    {
        _mm_storeu_pd(pval, xmm);
    }

    /* Stores both lanes to 16-byte aligned pval[0..1]. */
    inline void Store2DoubleAligned(double* pval)
    {
        _mm_store_pd(pval, xmm);
    }

    /* Returns the low lane. */
    inline operator double () const
    {
        double val;
        _mm_store_sd(&val, xmm);
        return val;
    }
};
249 
250 #else
251 
252 #warning "Software emulation of SSE2 !"
253 
255 {
256  public:
257  double low;
258  double high;
259 
260  XMMReg2Double() {}
261  XMMReg2Double(double val) { low = val; high = 0.0; }
262  XMMReg2Double(const XMMReg2Double& other) : low(other.low), high(other.high) {}
263 
264  static inline XMMReg2Double Zero()
265  {
266  XMMReg2Double reg;
267  reg.Zeroize();
268  return reg;
269  }
270 
271  static inline XMMReg2Double Load2Val(const double* ptr)
272  {
273  XMMReg2Double reg;
274  reg.nsLoad2Val(ptr);
275  return reg;
276  }
277 
278  static inline XMMReg2Double Load2ValAligned(const double* ptr)
279  {
280  XMMReg2Double reg;
281  reg.nsLoad2ValAligned(ptr);
282  return reg;
283  }
284 
285  static inline XMMReg2Double Load2Val(const float* ptr)
286  {
287  XMMReg2Double reg;
288  reg.nsLoad2Val(ptr);
289  return reg;
290  }
291 
292  static inline XMMReg2Double Load2Val(const unsigned char* ptr)
293  {
294  XMMReg2Double reg;
295  reg.nsLoad2Val(ptr);
296  return reg;
297  }
298 
299  static inline XMMReg2Double Load2Val(const short* ptr)
300  {
301  XMMReg2Double reg;
302  reg.nsLoad2Val(ptr);
303  return reg;
304  }
305 
306  inline void nsLoad2Val(const double* pval)
307  {
308  low = pval[0];
309  high = pval[1];
310  }
311 
312  inline void nsLoad2ValAligned(const double* pval)
313  {
314  low = pval[0];
315  high = pval[1];
316  }
317 
318  inline void nsLoad2Val(const float* pval)
319  {
320  low = pval[0];
321  high = pval[1];
322  }
323 
324  inline void nsLoad2Val(const unsigned char* ptr)
325  {
326  low = ptr[0];
327  high = ptr[1];
328  }
329 
330  inline void nsLoad2Val(const short* ptr)
331  {
332  low = ptr[0];
333  high = ptr[1];
334  }
335 
336  inline void nsLoad2Val(const unsigned short* ptr)
337  {
338  low = ptr[0];
339  high = ptr[1];
340  }
341 
342  static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
343  {
344  low.low = ptr[0];
345  low.high = ptr[1];
346  high.low = ptr[2];
347  high.high = ptr[3];
348  }
349 
350  static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
351  {
352  low.nsLoad2Val(ptr);
353  high.nsLoad2Val(ptr+2);
354  }
355 
356  static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
357  {
358  low.nsLoad2Val(ptr);
359  high.nsLoad2Val(ptr+2);
360  }
361 
362  static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
363  {
364  low.nsLoad2Val(ptr);
365  high.nsLoad2Val(ptr+2);
366  }
367 
368  static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
369  {
370  low.nsLoad2Val(ptr);
371  high.nsLoad2Val(ptr+2);
372  }
373 
374  inline void Zeroize()
375  {
376  low = 0.0;
377  high = 0.0;
378  }
379 
380  inline const XMMReg2Double& operator= (const XMMReg2Double& other)
381  {
382  low = other.low;
383  high = other.high;
384  return *this;
385  }
386 
387  inline const XMMReg2Double& operator+= (const XMMReg2Double& other)
388  {
389  low += other.low;
390  high += other.high;
391  return *this;
392  }
393 
394  inline XMMReg2Double operator+ (const XMMReg2Double& other)
395  {
396  XMMReg2Double ret;
397  ret.low = low + other.low;
398  ret.high = high + other.high;
399  return ret;
400  }
401 
402  inline XMMReg2Double operator- (const XMMReg2Double& other)
403  {
404  XMMReg2Double ret;
405  ret.low = low - other.low;
406  ret.high = high - other.high;
407  return ret;
408  }
409 
410  inline XMMReg2Double operator* (const XMMReg2Double& other)
411  {
412  XMMReg2Double ret;
413  ret.low = low * other.low;
414  ret.high = high * other.high;
415  return ret;
416  }
417 
418  inline const XMMReg2Double& operator*= (const XMMReg2Double& other)
419  {
420  low *= other.low;
421  high *= other.high;
422  return *this;
423  }
424 
425  inline void AddLowAndHigh()
426  {
427  double add = low + high;
428  low = add;
429  high = add;
430  }
431 
432  inline void Store2Double(double* pval)
433  {
434  pval[0] = low;
435  pval[1] = high;
436  }
437 
438  inline void Store2DoubleAligned(double* pval)
439  {
440  pval[0] = low;
441  pval[1] = high;
442  }
443 
444  inline operator double () const
445  {
446  return low;
447  }
448 };
449 
450 #endif /* defined(__x86_64) || defined(_M_X64) */
451 
453 {
454  public:
455  XMMReg2Double low, high;
456 
457  XMMReg4Double() {}
458  XMMReg4Double(const XMMReg4Double& other) : low(other.low), high(other.high) {}
459 
460  static inline XMMReg4Double Zero()
461  {
462  XMMReg4Double reg;
463  reg.low.Zeroize();
464  reg.high.Zeroize();
465  return reg;
466  }
467 
468  static inline XMMReg4Double Load4Val(const unsigned char* ptr)
469  {
470  XMMReg4Double reg;
471  XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
472  return reg;
473  }
474 
475  static inline XMMReg4Double Load4Val(const short* ptr)
476  {
477  XMMReg4Double reg;
478  reg.low.nsLoad2Val(ptr);
479  reg.high.nsLoad2Val(ptr+2);
480  return reg;
481  }
482 
483  static inline XMMReg4Double Load4Val(const unsigned short* ptr)
484  {
485  XMMReg4Double reg;
486  reg.low.nsLoad2Val(ptr);
487  reg.high.nsLoad2Val(ptr+2);
488  return reg;
489  }
490 
491  static inline XMMReg4Double Load4Val(const double* ptr)
492  {
493  XMMReg4Double reg;
494  reg.low.nsLoad2Val(ptr);
495  reg.high.nsLoad2Val(ptr+2);
496  return reg;
497  }
498 
499  static inline XMMReg4Double Load4ValAligned(const double* ptr)
500  {
501  XMMReg4Double reg;
502  reg.low.nsLoad2ValAligned(ptr);
503  reg.high.nsLoad2ValAligned(ptr+2);
504  return reg;
505  }
506 
507  static inline XMMReg4Double Load4Val(const float* ptr)
508  {
509  XMMReg4Double reg;
510  XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
511  return reg;
512  }
513 
514  inline const XMMReg4Double& operator= (const XMMReg4Double& other)
515  {
516  low = other.low;
517  high = other.high;
518  return *this;
519  }
520 
521  inline const XMMReg4Double& operator+= (const XMMReg4Double& other)
522  {
523  low += other.low;
524  high += other.high;
525  return *this;
526  }
527 
528  inline XMMReg4Double operator+ (const XMMReg4Double& other)
529  {
530  XMMReg4Double ret;
531  ret.low = low + other.low;
532  ret.high = high + other.high;
533  return ret;
534  }
535 
536  inline XMMReg4Double operator- (const XMMReg4Double& other)
537  {
538  XMMReg4Double ret;
539  ret.low = low - other.low;
540  ret.high = high - other.high;
541  return ret;
542  }
543 
544  inline XMMReg4Double operator* (const XMMReg4Double& other)
545  {
546  XMMReg4Double ret;
547  ret.low = low * other.low;
548  ret.high = high * other.high;
549  return ret;
550  }
551 
552  inline const XMMReg4Double& operator*= (const XMMReg4Double& other)
553  {
554  low *= other.low;
555  high *= other.high;
556  return *this;
557  }
558 
559  inline void AddLowAndHigh()
560  {
561  low = low + high;
562  low.AddLowAndHigh();
563  }
564 
565  inline XMMReg2Double& GetLow()
566  {
567  return low;
568  }
569 };
570 
571 #endif /* GDALSSE_PRIV_H_INCLUDED */
/* End of gdalsse_priv.h — rendered copy generated by doxygen 1.8.11
   (class XMMReg2Double defined at original line 254, XMMReg4Double at 452). */