Eigen  3.2.92
CUDA/PacketMath.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_PACKET_MATH_CUDA_H
11 #define EIGEN_PACKET_MATH_CUDA_H
12 
13 namespace Eigen {
14 
15 namespace internal {
16 
17 // Make sure this is only available when targeting a GPU: we don't want to
18 // introduce conflicts between these packet_traits definitions and the ones
19 // we'll use on the host side (SSE, AVX, ...)
20 #if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
21 template<> struct is_arithmetic<float4> { enum { value = true }; };
22 template<> struct is_arithmetic<double2> { enum { value = true }; };
23 
24 
25 template<> struct packet_traits<float> : default_packet_traits
26 {
27  typedef float4 type;
28  typedef float4 half;
29  enum {
30  Vectorizable = 1,
31  AlignedOnScalar = 1,
32  size=4,
33  HasHalfPacket = 0,
34 
35  HasDiv = 1,
36  HasSin = 0,
37  HasCos = 0,
38  HasLog = 1,
39  HasExp = 1,
40  HasSqrt = 1,
41  HasRsqrt = 1,
42  HasLGamma = 1,
43  HasErf = 1,
44  HasErfc = 1,
45 
46  HasBlend = 0,
47  };
48 };
49 
50 template<> struct packet_traits<double> : default_packet_traits
51 {
52  typedef double2 type;
53  typedef double2 half;
54  enum {
55  Vectorizable = 1,
56  AlignedOnScalar = 1,
57  size=2,
58  HasHalfPacket = 0,
59 
60  HasDiv = 1,
61  HasLog = 1,
62  HasExp = 1,
63  HasSqrt = 1,
64  HasRsqrt = 1,
65  HasLGamma = 1,
66  HasErf = 1,
67  HasErfc = 1,
68 
69  HasBlend = 0,
70  };
71 };
72 
73 
74 template<> struct unpacket_traits<float4> { typedef float type; enum {size=4, alignment=Aligned16}; typedef float4 half; };
75 template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16}; typedef double2 half; };
76 
77 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) {
78  return make_float4(from, from, from, from);
79 }
80 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
81  return make_double2(from, from);
82 }
83 
84 
85 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
86  return make_float4(a, a+1, a+2, a+3);
87 }
88 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
89  return make_double2(a, a+1);
90 }
91 
92 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
93  return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
94 }
95 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
96  return make_double2(a.x+b.x, a.y+b.y);
97 }
98 
99 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
100  return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
101 }
102 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
103  return make_double2(a.x-b.x, a.y-b.y);
104 }
105 
106 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
107  return make_float4(-a.x, -a.y, -a.z, -a.w);
108 }
109 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
110  return make_double2(-a.x, -a.y);
111 }
112 
113 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; }
114 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; }
115 
116 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
117  return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
118 }
119 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
120  return make_double2(a.x*b.x, a.y*b.y);
121 }
122 
123 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
124  return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
125 }
126 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
127  return make_double2(a.x/b.x, a.y/b.y);
128 }
129 
130 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
131  return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
132 }
133 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
134  return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
135 }
136 
137 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
138  return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
139 }
140 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
141  return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
142 }
143 
144 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
145  return *reinterpret_cast<const float4*>(from);
146 }
147 
148 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
149  return *reinterpret_cast<const double2*>(from);
150 }
151 
152 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
153  return make_float4(from[0], from[1], from[2], from[3]);
154 }
155 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
156  return make_double2(from[0], from[1]);
157 }
158 
159 template<> EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
160  return make_float4(from[0], from[0], from[1], from[1]);
161 }
162 template<> EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
163  return make_double2(from[0], from[0]);
164 }
165 
166 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float* to, const float4& from) {
167  *reinterpret_cast<float4*>(to) = from;
168 }
169 
170 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
171  *reinterpret_cast<double2*>(to) = from;
172 }
173 
174 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const float4& from) {
175  to[0] = from.x;
176  to[1] = from.y;
177  to[2] = from.z;
178  to[3] = from.w;
179 }
180 
181 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
182  to[0] = from.x;
183  to[1] = from.y;
184 }
185 
186 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
187 template<>
188 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
189  return __ldg((const float4*)from);
190 }
191 template<>
192 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
193  return __ldg((const double2*)from);
194 }
195 
196 template<>
197 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
198  return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
199 }
200 template<>
201 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
202  return make_double2(__ldg(from+0), __ldg(from+1));
203 }
204 #endif
205 
206 template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
207  return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
208 }
209 
210 template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
211  return make_double2(from[0*stride], from[1*stride]);
212 }
213 
214 template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
215  to[stride*0] = from.x;
216  to[stride*1] = from.y;
217  to[stride*2] = from.z;
218  to[stride*3] = from.w;
219 }
220 template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
221  to[stride*0] = from.x;
222  to[stride*1] = from.y;
223 }
224 
225 template<> EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) {
226  return a.x;
227 }
228 template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
229  return a.x;
230 }
231 
232 template<> EIGEN_DEVICE_FUNC inline float predux<float4>(const float4& a) {
233  return a.x + a.y + a.z + a.w;
234 }
235 template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
236  return a.x + a.y;
237 }
238 
239 template<> EIGEN_DEVICE_FUNC inline float predux_max<float4>(const float4& a) {
240  return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
241 }
242 template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
243  return fmax(a.x, a.y);
244 }
245 
246 template<> EIGEN_DEVICE_FUNC inline float predux_min<float4>(const float4& a) {
247  return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
248 }
249 template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
250  return fmin(a.x, a.y);
251 }
252 
253 template<> EIGEN_DEVICE_FUNC inline float predux_mul<float4>(const float4& a) {
254  return a.x * a.y * a.z * a.w;
255 }
256 template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
257  return a.x * a.y;
258 }
259 
260 template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
261  return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
262 }
263 template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
264  return make_double2(fabs(a.x), fabs(a.y));
265 }
266 
267 
268 EIGEN_DEVICE_FUNC inline void
269 ptranspose(PacketBlock<float4,4>& kernel) {
270  double tmp = kernel.packet[0].y;
271  kernel.packet[0].y = kernel.packet[1].x;
272  kernel.packet[1].x = tmp;
273 
274  tmp = kernel.packet[0].z;
275  kernel.packet[0].z = kernel.packet[2].x;
276  kernel.packet[2].x = tmp;
277 
278  tmp = kernel.packet[0].w;
279  kernel.packet[0].w = kernel.packet[3].x;
280  kernel.packet[3].x = tmp;
281 
282  tmp = kernel.packet[1].z;
283  kernel.packet[1].z = kernel.packet[2].y;
284  kernel.packet[2].y = tmp;
285 
286  tmp = kernel.packet[1].w;
287  kernel.packet[1].w = kernel.packet[3].y;
288  kernel.packet[3].y = tmp;
289 
290  tmp = kernel.packet[2].w;
291  kernel.packet[2].w = kernel.packet[3].z;
292  kernel.packet[3].z = tmp;
293 }
294 
295 EIGEN_DEVICE_FUNC inline void
296 ptranspose(PacketBlock<double2,2>& kernel) {
297  double tmp = kernel.packet[0].y;
298  kernel.packet[0].y = kernel.packet[1].x;
299  kernel.packet[1].x = tmp;
300 }
301 
302 #endif
303 
304 } // end namespace internal
305 
306 } // end namespace Eigen
307 
308 
309 #endif // EIGEN_PACKET_MATH_CUDA_H
Definition: LDLT.h:16
Definition: Constants.h:230
Definition: Eigen_Colamd.h:54