10 #ifndef EIGEN_PACKET_MATH_CUDA_H
11 #define EIGEN_PACKET_MATH_CUDA_H
20 #if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
21 template<>
struct is_arithmetic<float4> {
enum { value =
true }; };
22 template<>
struct is_arithmetic<double2> {
enum { value =
true }; };
// Specialization of packet_traits for float, derived from default_packet_traits.
// NOTE(review): no struct body follows this header in the visible text — restore it
// from the authoritative header before building.
25 template<>
struct packet_traits<float> : default_packet_traits
// Specialization of packet_traits for double, derived from default_packet_traits.
// NOTE(review): no struct body follows this header in the visible text — restore it
// from the authoritative header before building.
47 template<>
struct packet_traits<double> : default_packet_traits
68 template<>
struct unpacket_traits<float4> {
typedef float type;
enum {size=4, alignment=
Aligned16};
typedef float4 half; };
69 template<>
struct unpacket_traits<double2> {
typedef double type;
enum {size=2, alignment=
Aligned16};
typedef double2 half; };
71 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(
const float& from) {
72 return make_float4(from, from, from, from);
74 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(
const double& from) {
75 return make_double2(from, from);
79 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(
const float& a) {
80 return make_float4(a, a+1, a+2, a+3);
82 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(
const double& a) {
83 return make_double2(a, a+1);
86 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(
const float4& a,
const float4& b) {
87 return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
89 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(
const double2& a,
const double2& b) {
90 return make_double2(a.x+b.x, a.y+b.y);
93 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(
const float4& a,
const float4& b) {
94 return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
96 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(
const double2& a,
const double2& b) {
97 return make_double2(a.x-b.x, a.y-b.y);
100 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(
const float4& a) {
101 return make_float4(-a.x, -a.y, -a.z, -a.w);
103 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(
const double2& a) {
104 return make_double2(-a.x, -a.y);
107 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(
const float4& a) {
return a; }
108 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(
const double2& a) {
return a; }
110 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(
const float4& a,
const float4& b) {
111 return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
113 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(
const double2& a,
const double2& b) {
114 return make_double2(a.x*b.x, a.y*b.y);
117 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(
const float4& a,
const float4& b) {
118 return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
120 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(
const double2& a,
const double2& b) {
121 return make_double2(a.x/b.x, a.y/b.y);
124 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(
const float4& a,
const float4& b) {
125 return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
127 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(
const double2& a,
const double2& b) {
128 return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
131 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(
const float4& a,
const float4& b) {
132 return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
134 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(
const double2& a,
const double2& b) {
135 return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
138 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(
const float* from) {
139 return *
reinterpret_cast<const float4*
>(from);
142 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(
const double* from) {
143 return *
reinterpret_cast<const double2*
>(from);
146 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(
const float* from) {
147 return make_float4(from[0], from[1], from[2], from[3]);
149 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(
const double* from) {
150 return make_double2(from[0], from[1]);
153 template<> EIGEN_STRONG_INLINE float4 ploaddup<float4>(
const float* from) {
154 return make_float4(from[0], from[0], from[1], from[1]);
156 template<> EIGEN_STRONG_INLINE double2 ploaddup<double2>(
const double* from) {
157 return make_double2(from[0], from[0]);
160 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstore<float>(
float* to,
const float4& from) {
161 *
reinterpret_cast<float4*
>(to) = from;
164 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstore<double>(
double* to,
const double2& from) {
165 *
reinterpret_cast<double2*
>(to) = from;
168 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstoreu<float>(
float* to,
const float4& from) {
175 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstoreu<double>(
double* to,
const double2& from) {
182 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(
const float* from) {
183 return __ldg((
const float4*)from);
186 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(
const double* from) {
187 return __ldg((
const double2*)from);
191 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(
const float* from) {
192 return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
195 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(
const double* from) {
196 return make_double2(__ldg(from+0), __ldg(from+1));
200 template<> EIGEN_DEVICE_FUNC
inline float4 pgather<float, float4>(
const float* from, Index stride) {
201 return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
204 template<> EIGEN_DEVICE_FUNC
inline double2 pgather<double, double2>(
const double* from, Index stride) {
205 return make_double2(from[0*stride], from[1*stride]);
208 template<> EIGEN_DEVICE_FUNC
inline void pscatter<float, float4>(
float* to,
const float4& from, Index stride) {
209 to[stride*0] = from.x;
210 to[stride*1] = from.y;
211 to[stride*2] = from.z;
212 to[stride*3] = from.w;
214 template<> EIGEN_DEVICE_FUNC
inline void pscatter<double, double2>(
double* to,
const double2& from, Index stride) {
215 to[stride*0] = from.x;
216 to[stride*1] = from.y;
219 template<> EIGEN_DEVICE_FUNC
inline float pfirst<float4>(
const float4& a) {
222 template<> EIGEN_DEVICE_FUNC
inline double pfirst<double2>(
const double2& a) {
226 template<> EIGEN_DEVICE_FUNC
inline float predux<float4>(
const float4& a) {
227 return a.x + a.y + a.z + a.w;
229 template<> EIGEN_DEVICE_FUNC
inline double predux<double2>(
const double2& a) {
233 template<> EIGEN_DEVICE_FUNC
inline float predux_max<float4>(
const float4& a) {
234 return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
236 template<> EIGEN_DEVICE_FUNC
inline double predux_max<double2>(
const double2& a) {
237 return fmax(a.x, a.y);
240 template<> EIGEN_DEVICE_FUNC
inline float predux_min<float4>(
const float4& a) {
241 return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
243 template<> EIGEN_DEVICE_FUNC
inline double predux_min<double2>(
const double2& a) {
244 return fmin(a.x, a.y);
247 template<> EIGEN_DEVICE_FUNC
inline float4 pabs<float4>(
const float4& a) {
248 return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
250 template<> EIGEN_DEVICE_FUNC
inline double2 pabs<double2>(
const double2& a) {
251 return make_double2(fabs(a.x), fabs(a.y));
255 EIGEN_DEVICE_FUNC
inline void
256 ptranspose(PacketBlock<float4,4>& kernel) {
257 double tmp = kernel.packet[0].y;
258 kernel.packet[0].y = kernel.packet[1].x;
259 kernel.packet[1].x = tmp;
261 tmp = kernel.packet[0].z;
262 kernel.packet[0].z = kernel.packet[2].x;
263 kernel.packet[2].x = tmp;
265 tmp = kernel.packet[0].w;
266 kernel.packet[0].w = kernel.packet[3].x;
267 kernel.packet[3].x = tmp;
269 tmp = kernel.packet[1].z;
270 kernel.packet[1].z = kernel.packet[2].y;
271 kernel.packet[2].y = tmp;
273 tmp = kernel.packet[1].w;
274 kernel.packet[1].w = kernel.packet[3].y;
275 kernel.packet[3].y = tmp;
277 tmp = kernel.packet[2].w;
278 kernel.packet[2].w = kernel.packet[3].z;
279 kernel.packet[3].z = tmp;
282 EIGEN_DEVICE_FUNC
inline void
283 ptranspose(PacketBlock<double2,2>& kernel) {
284 double tmp = kernel.packet[0].y;
285 kernel.packet[0].y = kernel.packet[1].x;
286 kernel.packet[1].x = tmp;
296 #endif // EIGEN_PACKET_MATH_CUDA_H
Definition: Constants.h:222
Definition: Eigen_Colamd.h:54