TensorReductionCuda.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H

namespace Eigen {
namespace internal {


#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
// Full reducers for GPU; these code paths are not vectorized for now.

// Reducer function that enables multiple CUDA threads to safely accumulate at the same
// output address. It reads the current value of the output variable and attempts to
// update it with the new value. If another CUDA thread has updated the content of the
// output address in the meantime, it tries again.
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if __CUDA_ARCH__ >= 300
  if (sizeof(T) == 4) {
    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
    unsigned int newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned int readback;
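    // Retry with compare-and-swap until no other thread has modified the output
    // between our read and our write.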
    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  }
  else if (sizeof(T) == 8) {
    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
    unsigned long long newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned long long readback;
    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  }
  else {
    assert(0 && "Wordsize not supported");
  }
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}

template <typename T>
__device__ inline void atomicReduce(T* output, T accum, SumReducer<T>&) {
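  // For plain sums, use atomicAdd directly instead of the generic compare-and-swap loop.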
#if __CUDA_ARCH__ >= 300
  atomicAdd(output, accum);
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}

template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                    typename Self::CoeffReturnType* output) {
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;

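  // Thread 0 of the first block resets the output to the reducer's initial value.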
  if (first_index == 0) {
    *output = reducer.initialize();
  }

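  // Each thread reduces NumPerThread coefficients into a private accumulator.
  // At every iteration consecutive threads read consecutive coefficients (the
  // per-thread stride is BlockSize), which keeps global memory loads coalesced.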
  typename Self::CoeffReturnType accum = reducer.initialize();
  for (Index i = 0; i < NumPerThread; ++i) {
    const Index index = first_index + i * BlockSize;
    if (index >= num_coeffs) {
      break;
    }
    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
    reducer.reduce(val, &accum);
  }

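  // Tree-reduce the per-thread accumulators within each warp using shuffle-down;
  // after log2(warpSize) steps lane 0 holds the warp-wide partial result.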
  for (int offset = warpSize/2; offset > 0; offset /= 2) {
    reducer.reduce(__shfl_down(accum, offset), &accum);
  }

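  // Only the first lane of each warp merges its partial result into the global output.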
  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(output, accum, reducer);
  }
}


template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
  // Unfortunately NVIDIA GPUs don't handle exotic types such as complex numbers well,
  // so the optimized version of the code is restricted to the simple case of floats.
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
                                                 internal::is_same<typename Self::CoeffReturnType, float>::value;

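  // Fallback for output types without an optimized GPU reduction; it is never
  // meant to be called.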
  template <typename OutputType>
  EIGEN_DEVICE_FUNC static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
    assert(false && "Should only be called on floats");
  }

  EIGEN_DEVICE_FUNC static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
    typedef typename Self::Index Index;

    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    const int block_size = 256;
    const int num_per_thread = 128;
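    // Each block reduces block_size * num_per_thread coefficients, so this many
    // blocks are enough to cover the whole input.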
    const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread));
    LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
  }
};
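// Usage sketch (not part of this file; pointer and device names are placeholders):
// a full reduction evaluated on a GpuDevice, e.g.
//   Eigen::TensorMap<Eigen::Tensor<float, 2> > in(gpu_in_ptr, rows, cols);
//   Eigen::TensorMap<Eigen::Tensor<float, 0> > out(gpu_out_ptr);
//   out.device(gpu_device) = in.sum();
// is expected to be dispatched by the reduction evaluator to the float overload
// of FullReducer<...>::run() above.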

#endif  // defined(EIGEN_USE_GPU) && defined(__CUDACC__)


} // end namespace internal
} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H