TensorExecutor.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H

namespace Eigen {

namespace internal {

// Default strategy: the expression is evaluated with a single cpu thread.
template<typename Expression, typename Device, bool Vectorizable>
class TensorExecutor
{
 public:
  typedef typename Expression::Index Index;
  EIGEN_DEVICE_FUNC
  static inline void run(const Expression& expr, const Device& device = Device())
  {
    TensorEvaluator<Expression, Device> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign)
    {
      const Index size = array_prod(evaluator.dimensions());
      for (Index i = 0; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
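
// Illustrative sketch, not part of the original header: the executor is
// normally driven by the tensor assignment machinery rather than called
// directly. Assuming `dst` and `src` are compatible tensor expressions, the
// assignment path boils down to roughly the following:
//
//   TensorAssignOp<decltype(dst), const decltype(src)> assign(dst, src);
//   internal::TensorExecutor<const decltype(assign), DefaultDevice,
//                            /*Vectorizable=*/false>::run(assign, DefaultDevice());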


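// Vectorized specialization for the default (single-threaded) device:
// evaluate as many full packets as possible, then fall back to scalar
// evaluation for the remainder.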
template<typename Expression>
class TensorExecutor<Expression, DefaultDevice, true>
{
 public:
  typedef typename Expression::Index Index;
  static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
  {
    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign)
    {
      const Index size = array_prod(evaluator.dimensions());
      static const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
      const Index VectorizedSize = (size / PacketSize) * PacketSize;

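      // Coefficients in [0, VectorizedSize) are evaluated packet-by-packet;
      // the tail [VectorizedSize, size) falls back to scalar evaluation.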
      for (Index i = 0; i < VectorizedSize; i += PacketSize) {
        evaluator.evalPacket(i);
      }
      for (Index i = VectorizedSize; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};



// Multicore strategy: the index space is partitioned and each partition is executed on a single core
#ifdef EIGEN_USE_THREADS
template <typename Evaluator, typename Index, bool Vectorizable>
struct EvalRange {
  static void run(Evaluator evaluator, const Index first, const Index last) {
    eigen_assert(last > first);
    for (Index i = first; i < last; ++i) {
      evaluator.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename Index>
struct EvalRange<Evaluator, Index, true> {
  static void run(Evaluator evaluator, const Index first, const Index last) {
    eigen_assert(last > first);

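    // The thread pool executor below hands us packet-aligned [first, last)
    // ranges (see the assert): evaluate full packets first, then let the
    // scalar loop handle whatever is left at the end of the range.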
    Index i = first;
    static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
    if (last - first >= PacketSize) {
      eigen_assert(first % PacketSize == 0);
      Index lastPacket = last - (last % PacketSize);
      for (; i < lastPacket; i += PacketSize) {
        evaluator.evalPacket(i);
      }
    }

    for (; i < last; ++i) {
      evaluator.evalScalar(i);
    }
  }
};

template<typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
{
 public:
  typedef typename Expression::Index Index;
  static inline void run(const Expression& expr, const ThreadPoolDevice& device)
  {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign)
    {
      const Index size = array_prod(evaluator.dimensions());

      static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;

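      // Split the index space into roughly one block per thread, with the
      // block size kept a multiple of PacketSize so every enqueued range
      // starts on a packet boundary; any remainder is evaluated directly on
      // the calling thread below.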
      int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
      const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
      const Index numblocks = size / blocksize;

      std::vector<Notification*> results;
      results.reserve(numblocks);
      for (int i = 0; i < numblocks; ++i) {
        results.push_back(device.enqueue(&EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i*blocksize, (i+1)*blocksize));
      }

      if (numblocks * blocksize < size) {
        EvalRange<Evaluator, Index, Vectorizable>::run(evaluator, numblocks * blocksize, size);
      }

      for (int i = 0; i < numblocks; ++i) {
        wait_until_ready(results[i]);
        delete results[i];
      }

    }
    evaluator.cleanup();
  }
};
#endif


// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel_NonVectorizable(Evaluator memcopied_eval, Index size) {
  // Cuda memcopies the kernel arguments. That's fine for POD, but for more
  // complex types such as evaluators we should really conform to the C++
  // standard and call a proper copy constructor.
  Evaluator eval(memcopied_eval);

  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const Index step_size = blockDim.x * gridDim.x;

  // Use the scalar path
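  // Grid-stride loop: each thread starts at its global thread index and
  // advances by the total number of launched threads until the whole range
  // has been covered.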
  for (Index i = first_index; i < size; i += step_size) {
    eval.evalScalar(i);
  }
}

template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) {
  // Cuda memcopies the kernel arguments. That's fine for POD, but for more
  // complex types such as evaluators we should really conform to the C++
  // standard and call a proper copy constructor.
  Evaluator eval(memcopied_eval);

  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const Index step_size = blockDim.x * gridDim.x;

  // Use the vector path
  const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
  const Index vectorized_step_size = step_size * PacketSize;
  const Index vectorized_size = (size / PacketSize) * PacketSize;
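  // Each thread starts at a packet-aligned offset (first_index * PacketSize)
  // and strides by whole packets; the scalar loop below handles the tail that
  // does not fill a complete packet.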
  for (Index i = first_index * PacketSize; i < vectorized_size;
       i += vectorized_step_size) {
    eval.evalPacket(i);
  }
  for (Index i = vectorized_size + first_index; i < size; i += step_size) {
    eval.evalScalar(i);
  }
}


template<typename Expression>
class TensorExecutor<Expression, GpuDevice, false>
{
 public:
  typedef typename Expression::Index Index;
  static inline void run(const Expression& expr, const GpuDevice& device)
  {
    TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign)
    {
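      // Launch enough blocks of the maximum supported size to keep every
      // multiprocessor busy; the grid-stride loop inside the kernel covers
      // any coefficients beyond the threads actually launched.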
      const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
      const int block_size = device.maxCudaThreadsPerBlock();
      const Index size = array_prod(evaluator.dimensions());
      LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
    }
    evaluator.cleanup();
  }
};

template<typename Expression>
class TensorExecutor<Expression, GpuDevice, true>
{
 public:
  typedef typename Expression::Index Index;
  static inline void run(const Expression& expr, const GpuDevice& device)
  {
    TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign)
    {
      const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
      const int block_size = device.maxCudaThreadsPerBlock();
      const Index size = array_prod(evaluator.dimensions());
      LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
    }
    evaluator.cleanup();
  }
};

#endif

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H