#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H

namespace Eigen {
namespace internal {

// TensorExecutor: the tensor executor class. It launches the evaluation of a
// tensor expression on the specified device. Default strategy: evaluate the
// expression coefficient by coefficient on a single CPU thread.
template <typename Expression, typename Device, bool Vectorizable>
class TensorExecutor {
 public:
  typedef typename Expression::Index Index;
  static inline void run(const Expression& expr, const Device& device = Device()) {
    TensorEvaluator<Expression, Device> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const Index size = array_prod(evaluator.dimensions());
      for (Index i = 0; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
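// Illustrative sketch (hypothetical type names, not part of this header):
// TensorExecutor::run() is normally reached through a tensor assignment. For
// an expression such as `C.device(dev) = A + B;` the device wrapper builds a
// TensorAssignOp and dispatches it roughly like this:
//
//   typedef TensorAssignOp<CExpr, const SumExpr> Assign;
//   Assign assign(C_expr, A_plus_B);
//   internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());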
// Vectorized evaluation on the default (single threaded CPU) device: process
// full packets first, then finish the tail coefficient by coefficient.
template <typename Expression>
class TensorExecutor<Expression, DefaultDevice, true> {
 public:
  typedef typename Expression::Index Index;
  static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const Index size = array_prod(evaluator.dimensions());
      static const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
      const Index VectorizedSize = (size / PacketSize) * PacketSize;

      for (Index i = 0; i < VectorizedSize; i += PacketSize) {
        evaluator.evalPacket(i);
      }
      for (Index i = VectorizedSize; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
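// Worked example (illustrative): with size == 10 and PacketSize == 4,
// VectorizedSize == 8, so the packet loop issues evalPacket(0) and
// evalPacket(4) for coefficients [0..7], and the scalar loop finishes the
// tail with evalScalar(8) and evalScalar(9).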
// Multicore strategy: the index space is partitioned into blocks and each
// block is evaluated on a separate core.
#ifdef EIGEN_USE_THREADS
// Scalar evaluation of the contiguous range [first, last).
template <typename Evaluator, typename Index, bool Vectorizable>
struct EvalRange {
  static void run(Evaluator evaluator, const Index first, const Index last) {
    eigen_assert(last > first);
    for (Index i = first; i < last; ++i) {
      evaluator.evalScalar(i);
    }
  }
};

// Vectorized evaluation of the range: full packets first, then the scalar
// remainder.
template <typename Evaluator, typename Index>
struct EvalRange<Evaluator, Index, true> {
  static void run(Evaluator evaluator, const Index first, const Index last) {
    eigen_assert(last > first);

    Index i = first;
    static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
    if (last - first >= PacketSize) {
      eigen_assert(first % PacketSize == 0);
      Index lastPacket = last - (last % PacketSize);
      for (; i < lastPacket; i += PacketSize) {
        evaluator.evalPacket(i);
      }
    }
    for (; i < last; ++i) {
      evaluator.evalScalar(i);
    }
  }
};
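// Note (illustrative): the vectorized EvalRange requires `first` to be a
// multiple of PacketSize, which the thread pool executor below guarantees by
// choosing a block size that is itself a multiple of PacketSize. For example,
// run(evaluator, 8, 19) with PacketSize == 4 issues evalPacket(8) and
// evalPacket(12), then evalScalar(16), evalScalar(17) and evalScalar(18).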
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
 public:
  typedef typename Expression::Index Index;
  static inline void run(const Expression& expr, const ThreadPoolDevice& device) {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const Index size = array_prod(evaluator.dimensions());

      static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;

      // Pick a block size that is a multiple of the packet size and large
      // enough to give every thread roughly one block.
      int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
      const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
      const Index numblocks = size / blocksize;

      std::vector<Notification*> results;
      results.reserve(numblocks);
      for (int i = 0; i < numblocks; ++i) {
        results.push_back(device.enqueue(&EvalRange<Evaluator, Index, Vectorizable>::run, evaluator, i*blocksize, (i+1)*blocksize));
      }

      // Evaluate any leftover coefficients on the calling thread.
      if (numblocks * blocksize < size) {
        EvalRange<Evaluator, Index, Vectorizable>::run(evaluator, numblocks * blocksize, size);
      }

      // Wait for the enqueued blocks to complete.
      for (int i = 0; i < numblocks; ++i) {
        wait_until_ready(results[i]);
        delete results[i];
      }
    }
    evaluator.cleanup();
  }
};
#endif  // EIGEN_USE_THREADS
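// Worked example (illustrative): with size == 1000, 4 pool threads and
// PacketSize == 4, size/numThreads == 250, so blocksz == 250 + 3 == 253,
// blocksize == 253 - (253 % 4) == 252 and numblocks == 1000 / 252 == 3.
// Three blocks of 252 coefficients are enqueued on the pool, and the
// remaining 1000 - 3*252 == 244 coefficients run on the calling thread.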
// GPU strategy: the evaluation of the expression is offloaded to a CUDA kernel.
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel_NonVectorizable(Evaluator memcopied_eval, Index size) {
  // CUDA memcopies the kernel arguments. That is fine for PODs, but for more
  // complex types such as evaluators we call the copy constructor explicitly.
  Evaluator eval(memcopied_eval);

  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const Index step_size = blockDim.x * gridDim.x;

  // Scalar path: grid-stride loop, one coefficient per iteration.
  for (Index i = first_index; i < size; i += step_size) {
    eval.evalScalar(i);
  }
}

template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel_Vectorizable(Evaluator memcopied_eval, Index size) {
  // Same explicit copy construction of the memcopied evaluator as above.
  Evaluator eval(memcopied_eval);

  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const Index step_size = blockDim.x * gridDim.x;

  // Vectorized path: process full packets first...
  const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
  const Index vectorized_step_size = step_size * PacketSize;
  const Index vectorized_size = (size / PacketSize) * PacketSize;
  for (Index i = first_index * PacketSize; i < vectorized_size;
       i += vectorized_step_size) {
    eval.evalPacket(i);
  }
  // ...then handle the scalar remainder.
  for (Index i = vectorized_size + first_index; i < size; i += step_size) {
    eval.evalScalar(i);
  }
}
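// Illustrative note: both kernels use a grid-stride loop, so a launch with
// num_blocks == 16 and block_size == 1024 creates 16384 threads and thread t
// touches coefficients t, t + 16384, t + 2*16384, ... In the vectorized
// kernel the same pattern applies to packet starts: thread t begins at
// t * PacketSize and advances by 16384 * PacketSize each iteration.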
template <typename Expression>
class TensorExecutor<Expression, GpuDevice, false> {
 public:
  typedef typename Expression::Index Index;
  static inline void run(const Expression& expr, const GpuDevice& device) {
    TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      // Launch enough blocks to keep every multiprocessor fully occupied.
      const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
      const int block_size = device.maxCudaThreadsPerBlock();
      const Index size = array_prod(evaluator.dimensions());
      LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
    }
    evaluator.cleanup();
  }
};

template <typename Expression>
class TensorExecutor<Expression, GpuDevice, true> {
 public:
  typedef typename Expression::Index Index;
  static inline void run(const Expression& expr, const GpuDevice& device) {
    TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const int num_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / device.maxCudaThreadsPerBlock();
      const int block_size = device.maxCudaThreadsPerBlock();
      const Index size = array_prod(evaluator.dimensions());
      LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>, Index>), num_blocks, block_size, 0, device, evaluator, size);
    }
    evaluator.cleanup();
  }
};
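// Worked example (illustrative): on a device with 8 multiprocessors, 2048
// resident threads per multiprocessor and 1024 threads per block, the launch
// uses num_blocks == 8 * 2048 / 1024 == 16 blocks of 1024 threads, i.e.
// enough threads to fully occupy every multiprocessor.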
#endif  // defined(EIGEN_USE_GPU) && defined(__CUDACC__)

} // end namespace internal
} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H