#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H

namespace Eigen {

namespace internal {

template<typename Op, typename Dims, typename XprType>
struct traits<TensorReductionOp<Op, Dims, XprType> >
 : traits<XprType>
{
  typedef typename traits<XprType>::Scalar Scalar;
  typedef typename internal::packet_traits<Scalar>::type Packet;
  typedef typename traits<XprType>::StorageKind StorageKind;
  typedef typename traits<XprType>::Index Index;
  typedef typename XprType::Nested Nested;
};
template<typename Op, typename Dims, typename XprType>
struct eval<TensorReductionOp<Op, Dims, XprType>, Eigen::Dense>
{
  typedef const TensorReductionOp<Op, Dims, XprType>& type;
};
template<typename Op, typename Dims, typename XprType>
struct nested<TensorReductionOp<Op, Dims, XprType>, 1, typename eval<TensorReductionOp<Op, Dims, XprType> >::type>
{
  typedef TensorReductionOp<Op, Dims, XprType> type;
};
template <typename OutputDims> struct DimInitializer {
  template <typename InputDims, typename ReducedDims> EIGEN_DEVICE_FUNC
  static void run(const InputDims& input_dims,
                  const array<bool, internal::array_size<InputDims>::value>& reduced,
                  OutputDims* output_dims, ReducedDims* reduced_dims) {
    const int NumInputDims = internal::array_size<InputDims>::value;
    int outputIndex = 0;
    int reduceIndex = 0;
    for (int i = 0; i < NumInputDims; ++i) {
      if (reduced[i]) {
        // Record the size of each reduced input dimension.
        (*reduced_dims)[reduceIndex] = input_dims[i];
        ++reduceIndex;
      } else {
        // Preserved dimensions become the output dimensions.
        (*output_dims)[outputIndex] = input_dims[i];
        ++outputIndex;
      }
    }
  }
};
template <> struct DimInitializer<Sizes<1> > {
  template <typename InputDims, typename Index, size_t Rank> EIGEN_DEVICE_FUNC
  static void run(const InputDims& input_dims, const array<bool, Rank>&,
                  Sizes<1>*, array<Index, Rank>* reduced_dims) {
    const int NumInputDims = internal::array_size<InputDims>::value;
    for (int i = 0; i < NumInputDims; ++i) {
      (*reduced_dims)[i] = input_dims[i];
    }
  }
};
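// Illustration (added for clarity, not part of the original source): for an
// input of dimensions (2, 3, 4) with reduced = {false, true, false}, the
// generic DimInitializer produces output_dims = (2, 4) and reduced_dims = (3).
// The Sizes<1> specialization handles full reductions, where every input
// dimension is reduced and the output collapses to a single coefficient.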
template <typename ReducedDims, int NumTensorDims, int Layout>
struct are_inner_most_dims {
  static const bool value = false;
};
template <typename ReducedDims, int NumTensorDims, int Layout>
struct preserve_inner_most_dims {
  static const bool value = false;
};
#if defined(EIGEN_HAS_CONSTEXPR) && defined(EIGEN_HAS_VARIADIC_TEMPLATES)
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>()();
  static const bool tmp2 = index_statically_eq<ReducedDims>()(0, 0);
  static const bool tmp3 = index_statically_eq<ReducedDims>()(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1);
  static const bool value = tmp1 & tmp2 & tmp3;
};
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>()();
  static const bool tmp2 = index_statically_eq<ReducedDims>()(0, NumTensorDims - array_size<ReducedDims>::value);
  static const bool tmp3 = index_statically_eq<ReducedDims>()(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
  static const bool value = tmp1 & tmp2 & tmp3;
};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>()();
  static const bool tmp2 = index_statically_gt<ReducedDims>()(0, 0);
  static const bool value = tmp1 & tmp2;
};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>()();
  static const bool tmp2 = index_statically_lt<ReducedDims>()(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
  static const bool value = tmp1 & tmp2;
};
#endif
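// Note (added for clarity, not in the original source): these traits only
// evaluate to true when the reduced dimensions are supplied as a static index
// list whose entries are statically known to increase. For a ColMajor rank-4
// tensor, reducing dims {0, 1} makes are_inner_most_dims true (the reduction
// covers the fastest-varying dimensions), while reducing dims {2, 3} makes
// preserve_inner_most_dims true (dimension 0 is left untouched).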
template <int DimIndex, typename Self, typename Op>
struct GenericDimReducer {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
    EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
    for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
      GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
    }
  }
};
template <typename Self, typename Op>
struct GenericDimReducer<0, Self, Op> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
    for (int j = 0; j < self.m_reducedDims[0]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
      reducer.reduce(self.m_impl.coeff(input), accum);
    }
  }
};
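// Note (added for clarity): GenericDimReducer unrolls one nested loop per
// reduced dimension at compile time. The recursion starts at NumReducedDims-1
// and the <0, Self, Op> specialization terminates it by feeding individual
// input coefficients to the reducer functor.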
template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct InnerMostDimReducer {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    typename Self::CoeffReturnType accum = reducer.initialize();
    for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    }
    return reducer.finalize(accum);
  }
};

template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    const int packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
    const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
    typename Self::PacketReturnType p = reducer.template initializePacket<typename Self::PacketReturnType>();
    for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
      reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &p);
    }
    typename Self::CoeffReturnType accum = reducer.initialize();
    for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    }
    return reducer.finalizeBoth(accum, p);
  }
};
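// Note (added for clarity): the vectorized specialization accumulates full
// packets first, handles the scalar remainder (numValuesToReduce modulo the
// packet size) in a second loop, and lets the reducer combine the packet and
// scalar accumulators in finalizeBoth().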
template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct InnerMostDimPreserver {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
    eigen_assert(false && "should never be called");
  }
};

template <int DimIndex, typename Self, typename Op>
struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
    EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
    for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
      InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
    }
  }
};

template <typename Self, typename Op>
struct InnerMostDimPreserver<0, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
    for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
      reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum);
    }
  }
};
template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct FullReducer {
  static const bool HasOptimizedImplementation = false;

  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) {
    const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
    *output = InnerMostDimReducer<Self, Op>::reduce(self, 0, num_coeffs, reducer);
  }
};
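// Note (added for clarity): this default FullReducer runs on a single thread
// and reduces the num_coeffs input coefficients linearly. The specializations
// below override it for ThreadPoolDevice and GpuDevice when the reducer is
// stateless.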
#ifdef EIGEN_USE_THREADS
template <typename Eval, typename Op, bool Vectorizable = (Eval::InputPacketAccess & Op::PacketAccess)>
struct FullReducerShard {
  static void run(const Eval& eval, typename Eval::Index firstIndex,
                  typename Eval::Index numValuesToReduce, Op& reducer, FullReducerShard* shard) {
    shard->saccum = reducer.initialize();
    for (typename Eval::Index j = 0; j < numValuesToReduce; ++j) {
      reducer.reduce(eval.m_impl.coeff(firstIndex + j), &shard->saccum);
    }
  }

  typename Eval::CoeffReturnType saccum;
};

template <typename Eval, typename Op>
struct FullReducerShard<Eval, Op, true> {
  static void run(const Eval& eval, typename Eval::Index firstIndex,
                  typename Eval::Index numValuesToReduce, Op& reducer, FullReducerShard* shard) {
    const int packetSize = internal::unpacket_traits<typename Eval::PacketReturnType>::size;
    const typename Eval::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;

    shard->paccum = reducer.template initializePacket<typename Eval::PacketReturnType>();
    for (typename Eval::Index j = 0; j < VectorizedSize; j += packetSize) {
      reducer.reducePacket(eval.m_impl.template packet<Unaligned>(firstIndex + j), &shard->paccum);
    }
    shard->saccum = reducer.initialize();
    for (typename Eval::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
      reducer.reduce(eval.m_impl.coeff(firstIndex + j), &shard->saccum);
    }
  }

  typename Eval::PacketReturnType paccum;
  typename Eval::CoeffReturnType saccum;
};
template <typename Self, typename Op>
struct FullReducer<Self, Op, ThreadPoolDevice, false> {
  static const bool HasOptimizedImplementation = !Op::IsStateful;

  static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
                  typename Self::CoeffReturnType* output) {
    typedef typename Self::Index Index;
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs)/device.numThreads());
    const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
    eigen_assert(num_coeffs >= numblocks * blocksize);

    std::vector<Notification*> results;
    results.reserve(numblocks);
    std::vector<FullReducerShard<Self, Op, false> > shards;
    shards.resize(numblocks);
    for (Index i = 0; i < numblocks; ++i) {
      results.push_back(device.enqueue(&FullReducerShard<Self, Op, false>::run,
                                       self, i*blocksize, blocksize, reducer, &shards[i]));
    }

    FullReducerShard<Self, Op, false> finalShard;
    if (numblocks * blocksize < num_coeffs) {
      // Reduce the leftover coefficients that don't fit in a full block.
      FullReducerShard<Self, Op, false>::run(self, numblocks * blocksize,
                                             num_coeffs - numblocks * blocksize, reducer, &finalShard);
    } else {
      finalShard.saccum = reducer.initialize();
    }

    // Wait for all the shards to complete.
    for (Index i = 0; i < numblocks; ++i) {
      wait_until_ready(results[i]);
      delete results[i];
    }

    // Merge the per-shard accumulators into the final result.
    for (Index i = 0; i < numblocks; ++i) {
      reducer.reduce(shards[i].saccum, &finalShard.saccum);
    }
    *output = reducer.finalize(finalShard.saccum);
  }
};
template <typename Self, typename Op>
struct FullReducer<Self, Op, ThreadPoolDevice, true> {
  static const bool HasOptimizedImplementation = !Op::IsStateful;

  static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
                  typename Self::CoeffReturnType* output) {
    typedef typename Self::Index Index;
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    const Index blocksize = std::floor<Index>(static_cast<float>(num_coeffs)/device.numThreads());
    const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
    eigen_assert(num_coeffs >= numblocks * blocksize);

    std::vector<Notification*> results;
    results.reserve(numblocks);
    std::vector<FullReducerShard<Self, Op, true> > shards;
    shards.resize(numblocks);
    for (Index i = 0; i < numblocks; ++i) {
      results.push_back(device.enqueue(&FullReducerShard<Self, Op, true>::run,
                                       self, i*blocksize, blocksize, reducer, &shards[i]));
    }

    FullReducerShard<Self, Op, true> finalShard;
    if (numblocks * blocksize < num_coeffs) {
      FullReducerShard<Self, Op, true>::run(self, numblocks * blocksize,
                                            num_coeffs - numblocks * blocksize, reducer, &finalShard);
    } else {
      finalShard.paccum = reducer.template initializePacket<typename Self::PacketReturnType>();
      finalShard.saccum = reducer.initialize();
    }

    for (Index i = 0; i < numblocks; ++i) {
      wait_until_ready(results[i]);
      delete results[i];
    }

    for (Index i = 0; i < numblocks; ++i) {
      reducer.reducePacket(shards[i].paccum, &finalShard.paccum);
      reducer.reduce(shards[i].saccum, &finalShard.saccum);
    }
    *output = reducer.finalizeBoth(finalShard.saccum, finalShard.paccum);
  }
};

#endif  // EIGEN_USE_THREADS
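// Worked example (added for clarity, not in the original source): with
// num_coeffs = 1003 and 4 threads, blocksize = floor(1003 / 4) = 250 and
// numblocks = 1003 / 250 = 4, so four shards of 250 coefficients run on the
// thread pool while the remaining 3 coefficients are reduced inline into
// finalShard before the per-shard accumulators are merged.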
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
// Full reducers for GPU.

// Atomically updates *output with accum: read the current value, apply the
// reducer, and retry with compare-and-swap until no other thread has raced us.
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if __CUDA_ARCH__ >= 300
  if (sizeof(T) == 4) {
    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
    unsigned int newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned int readback;
    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  }
  else if (sizeof(T) == 8) {
    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
    unsigned long long newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned long long readback;
    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  }
  else {
    assert(0 && "Wordsize not supported");
  }
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}
// Specialization for the sum reducer, which can rely on atomicAdd directly.
template <typename T>
__device__ inline void atomicReduce(T* output, T accum, SumReducer<T>&) {
#if __CUDA_ARCH__ >= 300
  atomicAdd(output, accum);
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}
template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                    typename Self::CoeffReturnType* output) {
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;

  // Initialize the output value.
  if (first_index == 0) {
    *output = reducer.initialize();
  }

  // Each thread accumulates NumPerThread values, strided by BlockSize.
  typename Self::CoeffReturnType accum = reducer.initialize();
  for (Index i = 0; i < NumPerThread; ++i) {
    const Index index = first_index + i * BlockSize;
    if (index >= num_coeffs) {
      break;
    }
    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
    reducer.reduce(val, &accum);
  }

  // Combine the per-thread accumulators within each warp.
  for (int offset = warpSize/2; offset > 0; offset /= 2) {
    reducer.reduce(__shfl_down(accum, offset), &accum);
  }

  // The first lane of each warp merges the warp's result into the output.
  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(output, accum, reducer);
  }
}
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
  // The optimized GPU path is restricted to stateless reducers producing floats.
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
                                                 internal::is_same<typename Self::CoeffReturnType, float>::value;

  template <typename OutputType>
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
    assert(false && "Should only be called on floats");
  }

  static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
    typedef typename Self::Index Index;

    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread));
    LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
  }
};

#endif  // EIGEN_USE_GPU && __CUDACC__
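// Worked example (added for clarity, not in the original source): each CUDA
// block covers block_size * num_per_thread = 256 * 128 = 32768 coefficients,
// so reducing 10,000,000 coefficients launches ceil(10,000,000 / 32768) = 306
// blocks, whose warp-level partial results are merged into *output via
// atomicReduce().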
template <typename Self, typename Op,
          bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
class BlockReducer {
 public:
  typedef typename Self::Index Index;
  typedef typename Self::Scalar Scalar;
  typedef typename Self::CoeffReturnType CoeffReturnType;
  explicit BlockReducer(const Op& reducer) : op_(reducer) {
    accum_ = op_.initialize();
  }
  void Reduce(Index index, Index num_values_to_reduce, Scalar* data) {
    for (Index i = 0; i < num_values_to_reduce; ++i) {
      op_.reduce(data[index + i], &accum_);
    }
  }
  CoeffReturnType Finalize() {
    return op_.finalize(accum_);
  }

 private:
  CoeffReturnType accum_;
  Op op_;
};
template <typename Self, typename Op>
class BlockReducer<Self, Op, true> {
 public:
  typedef typename Self::Index Index;
  typedef typename Self::Scalar Scalar;
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::PacketReturnType PacketReturnType;
  explicit BlockReducer(const Op& reducer) : op_(reducer) {
    vaccum_ = op_.template initializePacket<PacketReturnType>();
    accum_ = op_.initialize();
  }
  void Reduce(Index index, Index num_values_to_reduce, Scalar* data) {
    const int packet_size = internal::unpacket_traits<PacketReturnType>::size;
    const typename Self::Index vectorized_size = (num_values_to_reduce /
                                                  packet_size) * packet_size;
    for (typename Self::Index i = 0; i < vectorized_size; i += packet_size) {
      op_.reducePacket(internal::ploadt<PacketReturnType, Unaligned>(
                           &data[index + i]), &vaccum_);
    }
    for (typename Self::Index i = vectorized_size;
         i < num_values_to_reduce; ++i) {
      op_.reduce(data[index + i], &accum_);
    }
  }
  typename Self::CoeffReturnType Finalize() {
    return op_.finalizeBoth(accum_, vaccum_);
  }

 private:
  typename Self::PacketReturnType vaccum_;
  typename Self::CoeffReturnType accum_;
  Op op_;
};

}  // end namespace internal
511 template <
typename Op,
typename Dims,
typename XprType>
512 class TensorReductionOp :
public TensorBase<TensorReductionOp<Op, Dims, XprType>, ReadOnlyAccessors> {
514 typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar;
515 typedef typename Eigen::internal::traits<TensorReductionOp>::Packet Packet;
516 typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
517 typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
518 typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;
519 typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested;
520 typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind;
521 typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index;
523 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
524 TensorReductionOp(
const XprType& expr,
const Dims& dims) : m_expr(expr), m_dims(dims)
526 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
527 TensorReductionOp(
const XprType& expr,
const Dims& dims,
const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer)
530 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
531 const XprType& expression()
const {
return m_expr; }
532 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
533 const Dims& dims()
const {
return m_dims; }
534 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
535 const Op& reducer()
const {
return m_reducer; }
538 typename XprType::Nested m_expr;
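// Usage sketch (added for illustration, not part of the original source):
// reduction expressions are normally created through the TensorBase helpers,
// which instantiate TensorReductionOp with the matching reducer functor, e.g.
//   Eigen::Tensor<float, 3> t(2, 3, 4);
//   t.setRandom();
//   Eigen::array<int, 1> dims{{1}};
//   Eigen::Tensor<float, 2> s = t.sum(dims);  // reduces dim 1 -> shape (2, 4)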
// Evaluator for reduction expressions (used as an rvalue).
template<typename Op, typename Dims, typename ArgType, typename Device>
struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
{
  typedef TensorReductionOp<Op, Dims, ArgType> XprType;
  typedef typename XprType::Index Index;
  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
  static const int NumInputDims = internal::array_size<InputDimensions>::value;
  static const int NumReducedDims = internal::array_size<Dims>::value;
  static const int NumOutputDims = (NumInputDims==NumReducedDims) ? 1 : NumInputDims - NumReducedDims;
  typedef typename internal::conditional<NumInputDims==NumReducedDims, Sizes<1>, DSizes<Index, NumOutputDims> >::type Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> Self;
  static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;

  enum {
    IsAligned = false,
    PacketAccess = Self::InputPacketAccess && Op::PacketAccess,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
  };

  static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
  static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
  static const bool RunningFullReduction = (NumInputDims==NumReducedDims);

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device)
  {
    EIGEN_STATIC_ASSERT(NumInputDims >= NumReducedDims, YOU_MADE_A_PROGRAMMING_MISTAKE);
    EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);
    // Build a bitmap marking which input dimensions are reduced.
    array<bool, NumInputDims> reduced;
    for (int i = 0; i < NumInputDims; ++i) {
      reduced[i] = false;
    }
    for (int i = 0; i < NumReducedDims; ++i) {
      eigen_assert(op.dims()[i] >= 0);
      eigen_assert(op.dims()[i] < NumInputDims);
      reduced[op.dims()[i]] = true;
    }

    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
    internal::DimInitializer<Dimensions>::run(input_dims, reduced, &m_dimensions, &m_reducedDims);

    // Precompute the strides of the output tensor.
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_outputStrides[0] = 1;
      for (int i = 1; i < NumOutputDims; ++i) {
        m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
      }
    } else {
      m_outputStrides[NumOutputDims - 1] = 1;
      for (int i = NumOutputDims - 2; i >= 0; --i) {
        m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
      }
    }

    // Precompute the strides of the input tensor.
    array<Index, NumInputDims> input_strides;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      input_strides[0] = 1;
      for (int i = 1; i < NumInputDims; ++i) {
        input_strides[i] = input_strides[i-1] * input_dims[i-1];
      }
    } else {
      input_strides[NumInputDims - 1] = 1;
      for (int i = NumInputDims - 2; i >= 0; --i) {
        input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
      }
    }

    // Partition the input strides between reduced and preserved dimensions.
    int outputIndex = 0;
    int reduceIndex = 0;
    for (int i = 0; i < NumInputDims; ++i) {
      if (reduced[i]) {
        m_reducedStrides[reduceIndex] = input_strides[i];
        ++reduceIndex;
      } else {
        m_preservedStrides[outputIndex] = input_strides[i];
        ++outputIndex;
      }
    }

    // Special case for full reductions.
    if (NumInputDims == NumReducedDims) {
      eigen_assert(m_dimensions[0] == 1);
      m_preservedStrides[0] = internal::array_prod(input_dims);
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename internal::remove_const<typename XprType::PacketReturnType>::type PacketReturnType;

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
    m_impl.evalSubExprsIfNeeded(NULL);

    // Use the optimized full reducer when reducing everything and it pays off.
    if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
        ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
         (internal::array_prod(m_impl.dimensions()) > 1024 * 1024))) {

      bool need_assign = false;
      if (!data) {
        m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));
        data = m_result;
        need_assign = true;
      }

      Op reducer(m_reducer);
      internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
      return need_assign;
    }
    return true;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
    m_impl.cleanup();
    if (m_result) {
      m_device.deallocate(m_result);
    }
  }
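  // Note (added for clarity): the eager FullReducer path is only taken for
  // full reductions with an optimized implementation, and only when running
  // on a GPU with compute capability >= 3.0 or when the input holds more than
  // 1024 * 1024 coefficients; otherwise coefficients are reduced lazily in
  // coeff() / packet().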
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    if (RunningFullReduction && m_result) {
      // The full reduction was already computed in evalSubExprsIfNeeded().
      return *m_result;
    }
    Op reducer(m_reducer);
    if (ReducingInnerMostDims) {
      const Index num_values_to_reduce =
        (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1];
      return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index),
                                                             num_values_to_reduce, reducer);
    } else {
      typename Self::CoeffReturnType accum = reducer.initialize();
      internal::GenericDimReducer<NumReducedDims-1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum);
      return reducer.finalize(accum);
    }
  }
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index + packetSize - 1 < dimensions().TotalSize());

    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
    if (ReducingInnerMostDims) {
      const Index num_values_to_reduce =
        (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumOutputDims - 1];
      const Index firstIndex = firstInput(index);
      for (Index i = 0; i < packetSize; ++i) {
        Op reducer(m_reducer);
        values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
                                                                    num_values_to_reduce, reducer);
      }
    } else if (PreservingInnerMostDims) {
      const Index firstIndex = firstInput(index);
      const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
      // The packet can be computed directly only if it doesn't cross the innermost output dimension.
      if (((firstIndex % m_dimensions[innermost_dim]) + packetSize - 1) < m_dimensions[innermost_dim]) {
        Op reducer(m_reducer);
        typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
        internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
        return reducer.finalizePacket(accum);
      } else {
        for (int i = 0; i < packetSize; ++i) {
          values[i] = coeff(index + i);
        }
      }
    } else {
      for (int i = 0; i < packetSize; ++i) {
        values[i] = coeff(index + i);
      }
    }
    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
    return rslt;
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
  private:
  template <int, typename, typename> friend struct internal::GenericDimReducer;
  template <typename, typename, bool> friend struct internal::InnerMostDimReducer;
  template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver;
  template <typename S, typename O, typename D, bool V> friend struct internal::FullReducer;
#ifdef EIGEN_USE_THREADS
  template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*);
#endif
  // Returns the index in the input tensor of the first value that needs to be
  // used to compute the reduction at output index "index".
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    if (ReducingInnerMostDims) {
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        return index * m_preservedStrides[0];
      } else {
        return index * m_preservedStrides[NumOutputDims - 1];
      }
    }
    Index startInput = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumOutputDims - 1; i > 0; --i) {
        // idx is the i-th coordinate of the output index.
        const Index idx = index / m_outputStrides[i];
        startInput += idx * m_preservedStrides[i];
        index -= idx * m_outputStrides[i];
      }
      if (PreservingInnerMostDims) {
        eigen_assert(m_preservedStrides[0] == 1);
        startInput += index;
      } else {
        startInput += index * m_preservedStrides[0];
      }
    } else {
      for (int i = 0; i < NumOutputDims - 1; ++i) {
        // idx is the i-th coordinate of the output index.
        const Index idx = index / m_outputStrides[i];
        startInput += idx * m_preservedStrides[i];
        index -= idx * m_outputStrides[i];
      }
      if (PreservingInnerMostDims) {
        eigen_assert(m_preservedStrides[NumOutputDims - 1] == 1);
        startInput += index;
      } else {
        startInput += index * m_preservedStrides[NumOutputDims - 1];
      }
    }
    return startInput;
  }
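  // Worked example (added for clarity, not in the original source): for a
  // ColMajor input of shape (4, 5, 6) reducing dimension 1, the output has
  // shape (4, 6), m_outputStrides = {1, 4}, m_preservedStrides = {1, 20} and
  // m_reducedStrides = {4}. Output index 9 corresponds to output coordinate
  // (1, 2), so firstInput(9) = 2 * 20 + 1 * 1 = 41, i.e. input coordinate
  // (1, 0, 2); GenericDimReducer then visits inputs 41, 45, 49, 53, 57.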
  // Dimensions of the output of the reduction.
  Dimensions m_dimensions;
  // Precomputed strides of the output tensor.
  array<Index, NumOutputDims> m_outputStrides;
  // Strides of the preserved (non-reduced) input dimensions, indexed by output dimension.
  array<Index, NumOutputDims> m_preservedStrides;
  // Strides of the reduced input dimensions.
  array<Index, NumReducedDims> m_reducedStrides;
  // Sizes of the reduced input dimensions.
  array<Index, NumReducedDims> m_reducedDims;

  // Evaluator for the expression being reduced.
  TensorEvaluator<ArgType, Device> m_impl;

  // Functor applied to compute the reduction.
  Op m_reducer;

#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
  static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
#else
  static const bool RunningOnGPU = false;
#endif

  CoeffReturnType* m_result;

  const Device& m_device;
};

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H