TensorConvolution.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H

namespace Eigen {

namespace internal {

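// IndexMapper: helper used by the GPU path below. It reorders the tensor
// dimensions so that the NumKernelDims convolved dimensions come first
// (ColMajor) or last (RowMajor), and precomputes the strides needed to map a
// flattened "plane" index and an intra-kernel index back to offsets in the
// original input and output tensors.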
template <typename Index, typename InputDims, size_t NumKernelDims, int Layout>
class IndexMapper {
 public:
  IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
              const array<Index, NumKernelDims>& indices) {

    array<Index, NumDims> dimensions = input_dims;
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = indices[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      dimensions[index] = result_dim;
    }

    array<Index, NumDims> inputStrides;
    array<Index, NumDims> outputStrides;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      inputStrides[0] = 1;
      outputStrides[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
        outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
      }
    } else {
      inputStrides[NumDims - 1] = 1;
      outputStrides[NumDims - 1] = 1;
      for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
        inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
        outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
      }
    }

    array<Index, NumDims> cudaInputDimensions;
    array<Index, NumDims> cudaOutputDimensions;
    array<Index, NumDims> tmp = dimensions;
    array<Index, NumDims> ordering;
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = i + offset;
      ordering[index] = indices[i];
      tmp[indices[i]] = -1;
      cudaInputDimensions[index] = input_dims[indices[i]];
      cudaOutputDimensions[index] = dimensions[indices[i]];
    }

    int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                      ? NumKernelDims
                      : 0;
    for (int i = 0; i < NumDims; ++i) {
      if (tmp[i] >= 0) {
        ordering[written] = i;
        cudaInputDimensions[written] = input_dims[i];
        cudaOutputDimensions[written] = dimensions[i];
        ++written;
      }
    }

    for (int i = 0; i < NumDims; ++i) {
      m_inputStrides[i] = inputStrides[ordering[i]];
      m_outputStrides[i] = outputStrides[ordering[i]];
    }

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumDims; ++i) {
        if (i > NumKernelDims) {
          m_cudaInputStrides[i] =
              m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1];
          m_cudaOutputStrides[i] =
              m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1];
        } else {
          m_cudaInputStrides[i] = 1;
          m_cudaOutputStrides[i] = 1;
        }
      }
    } else {
      for (int i = NumDims - 1; i >= 0; --i) {
        if (i + 1 < offset) {
          m_cudaInputStrides[i] =
              m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
          m_cudaOutputStrides[i] =
              m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1];
        } else {
          m_cudaInputStrides[i] = 1;
          m_cudaOutputStrides[i] = 1;
        }
      }
    }
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
    Index inputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_cudaInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_cudaInputStrides[d];
      }
      inputIndex += p * m_inputStrides[NumKernelDims];
    } else {
      int limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_cudaInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_cudaInputStrides[d];
      }
      inputIndex += p * m_inputStrides[limit];
    }
    return inputIndex;
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
    Index outputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_cudaOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_cudaOutputStrides[d];
      }
      outputIndex += p * m_outputStrides[NumKernelDims];
    } else {
      int limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_cudaOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_cudaOutputStrides[d];
      }
      outputIndex += p * m_outputStrides[limit];
    }
    return outputIndex;
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
           k * m_inputStrides[offset + 2];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
           k * m_outputStrides[offset + 2];
  }

 private:
  static const size_t NumDims = internal::array_size<InputDims>::value;
  array<Index, NumDims> m_inputStrides;
  array<Index, NumDims> m_outputStrides;
  array<Index, NumDims> m_cudaInputStrides;
  array<Index, NumDims> m_cudaOutputStrides;
};

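// Standard expression-template boilerplate: traits, eval and nested
// specializations so that TensorConvolutionOp can participate in the tensor
// expression machinery. The scalar and storage kinds of the input and kernel
// expressions are promoted to a common type.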
template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
{
  // Type promotion to handle the case where the types of the lhs and the rhs are different.
  typedef typename promote_storage_type<typename InputXprType::Scalar,
                                        typename KernelXprType::Scalar>::ret Scalar;
  typedef typename packet_traits<Scalar>::type Packet;
  typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
                                        typename traits<KernelXprType>::StorageKind>::ret StorageKind;
  typedef typename promote_index_type<typename traits<InputXprType>::Index,
                                      typename traits<KernelXprType>::Index>::type Index;
  typedef typename InputXprType::Nested LhsNested;
  typedef typename KernelXprType::Nested RhsNested;
  typedef typename remove_reference<LhsNested>::type _LhsNested;
  typedef typename remove_reference<RhsNested>::type _RhsNested;
  static const int NumDimensions = traits<InputXprType>::NumDimensions;
  static const int Layout = traits<InputXprType>::Layout;

  enum {
    Flags = 0,
  };
};

template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
{
  typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
};

template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
{
  typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
};

} // end namespace internal

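// Expression node representing the convolution of an input expression with a
// kernel expression along the dimensions listed in `Indices`. It only stores
// the two nested expressions and the convolution dimensions; the work happens
// in the evaluators below. Typical usage goes through the convolve() method
// on tensor expressions, roughly (a sketch with hypothetical sizes):
//
//   Eigen::Tensor<float, 3> input(20, 30, 40), result(20, 28, 36);
//   Eigen::Tensor<float, 2> kernel(3, 5);
//   Eigen::array<Eigen::DenseIndex, 2> dims({1, 2});
//   result = input.convolve(kernel, dims);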
template<typename Indices, typename InputXprType, typename KernelXprType>
class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType> >
{
  public:
    typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
    typedef typename Eigen::internal::traits<TensorConvolutionOp>::Packet Packet;
    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
    typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
                                                    typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
    typedef typename internal::promote_storage_type<typename InputXprType::PacketReturnType,
                                                    typename KernelXprType::PacketReturnType>::ret PacketReturnType;
    typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
    typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
    typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
        : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const Indices& indices() const { return m_indices; }

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const typename internal::remove_all<typename InputXprType::Nested>::type&
    inputExpression() const { return m_input_xpr; }

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const typename internal::remove_all<typename KernelXprType::Nested>::type&
    kernelExpression() const { return m_kernel_xpr; }

  protected:
    typename InputXprType::Nested m_input_xpr;
    typename KernelXprType::Nested m_kernel_xpr;
    const Indices m_indices;
};

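// Generic (CPU) evaluator. It evaluates the convolution coefficient by
// coefficient: firstInput() locates the first input element contributing to
// an output coefficient, and convolve()/convolvePacket() then recurse over
// the kernel dimensions to accumulate the result.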
template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
{
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
  static const int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;

  enum {
    IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
    PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
    Layout = TensorEvaluator<InputArgType, Device>::Layout,
    CoordAccess = false,  // to be implemented
  };

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
  {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_inputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
      }
    } else {
      m_inputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
      }
    }

    m_dimensions = m_inputImpl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumKernelDims; ++i) {
        const Index index = op.indices()[i];
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i > 0) {
          m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
        } else {
          m_kernelStride[0] = 1;
        }
        m_indexStride[i] = m_inputStride[index];
      }

      m_outputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
      }
    } else {
      for (int i = NumKernelDims - 1; i >= 0; --i) {
        const Index index = op.indices()[i];
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i < NumKernelDims - 1) {
          m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
        } else {
          m_kernelStride[NumKernelDims - 1] = 1;
        }
        m_indexStride[i] = m_inputStride[index];
      }

      m_outputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
      }
    }
  }

  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename XprType::PacketReturnType PacketReturnType;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    preloadKernel();
    return true;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

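  // Evaluates every coefficient of the convolution and accumulates it into
  // the provided buffer (note the +=: existing buffer contents are added to,
  // not overwritten).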
  void evalTo(typename XprType::Scalar* buffer) {
    evalSubExprsIfNeeded(NULL);
    for (int i = 0; i < dimensions().TotalSize(); ++i) {
      buffer[i] += coeff(i);
    }
    cleanup();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    CoeffReturnType result = CoeffReturnType(0);
    convolve(firstInput(index), 0, NumKernelDims-1, result);
    return result;
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
  {
    const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
    Index indices[2] = {index, index+PacketSize-1};
    Index startInputs[2] = {0, 0};
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx0 = indices[0] / m_outputStride[i];
        const Index idx1 = indices[1] / m_outputStride[i];
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx0 = indices[0] / m_outputStride[i];
        const Index idx1 = indices[1] / m_outputStride[i];
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      }
    }
    startInputs[0] += indices[0];
    startInputs[1] += indices[1];

    if (startInputs[1]-startInputs[0] == PacketSize-1) {
      PacketReturnType result = internal::pset1<PacketReturnType>(0);
      convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
      return result;
    } else {
      EIGEN_ALIGN_MAX Scalar data[PacketSize];
      data[0] = Scalar(0);
      convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
      for (int i = 1; i < PacketSize-1; ++i) {
        data[i] = Scalar(0);
        convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
      }
      data[PacketSize-1] = Scalar(0);
      convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
      return internal::pload<PacketReturnType>(data);
    }
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

 private:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    Index startInput = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      }
    }
    startInput += index;
    return startInput;
  }

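  // Recursively walk the kernel dimensions: for each position of the kernel
  // along the current dimension, either recurse into the next dimension or,
  // at dimension 0, accumulate input * kernel into `accum`.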
  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
      if (DimIndex > 0) {
        convolve(input, kernel, DimIndex-1, accum);
      } else {
        accum += m_inputImpl.coeff(input) * m_kernel[kernel];
      }
    }
  }

  template <typename Packet>
  EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
      if (DimIndex > 0) {
        convolvePacket(input, kernel, DimIndex-1, accum);
      } else {
        accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated)
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<Device, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device);

      m_kernel = local;
      m_local_kernel = true;
    }
  }

  array<Index, NumDims> m_inputStride;
  array<Index, NumDims> m_outputStride;

  array<Index, NumKernelDims> m_indexStride;
  array<Index, NumKernelDims> m_kernelStride;
  TensorEvaluator<InputArgType, Device> m_inputImpl;
  TensorEvaluator<KernelArgType, Device> m_kernelImpl;
  Dimensions m_dimensions;

  KernelArgType m_kernelArg;
  const Scalar* m_kernel;
  bool m_local_kernel;
  const Device& m_device;
};




// Use an optimized implementation of the evaluation code for GPUs whenever possible.
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)

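// GetKernelSize returns the kernel size as a compile-time constant when it is
// known statically (so the loops below can be fully unrolled), and falls back
// to the runtime value when the size is Dynamic.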
template <int StaticKernelSize>
struct GetKernelSize {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
    return StaticKernelSize;
  }
};
template <>
struct GetKernelSize<Dynamic> {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
    return kernelSize;
  }
};

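// 1D convolution kernel: each block loads a tile of the input (plus the
// kernel footprint) into shared memory, then every thread computes one or
// more output coefficients from that tile.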
template <typename InputEvaluator, typename Index, typename InputDims,
          int StaticKernelSize>
__global__ void EigenConvolutionKernel1D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
        indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int kernelSize, float* buffer) {
  extern __shared__ float s[];

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
  const int num_x_output = last_x - first_x + 1;

  const int first_plane = blockIdx.y * blockDim.y;
  const int plane_stride = blockDim.y * gridDim.y;

  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
    // Load inputs to shared memory
    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.y * num_x_input;
    #pragma unroll
    for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
      const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x);
      s[i + plane_kernel_offset] = eval.coeff(tensor_index);
    }

    __syncthreads();

    // Compute the convolution
    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);

    #pragma unroll
    for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
      const int kernel_offset = plane_kernel_offset + i;
      float result = 0.0f;
      #pragma unroll
      for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
        result += s[k + kernel_offset] * kernel[k];
      }
      const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x);
      buffer[tensor_index] = result;
    }
    __syncthreads();
  }
};

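// 2D convolution kernel: same tiling strategy as the 1D case, with a
// (maxX + kernelSizeX - 1) x (maxY + kernelSizeY - 1) shared-memory tile per
// plane handled by the block.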
template <typename InputEvaluator, typename Index, typename InputDims,
          int StaticKernelSizeX, int StaticKernelSizeY>
__global__ void EigenConvolutionKernel2D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
        indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int numY, const int maxY, const int kernelSizeX,
    const int kernelSizeY, float* buffer) {
  extern __shared__ float s[];

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
  const int num_x_output = last_x - first_x + 1;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
  const int num_y_output = last_y - first_y + 1;

  const int first_plane = blockIdx.z * blockDim.z;
  const int plane_stride = blockDim.z * gridDim.z;

  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {

    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.z * num_y_input;

    // Load inputs to shared memory
    #pragma unroll
    for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
      const int input_offset = num_x_input * (j + plane_kernel_offset);
      #pragma unroll
      for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
        const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y);
        s[i + input_offset] = eval.coeff(tensor_index);
      }
    }

    __syncthreads();

    // Convolution
    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);

    #pragma unroll
    for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
      #pragma unroll
      for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
        float result = 0.0f;
        #pragma unroll
        for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
          const int kernel_offset = kernelSizeX * l;
          const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
          #pragma unroll
          for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
            result += s[k + input_offset] * kernel[k + kernel_offset];
          }
        }
        const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
        buffer[tensor_index] = result;
      }
    }

    __syncthreads();
  }
};

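// 3D convolution kernel: each block iterates over all planes sequentially,
// loading a 3D shared-memory tile per plane and computing the corresponding
// block of outputs.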
template <typename InputEvaluator, typename Index, typename InputDims>
__global__ void EigenConvolutionKernel3D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
        indexMapper,
    const float* __restrict kernel, const size_t numPlanes, const size_t numX,
    const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
    const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
    const size_t kernelSizeZ, float* buffer) {
  extern __shared__ float s[];

  // Load inputs to shared memory
  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + kernelSizeX;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + kernelSizeY;

  const int first_z = blockIdx.z * maxZ;
  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
  const int num_z_input = last_z - first_z + kernelSizeZ;

  for (int p = 0; p < numPlanes; ++p) {

    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = 0;

    for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
          const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
          s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
        }
      }
    }

    __syncthreads();

    // Convolution
    const int num_z_output = last_z - first_z + 1;
    const int num_y_output = last_y - first_y + 1;
    const int num_x_output = last_x - first_x + 1;
    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);

    for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
          float result = 0.0f;
          for (int n = 0; n < kernelSizeZ; ++n) {
            for (int m = 0; m < kernelSizeY; ++m) {
              for (int l = 0; l < kernelSizeX; ++l) {
                result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
              }
            }
          }
          const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
          buffer[tensor_index] = result;
        }
      }
    }
    __syncthreads();
  }
};


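// GPU evaluator: instead of evaluating coefficients on demand, it launches
// one of the kernels above to precompute the whole result into m_buf (or
// directly into the destination buffer when one is provided), and coeff()/
// packet() then simply read from that buffer.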
template<typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
{
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
  static const int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;

  enum {
    IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
    PacketAccess = false,
    Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
    CoordAccess = false,  // to be implemented
  };

  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
      : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
  {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    m_dimensions = m_inputImpl.dimensions();
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;
    }
  }

  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename XprType::PacketReturnType PacketReturnType;
  typedef typename InputArgType::Scalar Scalar;

  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
    preloadKernel();
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    if (data) {
      executeEval(data);
      return false;
    } else {
      m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
      executeEval(m_buf);
      return true;
    }
  }

  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_buf) {
      m_device.deallocate(m_buf);
      m_buf = NULL;
    }
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated)
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);

      m_kernel = local;
      m_local_kernel = true;
    }
  }

  static unsigned int ceil(unsigned int num, unsigned int denom) {
    const unsigned int rounded_toward_zero = num / denom;
    if (num > rounded_toward_zero * denom) {
      return rounded_toward_zero + 1;
    }
    return rounded_toward_zero;
  }

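  // Chooses a launch configuration based on the number of kernel dimensions:
  // block and tile sizes are picked so that the shared-memory tile fits in
  // sharedMemPerBlock(), then the matching 1D/2D/3D kernel is launched, with
  // specializations for a few common static kernel sizes (4 and 7).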
  void executeEval(Scalar* data) const {
    typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;

    const int maxSharedMem = m_device.sharedMemPerBlock();
    const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock();
    const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
    const int numMultiProcessors = m_device.getNumCudaMultiProcessors();
    const int warpSize = 32;

    switch (NumKernelDims) {
      case 1: {
        const int kernel_size = m_kernelImpl.dimensions().TotalSize();

        const int numX = dimensions()[m_indices[0]];
        const int numP = dimensions().TotalSize() / numX;
        int maxX;
        dim3 block_size;

        const int single_stride_dim =
            static_cast<int>(Layout) == static_cast<int>(ColMajor)
                ? 0
                : m_inputImpl.dimensions().rank() - 1;
        if (m_indices[0] == single_stride_dim) {
          // Maximize the reuse
          const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
          maxX = numext::mini<int>(inner_dim, numX);
          const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
          block_size.x = numext::mini(maxThreadsPerBlock, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
        }
        else {
          // Read as much as possible along the innermost dimension, that is the plane
          const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
          const int maxP = numext::mini<int>(inner_dim, numP);
          maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);

          block_size.x = numext::mini(warpSize, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
        }

        const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
        assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);

        dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));


        //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;

        const array<Index, 1> indices(m_indices[0]);
        const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);
        switch(kernel_size) {
          case 4: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
            break;
          }
          case 7: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
            break;
          }
          default: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
          }
        }
        break;
      }

      case 2: {
        const int idxX =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
        const int idxY =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numP = dimensions().TotalSize() / (numX*numY);

        const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));

        // Snap maxX to warp size
        int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
        const int maxX = numext::mini<int>(inner_dim, numX);
        const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
        const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);

        dim3 block_size;
        block_size.x = numext::mini(1024, maxX);
        block_size.y = numext::mini<int>(1024/block_size.x, maxY);
        block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);

        const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
        assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int num_y_blocks = ceil(numY, maxY);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);

        dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));


        //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;

        const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
        const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
                                          m_kernelImpl.dimensions()[idxY]);
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);
        switch (kernel_size_x) {
          case 4: {
            switch (kernel_size_y) {
              case 7: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
                break;
              }
              default: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          case 7: {
            switch (kernel_size_y) {
              case 4: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
                break;
              }
              default: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          default: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
            break;
          }
        }
        break;
      }

      case 3: {
        const int idxX =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
        const int idxY =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
        const int idxZ =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;

        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
        const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numZ = dimensions()[m_indices[idxZ]];
        const int numP = dimensions().TotalSize() / (numX*numY*numZ);

        const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
        const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
        const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));

        dim3 block_size;
        block_size.x = numext::mini(32, maxX);
        block_size.y = numext::mini(32, maxY);
        block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
        dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));

        const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
        assert(shared_mem <= maxSharedMem);

        //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
        const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
                                      m_indices[idxZ]);
        const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
                                          m_kernelImpl.dimensions()[idxY],
                                          m_kernelImpl.dimensions()[idxZ]);
        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);

        LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
        break;
      }

      default: {
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    eigen_assert(m_buf);
    eigen_assert(index < m_dimensions.TotalSize());
    return m_buf[index];
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
  {
    eigen_assert(m_buf);
    eigen_assert(index < m_dimensions.TotalSize());
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
  }

 private:
  // No assignment (copies are needed by the kernels)
  TensorEvaluator& operator = (const TensorEvaluator&);

  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
  KernelArgType m_kernelArg;
  Indices m_indices;
  Dimensions m_dimensions;
  Scalar* m_buf;
  const Scalar* m_kernel;
  bool m_local_kernel;

  const GpuDevice& m_device;
};
#endif


} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H