TensorContractionCuda.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 // Copyright (C) 2015 Navdeep Jaitly <ndjaitly@google.com>
6 // Copyright (C) 2014 Eric Martin <eric@ericmart.in>
7 //
8 // This Source Code Form is subject to the terms of the Mozilla
9 // Public License v. 2.0. If a copy of the MPL was not distributed
10 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
11 
12 #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
13 #define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
14 
15 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
16 
17 namespace Eigen {
18 
19 template<typename Scalar, typename Index, typename LhsMapper,
20  typename RhsMapper, typename OutputMapper, bool needs_edge_check>
21 __device__ EIGEN_STRONG_INLINE void
22 EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
23  const OutputMapper output, volatile Scalar* lhs_shmem, volatile Scalar* rhs_shmem,
24  const Index m_size, const Index n_size, const Index k_size) {
25 
26  const Index m_block_idx = blockIdx.x;
27  const Index n_block_idx = blockIdx.y;
28 
29  const Index base_m = 64 * m_block_idx;
30  const Index base_n = 64 * n_block_idx;
31 
32  // declare and initialize 64 registers for output 8x8 block
33 
34  // prefetch registers
35  Scalar lhs_pf0;
36  Scalar lhs_pf1;
37  Scalar lhs_pf2;
38  Scalar lhs_pf3;
39  Scalar lhs_pf4;
40  Scalar lhs_pf5;
41  Scalar lhs_pf6;
42  Scalar lhs_pf7;
43 
44  Scalar rhs_pf0;
45  Scalar rhs_pf1;
46  Scalar rhs_pf2;
47  Scalar rhs_pf3;
48  Scalar rhs_pf4;
49  Scalar rhs_pf5;
50  Scalar rhs_pf6;
51  Scalar rhs_pf7;
52 
53  // shared memory is formatted
54  // (contract idx in block, nocontract idx in block, block idx)
55  // where block idx is column major. This transposition limits the number of
56  // bank conflicts when reading the LHS. The core idea is that since the contracting
57  // index is shared by both sides, the contracting index should be in threadIdx.x.
58 
59  // On the LHS, we pad each row inside of each block with an extra element. This makes
60  // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts
61  // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks.
62 
63  // On the RHS we just add 8 padding elements to the end of each block. This gives no bank
64  // conflicts on writes and also none on reads.
65 
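  // Rough illustration (assuming 4-byte Scalars and 32 four-byte shared-memory banks):
  // the threads of one warp share threadIdx.z and span all 8 values of threadIdx.x plus 4
  // consecutive values of threadIdx.y, so their LHS writes land at offsets 72 * y + 9 * x
  // (plus a constant), which hit 32 distinct banks. Without the padding (stride 8, i.e.
  // 64 * y + 8 * x) only banks {0, 8, 16, 24} would be hit, an 8-way conflict. The exact
  // counts differ for 8-byte scalars or bank mode, so treat this as a sketch only.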
66  // storage indices
67  const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z;
68  const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x;
69 
70  const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0;
71  const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1;
72  const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2;
73  const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3;
74  const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4;
75  const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5;
76  const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6;
77  const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7;
78 
79  const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0;
80  const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1;
81  const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2;
82  const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3;
83  const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4;
84  const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5;
85  const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6;
86  const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7;
87 
88  // in the loading code, the following variables are important:
89  // threadIdx.x: the vertical position in an 8x8 block
90  // threadIdx.y: the vertical index of the 8x8 block in the grid
91  // threadIdx.z: the horizontal position in an 8x8 block
92  // k: the horizontal index of the 8x8 block in the grid
93  //
94  // The k parameter is implicit (it was the loop counter for a loop that ran
95  // from 0 to 7, but that loop has been unrolled in the code below).
96 
97  const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y;
98  const Index lhs_vert = base_m + load_idx_vert;
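  // Mapping sketch (no new logic): with blockDim = (8, 8, 8), load_idx_vert runs over
  // 0..63, so the block's threads cover all 64 rows of the tile, while each of the 8
  // threadIdx.z "columns" fetches 8 of the 64 k values per prefetchIntoRegisters() call
  // (one per lhs_pf0..lhs_pf7 register).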
99 
100 #define prefetchIntoRegisters(base_k) \
101  { \
102  lhs_pf0 = Scalar(0); \
103  lhs_pf1 = Scalar(0); \
104  lhs_pf2 = Scalar(0); \
105  lhs_pf3 = Scalar(0); \
106  lhs_pf4 = Scalar(0); \
107  lhs_pf5 = Scalar(0); \
108  lhs_pf6 = Scalar(0); \
109  lhs_pf7 = Scalar(0); \
110  \
111  rhs_pf0 = Scalar(0); \
112  rhs_pf1 = Scalar(0); \
113  rhs_pf2 = Scalar(0); \
114  rhs_pf3 = Scalar(0); \
115  rhs_pf4 = Scalar(0); \
116  rhs_pf5 = Scalar(0); \
117  rhs_pf6 = Scalar(0); \
118  rhs_pf7 = Scalar(0); \
119  \
120  if (!needs_edge_check || lhs_vert < m_size) { \
121  const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \
122  const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \
123  const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \
124  const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \
125  const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \
126  const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \
127  const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \
128  const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \
129  \
130  if (!needs_edge_check || lhs_horiz_7 < k_size) { \
131  lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
132  lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
133  lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
134  lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
135  lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
136  lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
137  lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \
138  lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \
139  } else if (lhs_horiz_6 < k_size) { \
140  lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
141  lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
142  lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
143  lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
144  lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
145  lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
146  lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \
147  } else if (lhs_horiz_5 < k_size) { \
148  lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
149  lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
150  lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
151  lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
152  lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
153  lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
154  } else if (lhs_horiz_4 < k_size) { \
155  lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
156  lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
157  lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
158  lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
159  lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
160  } else if (lhs_horiz_3 < k_size) { \
161  lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
162  lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
163  lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
164  lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
165  } else if (lhs_horiz_2 < k_size) { \
166  lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
167  lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
168  lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
169  } else if (lhs_horiz_1 < k_size) { \
170  lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
171  lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
172  } else if (lhs_horiz_0 < k_size) { \
173  lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
174  } \
175  } \
176  \
177  const Index rhs_vert = base_k + load_idx_vert; \
178  if (!needs_edge_check || rhs_vert < k_size) { \
179  const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \
180  const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \
181  const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \
182  const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \
183  const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \
184  const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \
185  const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \
186  const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \
187  \
188  if (rhs_horiz_7 < n_size) { \
189  rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
190  rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
191  rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
192  rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
193  rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
194  rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
195  rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \
196  rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \
197  } else if (rhs_horiz_6 < n_size) { \
198  rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
199  rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
200  rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
201  rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
202  rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
203  rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
204  rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \
205  } else if (rhs_horiz_5 < n_size) { \
206  rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
207  rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
208  rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
209  rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
210  rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
211  rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
212  } else if (rhs_horiz_4 < n_size) { \
213  rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
214  rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
215  rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
216  rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
217  rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
218  } else if (rhs_horiz_3 < n_size) { \
219  rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
220  rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
221  rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
222  rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
223  } else if (rhs_horiz_2 < n_size) { \
224  rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
225  rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
226  rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
227  } else if (rhs_horiz_1 < n_size) { \
228  rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
229  rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
230  } else if (rhs_horiz_0 < n_size) { \
231  rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
232  } \
233  } \
234  } \
235 
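  // The descending if/else ladders above are just an unrolled, branch-light form of the
  // following sketch (lhs_pf[p] stands for the separate lhs_pf0..lhs_pf7 registers):
  //
  //   for (int p = 0; p < 8; ++p) {
  //     const Index h = base_k + threadIdx.z + p * 8;
  //     if (!needs_edge_check || h < k_size) lhs_pf[p] = lhs(lhs_vert, h);
  //   }
  //
  // Because the h values are increasing, a single comparison against the largest in-range
  // one decides how many loads to issue, which is what the ladder exploits.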
236 #define writeRegToShmem(_) \
237  lhs_shmem[lhs_store_idx_0] = lhs_pf0; \
238  rhs_shmem[rhs_store_idx_0] = rhs_pf0; \
239  \
240  lhs_shmem[lhs_store_idx_1] = lhs_pf1; \
241  rhs_shmem[rhs_store_idx_1] = rhs_pf1; \
242  \
243  lhs_shmem[lhs_store_idx_2] = lhs_pf2; \
244  rhs_shmem[rhs_store_idx_2] = rhs_pf2; \
245  \
246  lhs_shmem[lhs_store_idx_3] = lhs_pf3; \
247  rhs_shmem[rhs_store_idx_3] = rhs_pf3; \
248  \
249  lhs_shmem[lhs_store_idx_4] = lhs_pf4; \
250  rhs_shmem[rhs_store_idx_4] = rhs_pf4; \
251  \
252  lhs_shmem[lhs_store_idx_5] = lhs_pf5; \
253  rhs_shmem[rhs_store_idx_5] = rhs_pf5; \
254  \
255  lhs_shmem[lhs_store_idx_6] = lhs_pf6; \
256  rhs_shmem[rhs_store_idx_6] = rhs_pf6; \
257  \
258  lhs_shmem[lhs_store_idx_7] = lhs_pf7; \
259  rhs_shmem[rhs_store_idx_7] = rhs_pf7; \
260 
261  // declare and initialize result array
262 #define res(i, j) _res_##i##j
263 #define initResultRow(i) \
264  Scalar res(i, 0) = Scalar(0); \
265  Scalar res(i, 1) = Scalar(0); \
266  Scalar res(i, 2) = Scalar(0); \
267  Scalar res(i, 3) = Scalar(0); \
268  Scalar res(i, 4) = Scalar(0); \
269  Scalar res(i, 5) = Scalar(0); \
270  Scalar res(i, 6) = Scalar(0); \
271  Scalar res(i, 7) = Scalar(0); \
272 
273  initResultRow(0);
274  initResultRow(1);
275  initResultRow(2);
276  initResultRow(3);
277  initResultRow(4);
278  initResultRow(5);
279  initResultRow(6);
280  initResultRow(7);
281 #undef initResultRow
282 
283  for (Index base_k = 0; base_k < k_size; base_k += 64) {
284  // wait for the previous iteration to finish with shmem. Counterintuitively,
285  // the code is a bit faster with the sync here than at the bottom of the loop.
286  __syncthreads();
287 
288  prefetchIntoRegisters(base_k);
289  writeRegToShmem();
290 
291  #undef prefetchIntoRegisters
292  #undef writeRegToShmem
293 
294  // wait for shared mem packing to be done before starting computation
295  __syncthreads();
296 
297  // compute 8x8 matrix product by outer product. This involves packing one column
298  // of LHS and one row of RHS into registers (takes 16 registers).
299 
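  // For illustration: one computePass(p) below performs the rank-1 update
  //   res(i, j) += lcol(i) * rrow(j)   for all i, j in 0..7,
  // i.e. 64 fused multiply-adds from the 16 registers filled by loadData(). The 8 passes
  // handle this thread's 8-value slice of the 64-wide k block; the remaining slices are
  // owned by the other threadIdx.x lanes and merged by the reduction further below.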
300 #define lcol(i) _lcol##i
301  Scalar lcol(0);
302  Scalar lcol(1);
303  Scalar lcol(2);
304  Scalar lcol(3);
305  Scalar lcol(4);
306  Scalar lcol(5);
307  Scalar lcol(6);
308  Scalar lcol(7);
309 
310 #define rrow(j) _rrow##j
311  Scalar rrow(0);
312  Scalar rrow(1);
313  Scalar rrow(2);
314  Scalar rrow(3);
315  Scalar rrow(4);
316  Scalar rrow(5);
317  Scalar rrow(6);
318  Scalar rrow(7);
319 
320  // Now x corresponds to k, y to m, and z to n
321  const volatile Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
322  const volatile Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
323 
324 #define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
325 #define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
326 
327 #define loadData(i, j) \
328  lcol(0) = lhs_element(0, j); \
329  rrow(0) = rhs_element(i, 0); \
330  lcol(1) = lhs_element(1, j); \
331  rrow(1) = rhs_element(i, 1); \
332  lcol(2) = lhs_element(2, j); \
333  rrow(2) = rhs_element(i, 2); \
334  lcol(3) = lhs_element(3, j); \
335  rrow(3) = rhs_element(i, 3); \
336  lcol(4) = lhs_element(4, j); \
337  rrow(4) = rhs_element(i, 4); \
338  lcol(5) = lhs_element(5, j); \
339  rrow(5) = rhs_element(i, 5); \
340  lcol(6) = lhs_element(6, j); \
341  rrow(6) = rhs_element(i, 6); \
342  lcol(7) = lhs_element(7, j); \
343  rrow(7) = rhs_element(i, 7); \
344 
345 #define computeCol(j) \
346  res(0, j) += lcol(0) * rrow(j); \
347  res(1, j) += lcol(1) * rrow(j); \
348  res(2, j) += lcol(2) * rrow(j); \
349  res(3, j) += lcol(3) * rrow(j); \
350  res(4, j) += lcol(4) * rrow(j); \
351  res(5, j) += lcol(5) * rrow(j); \
352  res(6, j) += lcol(6) * rrow(j); \
353  res(7, j) += lcol(7) * rrow(j); \
354 
355 #define computePass(i) \
356  loadData(i, i); \
357  \
358  computeCol(0); \
359  computeCol(1); \
360  computeCol(2); \
361  computeCol(3); \
362  computeCol(4); \
363  computeCol(5); \
364  computeCol(6); \
365  computeCol(7); \
366 
367  computePass(0);
368  computePass(1);
369  computePass(2);
370  computePass(3);
371  computePass(4);
372  computePass(5);
373  computePass(6);
374  computePass(7);
375 
376 #undef lcol
377 #undef rrow
378 #undef lhs_element
379 #undef rhs_element
380 #undef loadData
381 #undef computeCol
382 #undef computePass
383  } // end loop over k
384 
385  // we've now iterated over all of the large (ie width 64) k blocks and
386  // accumulated results in registers. At this point thread (x, y, z) contains
387  // the sum across all big k blocks of the product of little k block of index (x, y)
388  // with the little k block of index (x, z). To compute the final output, we need to
389  // reduce over the 8 threads in x by summation.
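  // Sketch of the butterfly reduction used below: with __shfl_xor masks 1, 2 and 4, each
  // lane repeatedly adds the value held by the lane whose threadIdx.x differs in one bit,
  // e.g. for a single accumulator v standing in for res(i, j):
  //   v += __shfl_xor(v, 1);  // pairs of lanes now agree
  //   v += __shfl_xor(v, 2);  // groups of 4 agree
  //   v += __shfl_xor(v, 4);  // all 8 threadIdx.x lanes hold the full sum
  // (On CUDA 9 and later this maps to __shfl_xor_sync; the code here uses the older intrinsic.)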
390 #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
391 
392 #define reduceRow(i, mask) \
393  shuffleInc(i, 0, mask); \
394  shuffleInc(i, 1, mask); \
395  shuffleInc(i, 2, mask); \
396  shuffleInc(i, 3, mask); \
397  shuffleInc(i, 4, mask); \
398  shuffleInc(i, 5, mask); \
399  shuffleInc(i, 6, mask); \
400  shuffleInc(i, 7, mask); \
401 
402 #define reduceMatrix(mask) \
403  reduceRow(0, mask); \
404  reduceRow(1, mask); \
405  reduceRow(2, mask); \
406  reduceRow(3, mask); \
407  reduceRow(4, mask); \
408  reduceRow(5, mask); \
409  reduceRow(6, mask); \
410  reduceRow(7, mask); \
411 
412  // actually perform the reduction, now each thread of index (_, y, z)
413  // contains the correct values in its registers that belong in the output
414  // block
415  reduceMatrix(1);
416  reduceMatrix(2);
417  reduceMatrix(4);
418 
419 #undef shuffleInc
420 #undef reduceRow
421 #undef reduceMatrix
422 
423  // now we need to copy the 64 values into main memory. We can't split work
424  // among threads because all variables are in registers. There are two ways
425  // to do this:
426  // (1) have 1 thread do 64 writes from registers into global memory
427  // (2) have 1 thread do 64 writes into shared memory, and then 8 threads
428  // each do 8 writes into global memory. We can just overwrite the shared
429  // memory from the problem we just solved.
430  // (2) is slightly faster than (1) due to less branching and more ILP
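  // Staging layout used below, as a sketch: thread (0, y, z) writes res(i, j) to shmem
  // slot i + 8 * y + 64 * z + 512 * j; afterwards thread (x, y, z) reads slot
  // x + 8 * y + 64 * z + 512 * j and stores it to output row base_m + y + 8 * x,
  // column base_n + z + 8 * j, so the 64 values end up written by 8 threads instead of 1.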
431 
432  // TODO: won't yield much gain, but could just use currently unused shared mem
433  // and then we won't have to sync
434  // wait for shared mem to be out of use
435  __syncthreads();
436 
437 #define writeResultShmem(i, j) \
438  lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \
439 
440 #define writeRow(i) \
441  writeResultShmem(i, 0); \
442  writeResultShmem(i, 1); \
443  writeResultShmem(i, 2); \
444  writeResultShmem(i, 3); \
445  writeResultShmem(i, 4); \
446  writeResultShmem(i, 5); \
447  writeResultShmem(i, 6); \
448  writeResultShmem(i, 7); \
449 
450  if (threadIdx.x == 0) {
451  writeRow(0);
452  writeRow(1);
453  writeRow(2);
454  writeRow(3);
455  writeRow(4);
456  writeRow(5);
457  writeRow(6);
458  writeRow(7);
459  }
460 #undef writeResultShmem
461 #undef writeRow
462 
463  const int max_i_write = (min)((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
464  const int max_j_write = (min)((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
465 
466  if (threadIdx.x < max_i_write) {
467  if (max_j_write == 8) {
468  // TODO: can I trade bank conflicts for coalesced writes?
469  Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0];
470  Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1];
471  Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2];
472  Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3];
473  Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4];
474  Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5];
475  Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6];
476  Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7];
477 
478  output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0;
479  output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1;
480  output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2;
481  output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3;
482  output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4;
483  output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5;
484  output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6;
485  output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7;
486  } else {
487 #pragma unroll 7
488  for (int j = 0; j < max_j_write; j++) {
489  Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j];
490  output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val;
491  }
492  }
493  }
494 #undef res
495 }
496 
497 
498 template<typename Scalar, typename Index, typename LhsMapper,
499  typename RhsMapper, typename OutputMapper>
500 __global__ void
501 __launch_bounds__(512)
502 EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
503  const OutputMapper output,
504  const Index m_size, const Index n_size, const Index k_size) {
505  __shared__ volatile Scalar lhs_shmem[72 * 64];
506  __shared__ volatile Scalar rhs_shmem[72 * 64];
507 
508  const Index m_block_idx = blockIdx.x;
509  const Index n_block_idx = blockIdx.y;
510 
511  const Index base_m = 64 * m_block_idx;
512  const Index base_n = 64 * n_block_idx;
513 
514  if (base_m + 63 < m_size && base_n + 63 < n_size) {
515  EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
516  } else {
517  EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
518  }
519 }
520 
521 
522 template<typename Index, typename LhsMapper,
523  typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
524  bool CHECK_RHS_BOUNDARY>
525 __device__ EIGEN_STRONG_INLINE void
526 EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs,
527  const OutputMapper output, float2 lhs_shmem2[][16],
528  float2 rhs_shmem2[][8], const Index m_size,
529  const Index n_size, const Index k_size,
530  const Index base_m, const Index base_n) {
531  typedef float Scalar;
532 
533  // prefetch registers
534  float4 lhs_pf0, rhs_pf0;
535 
536  float4 results[4];
537  for (int i=0; i < 4; i++) {
538  results[i].x = results[i].y = results[i].z = results[i].w = 0;
539  }
540 
541 
542 #define prefetch_lhs(reg, row, col) \
543  if (!CHECK_LHS_BOUNDARY) { \
544  if (col < k_size) { \
545  reg =lhs.loadPacket(row, col); \
546  } \
547  } else { \
548  if (col < k_size) { \
549  if (row + 3 < m_size) { \
550  reg =lhs.loadPacket(row, col); \
551  } else if (row + 2 < m_size) { \
552  reg.x =lhs(row + 0, col); \
553  reg.y =lhs(row + 1, col); \
554  reg.z =lhs(row + 2, col); \
555  } else if (row + 1 < m_size) { \
556  reg.x =lhs(row + 0, col); \
557  reg.y =lhs(row + 1, col); \
558  } else if (row < m_size) { \
559  reg.x =lhs(row + 0, col); \
560  } \
561  } \
562  } \
563 
564 
565  Index lhs_vert = base_m+threadIdx.x*4;
566 
567  for (Index k = 0; k < k_size; k += 16) {
568  lhs_pf0 = internal::pset1<float4>(0);
569  rhs_pf0 = internal::pset1<float4>(0);
570 
571  Index lhs_horiz = threadIdx.y+k;
572  prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz)
573 
574  Index rhs_vert = k+(threadIdx.x%4)*4;
575  Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n;
576 
577  if (!CHECK_RHS_BOUNDARY) {
578  if ((rhs_vert + 3) < k_size) {
579  // just CHECK_RHS_BOUNDARY
580  rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0);
581  } else if (rhs_vert + 2 < k_size) {
582  // just CHECK_RHS_BOUNDARY
583  rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
584  rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
585  rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
586  } else if (rhs_vert + 1 < k_size) {
587  rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
588  rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
589  } else if (rhs_vert < k_size) {
590  rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
591  }
592  } else {
593  if (rhs_horiz0 < n_size) {
594  if ((rhs_vert + 3) < k_size) {
595  rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0);
596  } else if ((rhs_vert + 2) < k_size) {
597  rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
598  rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
599  rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
600  } else if ((rhs_vert + 1) < k_size) {
601  rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
602  rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
603  } else if (rhs_vert < k_size) {
604  rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
605  }
606  }
607  }
608  float x1, x2;
609  // the following can be a bitwise operation... some day.
610  if((threadIdx.x%8) < 4) {
611  x1 = rhs_pf0.y;
612  x2 = rhs_pf0.w;
613  } else {
614  x1 = rhs_pf0.x;
615  x2 = rhs_pf0.z;
616  }
617  x1 = __shfl_xor(x1, 4);
618  x2 = __shfl_xor(x2, 4);
619  if((threadIdx.x%8) < 4) {
620  rhs_pf0.y = x1;
621  rhs_pf0.w = x2;
622  } else {
623  rhs_pf0.x = x1;
624  rhs_pf0.z = x2;
625  }
626 
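  // What the swizzle above achieves (roughly): lanes 4 apart hold the same k indices but
  // adjacent output columns, so exchanging half of each float4 via __shfl_xor(..., 4) lets
  // every make_float2() below pack one k value for two adjacent columns ("features") into
  // a single shared-memory entry, which is the layout the compute loop expects.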
627  // We have 64 features.
628  // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1.
629  // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3.
630  // ...
631  // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63
632  // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1
633  // ...
634  rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y);
635  rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w);
636 
637  // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
638  // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
639  // ...
640  // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
641  // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63)
642  // ...
643 
644  lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y);
645  lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w);
646 
647 
648 #define add_vals(fl1, fl2, fr1, fr2)\
649  results[0].x += fl1.x * fr1.x;\
650  results[0].y += fl1.y * fr1.x;\
651  results[0].z += fl2.x * fr1.x;\
652  results[0].w += fl2.y * fr1.x;\
653 \
654  results[1].x += fl1.x * fr1.y;\
655  results[1].y += fl1.y * fr1.y;\
656  results[1].z += fl2.x * fr1.y;\
657  results[1].w += fl2.y * fr1.y;\
658 \
659  results[2].x += fl1.x * fr2.x;\
660  results[2].y += fl1.y * fr2.x;\
661  results[2].z += fl2.x * fr2.x;\
662  results[2].w += fl2.y * fr2.x;\
663 \
664  results[3].x += fl1.x * fr2.y;\
665  results[3].y += fl1.y * fr2.y;\
666  results[3].z += fl2.x * fr2.y;\
667  results[3].w += fl2.y * fr2.y;\
668 
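  // Accumulator layout in this kernel, for reference: results[i] accumulates output column
  // horiz_base + i (i = 0..3, with horiz_base defined below), and its .x/.y/.z/.w
  // components are the four consecutive output rows lhs_vert .. lhs_vert + 3, i.e. each
  // thread owns a 4x4 tile of the result.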
669  __syncthreads();
670 
671  // Do the multiplies.
672  #pragma unroll
673  for (int koff = 0; koff < 16; koff ++) {
674  // 32 x threads.
675  float2 fl1 = lhs_shmem2[koff][threadIdx.x];
676  float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x];
677 
678  int start_feature = threadIdx.y * 4;
679  float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
680  float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
681 
682  add_vals(fl1, fl2, fr1, fr2)
683  }
684  __syncthreads();
685  }
686 
687 #undef prefetch_lhs
688 #undef add_vals
689 
690  Index horiz_base = threadIdx.y*4+base_n;
691  if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
692  for (int i = 0; i < 4; i++) {
693  output(lhs_vert, horiz_base + i) = results[i].x;
694  output(lhs_vert + 1, horiz_base + i) = results[i].y;
695  output(lhs_vert + 2, horiz_base + i) = results[i].z;
696  output(lhs_vert + 3, horiz_base + i) = results[i].w;
697  }
698  } else if (!CHECK_RHS_BOUNDARY) {
699  // CHECK LHS
700  if (lhs_vert + 3 < m_size) {
701  for (int i = 0; i < 4; i++) {
702  output(lhs_vert, horiz_base + i) = results[i].x;
703  output(lhs_vert + 1, horiz_base + i) = results[i].y;
704  output(lhs_vert + 2, horiz_base + i) = results[i].z;
705  output(lhs_vert + 3, horiz_base + i) = results[i].w;
706  }
707  } else if (lhs_vert + 2 < m_size) {
708  for (int i = 0; i < 4; i++) {
709  output(lhs_vert, horiz_base + i) = results[i].x;
710  output(lhs_vert + 1, horiz_base + i) = results[i].y;
711  output(lhs_vert + 2, horiz_base + i) = results[i].z;
712  }
713  } else if (lhs_vert + 1 < m_size) {
714  for (int i = 0; i < 4; i++) {
715  output(lhs_vert, horiz_base + i) = results[i].x;
716  output(lhs_vert + 1, horiz_base + i) = results[i].y;
717  }
718  } else if (lhs_vert < m_size) {
719  for (int i = 0; i < 4; i++) {
720  output(lhs_vert, horiz_base + i) = results[i].x;
721  }
722  }
723  } else if (!CHECK_LHS_BOUNDARY) {
724  // CHECK RHS
725  /*
726  int ncols_rem = fminf(n_size- horiz_base, 4);
727  for (int i = 0; i < ncols_rem; i++) {
728  output(lhs_vert, horiz_base + i) = results[i].x;
729  output(lhs_vert + 1, horiz_base + i) = results[i].y;
730  output(lhs_vert + 2, horiz_base + i) = results[i].z;
731  output(lhs_vert + 3, horiz_base + i) = results[i].w;
732  }*/
733  for (int i = 0; i < 4; i++) {
734  if (horiz_base+i < n_size) {
735  output(lhs_vert, horiz_base + i) = results[i].x;
736  output(lhs_vert + 1, horiz_base + i) = results[i].y;
737  output(lhs_vert + 2, horiz_base + i) = results[i].z;
738  output(lhs_vert + 3, horiz_base + i) = results[i].w;
739  }
740  }
741  } else {
742  // CHECK both boundaries.
743  for (int i = 0; i < 4; i++) {
744  if (horiz_base+i < n_size) {
745  if (lhs_vert < m_size)
746  output(lhs_vert, horiz_base + i) = results[i].x;
747  if (lhs_vert + 1 < m_size)
748  output(lhs_vert + 1, horiz_base + i) = results[i].y;
749  if (lhs_vert + 2 < m_size)
750  output(lhs_vert + 2, horiz_base + i) = results[i].z;
751  if (lhs_vert + 3 < m_size)
752  output(lhs_vert + 3, horiz_base + i) = results[i].w;
753  }
754  }
755  }
756 }
757 
758 
759 template<typename Index, typename LhsMapper,
760  typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
761  bool CHECK_RHS_BOUNDARY>
762 __device__ EIGEN_STRONG_INLINE void
763 EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
764  const OutputMapper output, float2 lhs_shmem2[][32],
765  float2 rhs_shmem2[][8], const Index m_size,
766  const Index n_size, const Index k_size,
767  const Index base_m, const Index base_n) {
768  typedef float Scalar;
769 
770  // prefetch registers
771  float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
772  float4 rhs_pf0, rhs_pf1;
773 
774  float4 results[8];
775  for (int i=0; i < 8; i++) {
776  results[i].x = results[i].y = results[i].z = results[i].w = 0;
777  }
778 
779 
780  Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32;
781  for (Index k = 0; k < k_size; k += 32) {
782  lhs_pf0 = internal::pset1<float4>(0);
783  lhs_pf1 = internal::pset1<float4>(0);
784  lhs_pf2 = internal::pset1<float4>(0);
785  lhs_pf3 = internal::pset1<float4>(0);
786 
787  rhs_pf0 = internal::pset1<float4>(0);
788  rhs_pf1 = internal::pset1<float4>(0);
789 
790  if (!CHECK_LHS_BOUNDARY) {
791  if ((threadIdx.y/4+k+24) < k_size) {
792  lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
793  lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8));
794  lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16));
795  lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24));
796  } else if ((threadIdx.y/4+k+16) < k_size) {
797  lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
798  lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8));
799  lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16));
800  } else if ((threadIdx.y/4+k+8) < k_size) {
801  lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
802  lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8));
803  } else if ((threadIdx.y/4+k) < k_size) {
804  lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
805  }
806  } else {
807  // just CHECK_LHS_BOUNDARY
808  if (lhs_vert + 3 < m_size) {
809  if ((threadIdx.y/4+k+24) < k_size) {
810  lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
811  lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8));
812  lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16));
813  lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24));
814  } else if ((threadIdx.y/4+k+16) < k_size) {
815  lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
816  lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8));
817  lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16));
818  } else if ((threadIdx.y/4+k+8) < k_size) {
819  lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
820  lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8));
821  } else if ((threadIdx.y/4+k) < k_size) {
822  lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k));
823  }
824  } else if (lhs_vert + 2 < m_size) {
825  if ((threadIdx.y/4+k+24) < k_size) {
826  lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
827  lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
828  lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
829  lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
830  lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
831  lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
832  lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
833  lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
834  lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
835  lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
836  lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
837  lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24));
838  } else if ((threadIdx.y/4+k+16) < k_size) {
839  lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
840  lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
841  lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
842  lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
843  lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
844  lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
845  lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
846  lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
847  lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
848  } else if ((threadIdx.y/4+k+8) < k_size) {
849  lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
850  lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
851  lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
852  lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
853  lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
854  lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
855  } else if ((threadIdx.y/4+k) < k_size) {
856  lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
857  lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
858  lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
859  }
860  } else if (lhs_vert + 1 < m_size) {
861  if ((threadIdx.y/4+k+24) < k_size) {
862  lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
863  lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
864  lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
865  lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
866  lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
867  lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
868  lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
869  lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
870  } else if ((threadIdx.y/4+k+16) < k_size) {
871  lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
872  lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
873  lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
874  lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
875  lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
876  lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
877  } else if ((threadIdx.y/4+k+8) < k_size) {
878  lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
879  lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
880  lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
881  lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
882  } else if ((threadIdx.y/4+k) < k_size) {
883  lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
884  lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
885  }
886  } else if (lhs_vert < m_size) {
887  if ((threadIdx.y/4+k+24) < k_size) {
888  lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
889  lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
890  lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
891  lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
892  } else if ((threadIdx.y/4+k+16) < k_size) {
893  lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
894  lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
895  lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
896  } else if ((threadIdx.y/4+k+8) < k_size) {
897  lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
898  lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
899  } else if ((threadIdx.y/4+k) < k_size) {
900  lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
901  }
902  }
903  }
904  __syncthreads();
905  Index rhs_vert = k+threadIdx.x*4;
906  Index rhs_horiz0 = threadIdx.y*2+base_n;
907  Index rhs_horiz1 = threadIdx.y*2+1+base_n;
908  if (!CHECK_RHS_BOUNDARY) {
909  if ((rhs_vert + 3) < k_size) {
910  // just CHECK_RHS_BOUNDARY
911  rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0);
912  rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1);
913  } else if (rhs_vert + 2 < k_size) {
914  // just CHECK_RHS_BOUNDARY
915  rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
916  rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
917  rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
918  rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
919  rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
920  rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
921  } else if (rhs_vert + 1 < k_size) {
922  rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
923  rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
924  rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
925  rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
926  } else if (rhs_vert < k_size) {
927  rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
928  rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
929  }
930  } else {
931  if (rhs_horiz1 < n_size) {
932  if ((rhs_vert + 3) < k_size) {
933  // just CHECK_RHS_BOUNDARY
934  rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0);
935  rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1);
936  } else if (rhs_vert + 2 < k_size) {
937  // just CHECK_RHS_BOUNDARY
938  rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
939  rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
940  rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
941  rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
942  rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
943  rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
944  } else if (rhs_vert + 1 < k_size) {
945  rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
946  rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
947  rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
948  rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
949  } else if (rhs_vert < k_size) {
950  rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
951  rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
952  }
953  } else if (rhs_horiz0 < n_size) {
954  if ((rhs_vert + 3) < k_size) {
955  // just CHECK_RHS_BOUNDARY
956  rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0);
957  } else if ((rhs_vert + 2) < k_size) {
958  // just CHECK_RHS_BOUNDARY
959  rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
960  rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
961  rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
962  } else if ((rhs_vert + 1) < k_size) {
963  rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
964  rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
965  } else if (rhs_vert < k_size) {
966  rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
967  }
968  }
969  }
970  __syncthreads();
971  // Loaded. Do computation
972  // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1.
973  // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3.
974  // ..
975  // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63
976  rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x);
977  // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1.
978  // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3.
979  // ..
980  rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y);
981  // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1.
982  // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3.
983  rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z);
984  // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1.
985  // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3.
986  rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w);
987 
988  // LHS.
989  // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125)
990  // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125)
991  // ...
992  // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127)
993  // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127)
994 
995 
996 #define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\
997  results[0].x += a_feat1.x * f1.x;\
998  results[1].x += a_feat1.x * f1.y;\
999  results[2].x += a_feat1.x * f2.x;\
1000  results[3].x += a_feat1.x * f2.y;\
1001  results[4].x += a_feat1.x * f3.x;\
1002  results[5].x += a_feat1.x * f3.y;\
1003  results[6].x += a_feat1.x * f4.x;\
1004  results[7].x += a_feat1.x * f4.y;\
1005 \
1006  results[0].y += a_feat1.y * f1.x;\
1007  results[1].y += a_feat1.y * f1.y;\
1008  results[2].y += a_feat1.y * f2.x;\
1009  results[3].y += a_feat1.y * f2.y;\
1010  results[4].y += a_feat1.y * f3.x;\
1011  results[5].y += a_feat1.y * f3.y;\
1012  results[6].y += a_feat1.y * f4.x;\
1013  results[7].y += a_feat1.y * f4.y;\
1014 \
1015  results[0].z += a_feat2.x * f1.x;\
1016  results[1].z += a_feat2.x * f1.y;\
1017  results[2].z += a_feat2.x * f2.x;\
1018  results[3].z += a_feat2.x * f2.y;\
1019  results[4].z += a_feat2.x * f3.x;\
1020  results[5].z += a_feat2.x * f3.y;\
1021  results[6].z += a_feat2.x * f4.x;\
1022  results[7].z += a_feat2.x * f4.y;\
1023 \
1024  results[0].w += a_feat2.y * f1.x;\
1025  results[1].w += a_feat2.y * f1.y;\
1026  results[2].w += a_feat2.y * f2.x;\
1027  results[3].w += a_feat2.y * f2.y;\
1028  results[4].w += a_feat2.y * f3.x;\
1029  results[5].w += a_feat2.y * f3.y;\
1030  results[6].w += a_feat2.y * f4.x;\
1031  results[7].w += a_feat2.y * f4.y;\
1032 
1033  lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y);
1034  lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y);
1035  lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y);
1036  lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y);
1037 
1038  lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w);
1039  lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w);
1040  lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w);
1041  lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w);
1042 
1043  __syncthreads();
1044 
1045  // Do the multiplies.
1046  #pragma unroll
1047  for (int koff = 0; koff < 32; koff ++) {
1048  float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8];
1049  float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8];
1050 
1051  // the first feature is at (threadIdx.y/4) * 8; the last is at start + 7 (8 features per thread).
1052  int start_feature = (threadIdx.y / 4) * 8;
1053 
1054  float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4];
1055  float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4];
1056  float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4];
1057  float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4];
1058 
1059  add_vals(a3, a4, br1, br2, br3, br4)
1060  }
1061  __syncthreads();
1062  } // end loop over k
1063 
1064 
1065  __syncthreads();
1066  Index horiz_base = (threadIdx.y/4)*8+base_n;
1067  if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
1068  for (int i = 0; i < 8; i++) {
1069  output(lhs_vert, horiz_base + i) = results[i].x;
1070  output(lhs_vert + 1, horiz_base + i) = results[i].y;
1071  output(lhs_vert + 2, horiz_base + i) = results[i].z;
1072  output(lhs_vert + 3, horiz_base + i) = results[i].w;
1073  }
1074  } else if (!CHECK_RHS_BOUNDARY) {
1075  if (lhs_vert + 3 < m_size) {
1076  for (int i = 0; i < 8; i++) {
1077  output(lhs_vert, horiz_base + i) = results[i].x;
1078  output(lhs_vert + 1, horiz_base + i) = results[i].y;
1079  output(lhs_vert + 2, horiz_base + i) = results[i].z;
1080  output(lhs_vert + 3, horiz_base + i) = results[i].w;
1081  }
1082  } else if (lhs_vert + 2 < m_size) {
1083  for (int i = 0; i < 8; i++) {
1084  output(lhs_vert, horiz_base + i) = results[i].x;
1085  output(lhs_vert + 1, horiz_base + i) = results[i].y;
1086  output(lhs_vert + 2, horiz_base + i) = results[i].z;
1087  }
1088  } else if (lhs_vert + 1 < m_size) {
1089  for (int i = 0; i < 8; i++) {
1090  output(lhs_vert, horiz_base + i) = results[i].x;
1091  output(lhs_vert + 1, horiz_base + i) = results[i].y;
1092  }
1093  } else if (lhs_vert < m_size) {
1094  for (int i = 0; i < 8; i++) {
1095  output(lhs_vert, horiz_base + i) = results[i].x;
1096  }
1097  }
1098  } else if (!CHECK_LHS_BOUNDARY) {
1099  // CHECK RHS
1100  for (int i = 0; i < 8; i++) {
1101  if (horiz_base + i < n_size) {
1102  output(lhs_vert, horiz_base + i) = results[i].x;
1103  output(lhs_vert + 1, horiz_base + i) = results[i].y;
1104  output(lhs_vert + 2, horiz_base + i) = results[i].z;
1105  output(lhs_vert + 3, horiz_base + i) = results[i].w;
1106  }
1107  }
1108  } else {
1109  // CHECK both boundaries.
1110  for (int i = 0; i < 8; i++) {
1111  if (horiz_base + i < n_size) {
1112  if (lhs_vert < m_size)
1113  output(lhs_vert, horiz_base + i) = results[i].x;
1114  if (lhs_vert + 1 < m_size)
1115  output(lhs_vert + 1, horiz_base + i) = results[i].y;
1116  if (lhs_vert + 2 < m_size)
1117  output(lhs_vert + 2, horiz_base + i) = results[i].z;
1118  if (lhs_vert + 3 < m_size)
1119  output(lhs_vert + 3, horiz_base + i) = results[i].w;
1120  }
1121  }
1122  }
1123 }
1124 
1125 
1126 template<typename Index, typename LhsMapper,
1127  typename RhsMapper, typename OutputMapper>
1128 __global__ void
1129 __launch_bounds__(256)
1130 EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
1131  const OutputMapper output,
1132  const Index m_size, const Index n_size, const Index k_size) {
1133  __shared__ float2 lhs_shmem[64*32];
1134  __shared__ float2 rhs_shmem[128*8];
1135 
1136  typedef float2 LHS_MEM[64][32];
1137  typedef float2 RHS_MEM[128][8];
1138 
1139  typedef float2 LHS_MEM16x16[32][16];
1140  typedef float2 RHS_MEM16x16[64][8];
1141 
1142  const Index m_block_idx = blockIdx.x;
1143  const Index n_block_idx = blockIdx.y;
1144 
1145  const Index base_m = 128 * m_block_idx;
1146  const Index base_n = 64 * n_block_idx;
1147 
1148  bool check_rhs = (base_n + 63) >= n_size;
1149  bool check_lhs128 = (base_m + 127) >= m_size;
1150  bool check_lhs64 = (base_m + 63) >= m_size;
1151 
1152  if (!check_rhs) {
1153  if (!check_lhs128) {
1154  // >= 128 rows left
1155  EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(
1156  lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
1157  } else {
1158  EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(
1159  lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
1160  }
1161  } else {
1162  if (!check_lhs128) {
1163  // >= 128 rows left
1164  EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(
1165  lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
1166  } else {
1167  EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(
1168  lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
1169  }
1170  }
1171 }
1172 
1173 template<typename Index, typename LhsMapper,
1174  typename RhsMapper, typename OutputMapper>
1175 __global__ void
1176 __launch_bounds__(256)
1177 EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs,
1178  const OutputMapper output,
1179  const Index m_size, const Index n_size, const Index k_size) {
1180  __shared__ float2 lhs_shmem[32][16];
1181  __shared__ float2 rhs_shmem[64][8];
1182 
1183  const Index m_block_idx = blockIdx.x;
1184  const Index n_block_idx = blockIdx.y;
1185 
1186  const Index base_m = 64 * m_block_idx;
1187  const Index base_n = 64 * n_block_idx;
1188 
1189  if (base_m + 63 < m_size) {
1190  if (base_n + 63 < n_size) {
1191  EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
1192  } else {
1193  EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
1194  }
1195  } else {
1196  if (base_n + 63 < n_size) {
1197  EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
1198  } else {
1199  EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
1200  }
1201  }
1202 }
1203 
1204 
1205 template<typename Indices, typename LeftArgType, typename RightArgType>
1206 struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> :
1207  public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> > {
1208 
1209  typedef GpuDevice Device;
1210 
1211  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
1212  typedef TensorContractionEvaluatorBase<Self> Base;
1213 
1214  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
1215  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
1216  typedef typename XprType::Packet Packet;
1217  typedef typename XprType::Index Index;
1218  typedef typename XprType::CoeffReturnType CoeffReturnType;
1219  typedef typename XprType::PacketReturnType PacketReturnType;
1220 
1221  enum {
1222  Layout = TensorEvaluator<LeftArgType, Device>::Layout,
1223  };
1224 
1225  // Most of the code is assuming that both input tensors are ColMajor. If the
1226  // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
1227  // If we want to compute A * B = C, where A is LHS and B is RHS, the code
1228  // will pretend B is LHS and A is RHS.
1229  typedef typename internal::conditional<
1230  Layout == ColMajor, LeftArgType, RightArgType>::type EvalLeftArgType;
1231  typedef typename internal::conditional<
1232  Layout == ColMajor, RightArgType, LeftArgType>::type EvalRightArgType;
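  // Rationale, as a sketch: a RowMajor result C = A * B stored in a buffer holds the same
  // bytes as the ColMajor matrix C^T = B^T * A^T, so swapping which argument plays the
  // role of LHS lets the ColMajor kernels above be reused unchanged for RowMajor inputs.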
1233 
1234  static const int LDims =
1235  internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
1236  static const int RDims =
1237  internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
1238  static const int ContractDims = internal::array_size<Indices>::value;
1239 
1240  typedef array<Index, LDims> left_dim_mapper_t;
1241  typedef array<Index, RDims> right_dim_mapper_t;
1242 
1243  typedef array<Index, ContractDims> contract_t;
1244  typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t;
1245  typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t;
1246 
1247  static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size;
1248 
1249  typedef DSizes<Index, NumDims> Dimensions;
1250 
1251  // typedefs needed in evalTo
1252  typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
1253  typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
1254 
1255  typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
1256  typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
1257 
1258  typedef typename LeftEvaluator::Dimensions LeftDimensions;
1259  typedef typename RightEvaluator::Dimensions RightDimensions;
1260 
1261  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
1262  Base(op, device) {}
1263 
1264  // We need to redefine this method to make nvcc happy
1265  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
1266  this->m_leftImpl.evalSubExprsIfNeeded(NULL);
1267  this->m_rightImpl.evalSubExprsIfNeeded(NULL);
1268  if (data) {
1269  evalTo(data);
1270  return false;
1271  } else {
1272  this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
1273  evalTo(this->m_result);
1274  return true;
1275  }
1276  }
1277 
1278  void evalTo(Scalar* buffer) const {
1279  if (this->m_lhs_inner_dim_contiguous) {
1280  if (this->m_rhs_inner_dim_contiguous) {
1281  if (this->m_rhs_inner_dim_reordered) {
1282  evalTyped<true, true, true, Unaligned>(buffer);
1283  }
1284  else {
1285  evalTyped<true, true, false, Unaligned>(buffer);
1286  }
1287  }
1288  else {
1289  if (this->m_rhs_inner_dim_reordered) {
1290  evalTyped<true, false, true, Unaligned>(buffer);
1291  }
1292  else {
1293  evalTyped<true, false, false, Unaligned>(buffer);
1294  }
1295  }
1296  }
1297  else {
1298  if (this->m_rhs_inner_dim_contiguous) {
1299  if (this->m_rhs_inner_dim_reordered) {
1300  evalTyped<false, true, true, Unaligned>(buffer);
1301  }
1302  else {
1303  evalTyped<false, true, false, Unaligned>(buffer);
1304  }
1305  }
1306  else {
1307  if (this->m_rhs_inner_dim_reordered) {
1308  evalTyped<false, false, true, Unaligned>(buffer);
1309  }
1310  else {
1311  evalTyped<false, false, false, Unaligned>(buffer);
1312  }
1313  }
1314  }
1315  }
1316 
1317  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
1318  void evalTyped(Scalar* buffer) const {
1319  // columns in left side, rows in right side
1320  const Index k = this->m_k_size;
1321 
1322  // rows in left side
1323  const Index m = this->m_i_size;
1324 
1325  // columns in right side
1326  const Index n = this->m_j_size;
1327 
1328  // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
1329  this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
1330 
1331  typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
1332  LeftEvaluator, left_nocontract_t,
1333  contract_t, 4,
1334  lhs_inner_dim_contiguous,
1335  false, Unaligned> LhsMapper;
1336 
1337  typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
1338  RightEvaluator, right_nocontract_t,
1339  contract_t, 4,
1340  rhs_inner_dim_contiguous,
1341  rhs_inner_dim_reordered, Unaligned> RhsMapper;
1342 
1343  typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
1344 
1345 
1346  // initialize data mappers
1347  LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
1348  this->m_left_contracting_strides, this->m_k_strides);
1349 
1350  RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
1351  this->m_right_contracting_strides, this->m_k_strides);
1352 
1353  OutputMapper output(buffer, m);
1354 
1355  setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte);
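  // Note: eight-byte bank mode is requested because the float kernels stage data as float2
  // (and the generic kernel may run on 8-byte scalars such as double). This is only a
  // performance hint; on architectures with a fixed bank width it is effectively a no-op.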
1356  if (internal::is_same<LhsScalar, float>::value &&
1357  internal::is_same<RhsScalar, float>::value) {
1358  if (m < 768 || n < 768) {
1359  const Index m_blocks = (m + 63) / 64;
1360  const Index n_blocks = (n + 63) / 64;
1361  const dim3 num_blocks(m_blocks, n_blocks, 1);
1362  const dim3 block_size(16, 16, 1);
1363  LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k);
1364  } else {
1365  const Index m_blocks = (m + 127) / 128;
1366  const Index n_blocks = (n + 63) / 64;
1367  const dim3 num_blocks(m_blocks, n_blocks, 1);
1368  const dim3 block_size(8, 32, 1);
1369  LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k);
1370  }
1371  } else {
1372  const Index m_blocks = (m + 63) / 64;
1373  const Index n_blocks = (n + 63) / 64;
1374  const dim3 num_blocks(m_blocks, n_blocks, 1);
1375  const dim3 block_size(8, 8, 8);
1376  LAUNCH_CUDA_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, this->m_device, lhs, rhs, output, m, n, k);
1377  }
1378  }
1379 };
1380 
1381 } // end namespace Eigen
1382 
1383 #endif // EIGEN_USE_GPU and __CUDACC__
1384 #endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H