ViennaCL - The Vienna Computing Library 1.5.2
matrix_operations_prod.hpp
1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_PROD_HPP_
2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_PROD_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2014, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the PDF manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
 #include "viennacl/forwards.h"   // for vcl_size_t (the file's remaining includes are not shown in this listing)
28 namespace viennacl
29 {
30  namespace linalg
31  {
32  namespace cuda
33  {
34 
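 // All kernels in this file compute C = alpha * op(A) * op(B) + beta * C for dense
 // matrices, one kernel per combination of operand storage layouts (row-/column-major)
 // and transposition flags. Each operand is passed as a strided sub-matrix view:
 // *_row_start / *_col_start give the offset of the first element, *_row_inc / *_col_inc
 // the strides between used rows and columns, *_row_size / *_col_size the logical extent,
 // and *_internal_rows / *_internal_cols the padded dimensions of the underlying buffer.
 // Every kernel expects a 16 x 16 thread block and stages 16 x 16 tiles of A and B in
 // shared memory.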
35  // matrix-matrix multiplication C = A * B
36  // matrix layouts: C...col_major, A...col_major, B...col_major
37  template <typename T>
38  __global__ void matrix_matrix_col_col_col_prod_AA_kernel(   // kernel name assumed from the C/A/B layout and transpose tags in the comment above; the original identifier is not preserved in this listing
39  T alpha,
40  const T * A,
41  unsigned int A_row_start,
42  unsigned int A_col_start,
43  unsigned int A_row_inc,
44  unsigned int A_col_inc,
45  unsigned int A_row_size,
46  unsigned int A_col_size,
47  unsigned int A_internal_rows,
48  unsigned int A_internal_cols,
49  const T * B,
50  unsigned int B_row_start,
51  unsigned int B_col_start,
52  unsigned int B_row_inc,
53  unsigned int B_col_inc,
54  unsigned int B_row_size,
55  unsigned int B_col_size,
56  unsigned int B_internal_rows,
57  unsigned int B_internal_cols,
58  T beta,
59  T * C,
60  unsigned int C_row_start,
61  unsigned int C_col_start,
62  unsigned int C_row_inc,
63  unsigned int C_col_inc,
64  unsigned int C_row_size,
65  unsigned int C_col_size,
66  unsigned int C_internal_rows,
67  unsigned int C_internal_cols)
68  {
69 
70  __shared__ T bufA[272];   // 16 x 17 entries: each 16-element tile row is padded by one entry (stride block_size + 1) to avoid shared-memory bank conflicts
71  __shared__ T bufB[272];
72 
73  vcl_size_t block_size = 16; // fixed block size (OpenCL counterpart: get_local_size(0))
74  vcl_size_t row_block_id = blockIdx.x;
75  vcl_size_t col_block_id = blockIdx.y;
76  vcl_size_t row_thread_id = threadIdx.x;
77  vcl_size_t col_thread_id = threadIdx.y;
78  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
79  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
80  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
81  vcl_size_t bStep = block_size * B_row_inc;
82  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
83  T Csub = 0;
84  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
85  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
86 
87  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
88  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
89  for (vcl_size_t block = 0;
90  block < block_num;
91  ++block)
92  {
93  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
94  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
95  __syncthreads();
96  T * bufAptr = bufA + row_thread_id_times_block_size;
97  T * bufBptr = bufB + col_thread_id_times_block_size;
98  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
99  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
100  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
101  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
102  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
103  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
104  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
105  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
106  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
107  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
108  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
109  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
110  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
111  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
112  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
113  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
114  __syncthreads();
115  aBegin += aStep;
116  bBegin += bStep;
117  }
118  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
119  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
120  }
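 The host-side wrappers that configure and launch these kernels live elsewhere in ViennaCL and are not part of this file. As a minimal sketch of how a kernel with the signature above could be launched for C = alpha * A * B + beta * C on plain column-major device buffers (the wrapper name launch_prod_AA is illustrative, and the kernel name follows the assumption noted at the signature):

 // Illustrative host-side sketch only -- not ViennaCL's actual dispatch code.
 // One 16 x 16 thread block computes one 16 x 16 tile of C, so the grid covers
 // the number of tiles in each direction, rounded up.
 template <typename NumericT>
 void launch_prod_AA(NumericT alpha, const NumericT *A, unsigned int m, unsigned int k, unsigned int lda,
                     const NumericT *B, unsigned int n, unsigned int ldb,
                     NumericT beta, NumericT *C, unsigned int ldc)
 {
   dim3 threads(16, 16);                       // must match block_size = 16 used inside the kernel
   dim3 grid((m + 15) / 16, (n + 15) / 16);
   matrix_matrix_col_col_col_prod_AA_kernel<<<grid, threads>>>(
       alpha,
       A, 0, 0, 1, 1, m, k, lda, k,            // A is m x k, column-major, leading dimension lda
       B, 0, 0, 1, 1, k, n, ldb, n,            // B is k x n, leading dimension ldb
       beta,
       C, 0, 0, 1, 1, m, n, ldc, n);           // C is m x n, leading dimension ldc
 }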
121 
122  // matrix-matrix multiplication C = A * B^T
123  // matrix layouts: C...col_major, A...col_major, B...col_major
124  template <typename T>
125  __global__ void matrix_matrix_col_col_col_prod_AT_kernel(   // kernel name assumed, see note at the first kernel
126  T alpha,
127  const T * A,
128  unsigned int A_row_start,
129  unsigned int A_col_start,
130  unsigned int A_row_inc,
131  unsigned int A_col_inc,
132  unsigned int A_row_size,
133  unsigned int A_col_size,
134  unsigned int A_internal_rows,
135  unsigned int A_internal_cols,
136  const T * B,
137  unsigned int B_row_start,
138  unsigned int B_col_start,
139  unsigned int B_row_inc,
140  unsigned int B_col_inc,
141  unsigned int B_row_size,
142  unsigned int B_col_size,
143  unsigned int B_internal_rows,
144  unsigned int B_internal_cols,
145  T beta,
146  T * C,
147  unsigned int C_row_start,
148  unsigned int C_col_start,
149  unsigned int C_row_inc,
150  unsigned int C_col_inc,
151  unsigned int C_row_size,
152  unsigned int C_col_size,
153  unsigned int C_internal_rows,
154  unsigned int C_internal_cols)
155  {
156 
157  __shared__ T bufA[272];
158  __shared__ T bufB[272];
159 
160  vcl_size_t block_size = 16;//get_local_size(0);
161  vcl_size_t row_block_id = blockIdx.x;
162  vcl_size_t col_block_id = blockIdx.y;
163  vcl_size_t row_thread_id = threadIdx.x;
164  vcl_size_t col_thread_id = threadIdx.y;
165  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
166  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
167  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
168  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
169  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
170  T Csub = 0;
171  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
172  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
173 
174  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
175  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
176  for (vcl_size_t block = 0;
177  block < block_num;
178  ++block)
179  {
180  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
181  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
182  __syncthreads();
183  T * bufAptr = bufA + row_thread_id_times_block_size;
184  T * bufBptr = bufB + col_thread_id_times_block_size;
185  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
186  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
187  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
188  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
189  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
190  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
191  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
192  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
193  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
194  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
195  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
196  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
197  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
198  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
199  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
200  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
201  __syncthreads();
202  aBegin += aStep;
203  bBegin += bStep;
204  }
205  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
206  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
207  }
208 
209  // matrix-matrix multiplication C = A^T * B
210  // matrix layouts: C...col_major, A...col_major, B...col_major
211  template <typename T>
212  __global__ void matrix_matrix_col_col_col_prod_TA_kernel(   // kernel name assumed, see note at the first kernel
213  T alpha,
214  const T * A,
215  unsigned int A_row_start,
216  unsigned int A_col_start,
217  unsigned int A_row_inc,
218  unsigned int A_col_inc,
219  unsigned int A_row_size,
220  unsigned int A_col_size,
221  unsigned int A_internal_rows,
222  unsigned int A_internal_cols,
223  const T * B,
224  unsigned int B_row_start,
225  unsigned int B_col_start,
226  unsigned int B_row_inc,
227  unsigned int B_col_inc,
228  unsigned int B_row_size,
229  unsigned int B_col_size,
230  unsigned int B_internal_rows,
231  unsigned int B_internal_cols,
232  T beta,
233  T * C,
234  unsigned int C_row_start,
235  unsigned int C_col_start,
236  unsigned int C_row_inc,
237  unsigned int C_col_inc,
238  unsigned int C_row_size,
239  unsigned int C_col_size,
240  unsigned int C_internal_rows,
241  unsigned int C_internal_cols)
242  {
243 
244  __shared__ T bufA[272];
245  __shared__ T bufB[272];
246 
247  vcl_size_t block_size = 16;//get_local_size(0);
248  vcl_size_t row_block_id = blockIdx.x;
249  vcl_size_t col_block_id = blockIdx.y;
250  vcl_size_t row_thread_id = threadIdx.x;
251  vcl_size_t col_thread_id = threadIdx.y;
252  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
253  vcl_size_t aStep = block_size * A_row_inc;
254  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
255  vcl_size_t bStep = block_size * B_row_inc;
256  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
257  T Csub = 0;
258  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
259  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
260 
261  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
262  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
263  for (vcl_size_t block = 0;
264  block < block_num;
265  ++block)
266  {
267  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
268  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
269  __syncthreads();
270  T * bufAptr = bufA + row_thread_id_times_block_size;
271  T * bufBptr = bufB + col_thread_id_times_block_size;
272  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
273  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
274  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
275  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
276  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
277  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
278  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
279  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
280  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
281  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
282  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
283  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
284  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
285  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
286  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
287  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
288  __syncthreads();
289  aBegin += aStep;
290  bBegin += bStep;
291  }
292  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
293  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
294  }
295 
296  // matrix-matrix multiplication C = A^T * B^T
297  // matrix layouts: C...col_major, A...col_major, B...col_major
298  template <typename T>
299  __global__ void matrix_matrix_col_col_col_prod_TT_kernel(   // kernel name assumed, see note at the first kernel
300  T alpha,
301  const T * A,
302  unsigned int A_row_start,
303  unsigned int A_col_start,
304  unsigned int A_row_inc,
305  unsigned int A_col_inc,
306  unsigned int A_row_size,
307  unsigned int A_col_size,
308  unsigned int A_internal_rows,
309  unsigned int A_internal_cols,
310  const T * B,
311  unsigned int B_row_start,
312  unsigned int B_col_start,
313  unsigned int B_row_inc,
314  unsigned int B_col_inc,
315  unsigned int B_row_size,
316  unsigned int B_col_size,
317  unsigned int B_internal_rows,
318  unsigned int B_internal_cols,
319  T beta,
320  T * C,
321  unsigned int C_row_start,
322  unsigned int C_col_start,
323  unsigned int C_row_inc,
324  unsigned int C_col_inc,
325  unsigned int C_row_size,
326  unsigned int C_col_size,
327  unsigned int C_internal_rows,
328  unsigned int C_internal_cols)
329  {
330 
331  __shared__ T bufA[272];
332  __shared__ T bufB[272];
333 
334  vcl_size_t block_size = 16;//get_local_size(0);
335  vcl_size_t row_block_id = blockIdx.x;
336  vcl_size_t col_block_id = blockIdx.y;
337  vcl_size_t row_thread_id = threadIdx.x;
338  vcl_size_t col_thread_id = threadIdx.y;
339  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
340  vcl_size_t aStep = block_size * A_row_inc;
341  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
342  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
343  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
344  T Csub = 0;
345  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
346  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
347 
348  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
349  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
350  for (vcl_size_t block = 0;
351  block < block_num;
352  ++block)
353  {
354  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
355  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
356  __syncthreads();
357  T * bufAptr = bufA + row_thread_id_times_block_size;
358  T * bufBptr = bufB + col_thread_id_times_block_size;
359  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
360  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
361  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
362  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
363  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
364  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
365  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
366  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
367  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
368  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
369  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
370  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
371  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
372  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
373  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
374  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
375  __syncthreads();
376  aBegin += aStep;
377  bBegin += bStep;
378  }
379  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
380  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
381  }
382 
383 
384 
386 
387 
388 
389 
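 // The kernels below differ from the four above only in how the result element of C is
 // addressed: for a column-major C, element (i, j) lives at
 //   (i * C_row_inc + C_row_start) + (j * C_col_inc + C_col_start) * C_internal_rows,
 // whereas for the row-major C used below it lives at
 //   (i * C_row_inc + C_row_start) * C_internal_cols + (j * C_col_inc + C_col_start).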
390  // matrix-matrix multiplication C = A * B
391  // matrix layouts: C...row_major, A...col_major, B...col_major
392  template <typename T>
393  __global__ void matrix_matrix_row_col_col_prod_AA_kernel(   // kernel name assumed, see note at the first kernel
394  T alpha,
395  const T * A,
396  unsigned int A_row_start,
397  unsigned int A_col_start,
398  unsigned int A_row_inc,
399  unsigned int A_col_inc,
400  unsigned int A_row_size,
401  unsigned int A_col_size,
402  unsigned int A_internal_rows,
403  unsigned int A_internal_cols,
404  const T * B,
405  unsigned int B_row_start,
406  unsigned int B_col_start,
407  unsigned int B_row_inc,
408  unsigned int B_col_inc,
409  unsigned int B_row_size,
410  unsigned int B_col_size,
411  unsigned int B_internal_rows,
412  unsigned int B_internal_cols,
413  T beta,
414  T * C,
415  unsigned int C_row_start,
416  unsigned int C_col_start,
417  unsigned int C_row_inc,
418  unsigned int C_col_inc,
419  unsigned int C_row_size,
420  unsigned int C_col_size,
421  unsigned int C_internal_rows,
422  unsigned int C_internal_cols)
423  {
424 
425  __shared__ T bufA[272];
426  __shared__ T bufB[272];
427 
428  vcl_size_t block_size = 16;//get_local_size(0);
429  vcl_size_t row_block_id = blockIdx.x;
430  vcl_size_t col_block_id = blockIdx.y;
431  vcl_size_t row_thread_id = threadIdx.x;
432  vcl_size_t col_thread_id = threadIdx.y;
433  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
434  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
435  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
436  vcl_size_t bStep = block_size * B_row_inc;
437  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
438  T Csub = 0;
439  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
440  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
441 
442  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
443  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
444  for (vcl_size_t block = 0;
445  block < block_num;
446  ++block)
447  {
448  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
449  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
450  __syncthreads();
451  T * bufAptr = bufA + row_thread_id_times_block_size;
452  T * bufBptr = bufB + col_thread_id_times_block_size;
453  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
454  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
455  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
456  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
457  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
458  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
459  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
460  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
461  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
462  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
463  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
464  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
465  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
466  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
467  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
468  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
469  __syncthreads();
470  aBegin += aStep;
471  bBegin += bStep;
472  }
473  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
474  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
475  }
476 
477  // matrix-matrix multiplication C = A * B^T
478  // matrix layouts: C...row_major, A...col_major, B...col_major
479  template <typename T>
480  __global__ void matrix_matrix_row_col_col_prod_AT_kernel(   // kernel name assumed, see note at the first kernel
481  T alpha,
482  const T * A,
483  unsigned int A_row_start,
484  unsigned int A_col_start,
485  unsigned int A_row_inc,
486  unsigned int A_col_inc,
487  unsigned int A_row_size,
488  unsigned int A_col_size,
489  unsigned int A_internal_rows,
490  unsigned int A_internal_cols,
491  const T * B,
492  unsigned int B_row_start,
493  unsigned int B_col_start,
494  unsigned int B_row_inc,
495  unsigned int B_col_inc,
496  unsigned int B_row_size,
497  unsigned int B_col_size,
498  unsigned int B_internal_rows,
499  unsigned int B_internal_cols,
500  T beta,
501  T * C,
502  unsigned int C_row_start,
503  unsigned int C_col_start,
504  unsigned int C_row_inc,
505  unsigned int C_col_inc,
506  unsigned int C_row_size,
507  unsigned int C_col_size,
508  unsigned int C_internal_rows,
509  unsigned int C_internal_cols)
510  {
511 
512  __shared__ T bufA[272];
513  __shared__ T bufB[272];
514 
515  vcl_size_t block_size = 16;//get_local_size(0);
516  vcl_size_t row_block_id = blockIdx.x;
517  vcl_size_t col_block_id = blockIdx.y;
518  vcl_size_t row_thread_id = threadIdx.x;
519  vcl_size_t col_thread_id = threadIdx.y;
520  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
521  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
522  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
523  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
524  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
525  T Csub = 0;
526  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
527  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
528 
529  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
530  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
531  for (vcl_size_t block = 0;
532  block < block_num;
533  ++block)
534  {
535  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
536  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
537  __syncthreads();
538  T * bufAptr = bufA + row_thread_id_times_block_size;
539  T * bufBptr = bufB + col_thread_id_times_block_size;
540  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
541  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
542  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
543  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
544  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
545  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
546  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
547  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
548  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
549  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
550  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
551  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
552  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
553  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
554  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
555  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
556  __syncthreads();
557  aBegin += aStep;
558  bBegin += bStep;
559  }
560  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
561  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
562  }
563 
564  // matrix-matrix multiplication C = A^T * B
565  // matrix layouts: C...row_major, A...col_major, B...col_major
566  template <typename T>
567  __global__ void matrix_matrix_row_col_col_prod_TA_kernel(   // kernel name assumed, see note at the first kernel
568  T alpha,
569  const T * A,
570  unsigned int A_row_start,
571  unsigned int A_col_start,
572  unsigned int A_row_inc,
573  unsigned int A_col_inc,
574  unsigned int A_row_size,
575  unsigned int A_col_size,
576  unsigned int A_internal_rows,
577  unsigned int A_internal_cols,
578  const T * B,
579  unsigned int B_row_start,
580  unsigned int B_col_start,
581  unsigned int B_row_inc,
582  unsigned int B_col_inc,
583  unsigned int B_row_size,
584  unsigned int B_col_size,
585  unsigned int B_internal_rows,
586  unsigned int B_internal_cols,
587  T beta,
588  T * C,
589  unsigned int C_row_start,
590  unsigned int C_col_start,
591  unsigned int C_row_inc,
592  unsigned int C_col_inc,
593  unsigned int C_row_size,
594  unsigned int C_col_size,
595  unsigned int C_internal_rows,
596  unsigned int C_internal_cols)
597  {
598 
599  __shared__ T bufA[272];
600  __shared__ T bufB[272];
601 
602  vcl_size_t block_size = 16;//get_local_size(0);
603  vcl_size_t row_block_id = blockIdx.x;
604  vcl_size_t col_block_id = blockIdx.y;
605  vcl_size_t row_thread_id = threadIdx.x;
606  vcl_size_t col_thread_id = threadIdx.y;
607  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
608  vcl_size_t aStep = block_size * A_row_inc;
609  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
610  vcl_size_t bStep = block_size * B_row_inc;
611  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
612  T Csub = 0;
613  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
614  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
615 
616  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
617  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
618  for (vcl_size_t block = 0;
619  block < block_num;
620  ++block)
621  {
622  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
623  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
624  __syncthreads();
625  T * bufAptr = bufA + row_thread_id_times_block_size;
626  T * bufBptr = bufB + col_thread_id_times_block_size;
627  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
628  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
629  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
630  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
631  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
632  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
633  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
634  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
635  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
636  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
637  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
638  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
639  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
640  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
641  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
642  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
643  __syncthreads();
644  aBegin += aStep;
645  bBegin += bStep;
646  }
647  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
648  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
649  }
650 
651  // matrix-matrix multiplication C = A^T * B^T
652  // matrix layouts: C...row_major, A...col_major, B...col_major
653  template <typename T>
654  __global__ void matrix_matrix_row_col_col_prod_TT_kernel(   // kernel name assumed, see note at the first kernel
655  T alpha,
656  const T * A,
657  unsigned int A_row_start,
658  unsigned int A_col_start,
659  unsigned int A_row_inc,
660  unsigned int A_col_inc,
661  unsigned int A_row_size,
662  unsigned int A_col_size,
663  unsigned int A_internal_rows,
664  unsigned int A_internal_cols,
665  const T * B,
666  unsigned int B_row_start,
667  unsigned int B_col_start,
668  unsigned int B_row_inc,
669  unsigned int B_col_inc,
670  unsigned int B_row_size,
671  unsigned int B_col_size,
672  unsigned int B_internal_rows,
673  unsigned int B_internal_cols,
674  T beta,
675  T * C,
676  unsigned int C_row_start,
677  unsigned int C_col_start,
678  unsigned int C_row_inc,
679  unsigned int C_col_inc,
680  unsigned int C_row_size,
681  unsigned int C_col_size,
682  unsigned int C_internal_rows,
683  unsigned int C_internal_cols)
684  {
685 
686  __shared__ T bufA[272];
687  __shared__ T bufB[272];
688 
689  vcl_size_t block_size = 16;//get_local_size(0);
690  vcl_size_t row_block_id = blockIdx.x;
691  vcl_size_t col_block_id = blockIdx.y;
692  vcl_size_t row_thread_id = threadIdx.x;
693  vcl_size_t col_thread_id = threadIdx.y;
694  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
695  vcl_size_t aStep = block_size * A_row_inc;
696  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
697  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
698  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
699  T Csub = 0;
700  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
701  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
702 
703  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
704  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
705  for (vcl_size_t block = 0;
706  block < block_num;
707  ++block)
708  {
709  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
710  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
711  __syncthreads();
712  T * bufAptr = bufA + row_thread_id_times_block_size;
713  T * bufBptr = bufB + col_thread_id_times_block_size;
714  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
715  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
716  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
717  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
718  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
719  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
720  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
721  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
722  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
723  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
724  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
725  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
726  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
727  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
728  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
729  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
730  __syncthreads();
731  aBegin += aStep;
732  bBegin += bStep;
733  }
734  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
735  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
736  }
737 
738 
739 
740 
742 
743 
744 
745 
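 // In the kernels below, B is stored in row-major order, so element (i, j) of B is
 // addressed as (i * B_row_inc + B_row_start) * B_internal_cols + (j * B_col_inc + B_col_start);
 // bBegin, bStep and bOffset therefore use B_internal_cols as the leading stride instead
 // of B_internal_rows.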
746  // matrix-matrix multiplication C = A * B
747  // matrix layouts: C...col_major, A...col_major, B...row_major
748  template <typename T>
749  __global__ void matrix_matrix_col_col_row_prod_AA_kernel(   // kernel name assumed, see note at the first kernel
750  T alpha,
751  const T * A,
752  unsigned int A_row_start,
753  unsigned int A_col_start,
754  unsigned int A_row_inc,
755  unsigned int A_col_inc,
756  unsigned int A_row_size,
757  unsigned int A_col_size,
758  unsigned int A_internal_rows,
759  unsigned int A_internal_cols,
760  const T * B,
761  unsigned int B_row_start,
762  unsigned int B_col_start,
763  unsigned int B_row_inc,
764  unsigned int B_col_inc,
765  unsigned int B_row_size,
766  unsigned int B_col_size,
767  unsigned int B_internal_rows,
768  unsigned int B_internal_cols,
769  T beta,
770  T * C,
771  unsigned int C_row_start,
772  unsigned int C_col_start,
773  unsigned int C_row_inc,
774  unsigned int C_col_inc,
775  unsigned int C_row_size,
776  unsigned int C_col_size,
777  unsigned int C_internal_rows,
778  unsigned int C_internal_cols)
779  {
780 
781  __shared__ T bufA[272];
782  __shared__ T bufB[272];
783 
784  vcl_size_t block_size = 16;//get_local_size(0);
785  vcl_size_t row_block_id = blockIdx.x;
786  vcl_size_t col_block_id = blockIdx.y;
787  vcl_size_t row_thread_id = threadIdx.x;
788  vcl_size_t col_thread_id = threadIdx.y;
789  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
790  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
791  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
792  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
793  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
794  T Csub = 0;
795  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
796  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
797 
798  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
799  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
800  for (vcl_size_t block = 0;
801  block < block_num;
802  ++block)
803  {
804  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
805  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
806  __syncthreads();
807  T * bufAptr = bufA + row_thread_id_times_block_size;
808  T * bufBptr = bufB + col_thread_id_times_block_size;
809  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
810  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
811  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
812  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
813  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
814  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
815  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
816  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
817  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
818  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
819  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
820  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
821  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
822  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
823  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
824  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
825  __syncthreads();
826  aBegin += aStep;
827  bBegin += bStep;
828  }
829  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
830  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
831  }
832 
833  // matrix-matrix multiplication C = A * B^T
834  // matrix layouts: C...col_major, A...col_major, B...row_major
835  template <typename T>
836  __global__ void matrix_matrix_col_col_row_prod_AT_kernel(   // kernel name assumed, see note at the first kernel
837  T alpha,
838  const T * A,
839  unsigned int A_row_start,
840  unsigned int A_col_start,
841  unsigned int A_row_inc,
842  unsigned int A_col_inc,
843  unsigned int A_row_size,
844  unsigned int A_col_size,
845  unsigned int A_internal_rows,
846  unsigned int A_internal_cols,
847  const T * B,
848  unsigned int B_row_start,
849  unsigned int B_col_start,
850  unsigned int B_row_inc,
851  unsigned int B_col_inc,
852  unsigned int B_row_size,
853  unsigned int B_col_size,
854  unsigned int B_internal_rows,
855  unsigned int B_internal_cols,
856  T beta,
857  T * C,
858  unsigned int C_row_start,
859  unsigned int C_col_start,
860  unsigned int C_row_inc,
861  unsigned int C_col_inc,
862  unsigned int C_row_size,
863  unsigned int C_col_size,
864  unsigned int C_internal_rows,
865  unsigned int C_internal_cols)
866  {
867 
868  __shared__ T bufA[272];
869  __shared__ T bufB[272];
870 
871  vcl_size_t block_size = 16;//get_local_size(0);
872  vcl_size_t row_block_id = blockIdx.x;
873  vcl_size_t col_block_id = blockIdx.y;
874  vcl_size_t row_thread_id = threadIdx.x;
875  vcl_size_t col_thread_id = threadIdx.y;
876  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
877  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
878  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
879  vcl_size_t bStep = block_size * B_col_inc;
880  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
881  T Csub = 0;
882  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
883  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
884 
885  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
886  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
887  for (vcl_size_t block = 0;
888  block < block_num;
889  ++block)
890  {
891  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
892  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
893  __syncthreads();
894  T * bufAptr = bufA + row_thread_id_times_block_size;
895  T * bufBptr = bufB + col_thread_id_times_block_size;
896  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
897  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
898  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
899  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
900  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
901  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
902  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
903  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
904  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
905  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
906  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
907  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
908  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
909  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
910  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
911  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
912  __syncthreads();
913  aBegin += aStep;
914  bBegin += bStep;
915  }
916  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
917  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
918  }
919 
920  // matrix-matrix multiplication C = A^T * B
921  // matrix layouts: C...col_major, A...col_major, B...row_major
922  template <typename T>
923  __global__ void matrix_matrix_col_col_row_prod_TA_kernel(   // kernel name assumed, see note at the first kernel
924  T alpha,
925  const T * A,
926  unsigned int A_row_start,
927  unsigned int A_col_start,
928  unsigned int A_row_inc,
929  unsigned int A_col_inc,
930  unsigned int A_row_size,
931  unsigned int A_col_size,
932  unsigned int A_internal_rows,
933  unsigned int A_internal_cols,
934  const T * B,
935  unsigned int B_row_start,
936  unsigned int B_col_start,
937  unsigned int B_row_inc,
938  unsigned int B_col_inc,
939  unsigned int B_row_size,
940  unsigned int B_col_size,
941  unsigned int B_internal_rows,
942  unsigned int B_internal_cols,
943  T beta,
944  T * C,
945  unsigned int C_row_start,
946  unsigned int C_col_start,
947  unsigned int C_row_inc,
948  unsigned int C_col_inc,
949  unsigned int C_row_size,
950  unsigned int C_col_size,
951  unsigned int C_internal_rows,
952  unsigned int C_internal_cols)
953  {
954 
955  __shared__ T bufA[272];
956  __shared__ T bufB[272];
957 
958  vcl_size_t block_size = 16;//get_local_size(0);
959  vcl_size_t row_block_id = blockIdx.x;
960  vcl_size_t col_block_id = blockIdx.y;
961  vcl_size_t row_thread_id = threadIdx.x;
962  vcl_size_t col_thread_id = threadIdx.y;
963  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
964  vcl_size_t aStep = block_size * A_row_inc;
965  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
966  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
967  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
968  T Csub = 0;
969  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
970  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
971 
972  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
973  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
974  for (vcl_size_t block = 0;
975  block < block_num;
976  ++block)
977  {
978  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
979  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
980  __syncthreads();
981  T * bufAptr = bufA + row_thread_id_times_block_size;
982  T * bufBptr = bufB + col_thread_id_times_block_size;
983  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
984  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
985  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
986  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
987  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
988  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
989  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
990  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
991  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
992  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
993  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
994  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
995  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
996  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
997  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
998  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
999  __syncthreads();
1000  aBegin += aStep;
1001  bBegin += bStep;
1002  }
1003  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1004  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
1005  }
1006 
1007  // matrix-matrix multiplication C = A^T * B^T
1008  // matrix layouts: C...col_major, A...col_major, B...row_major
1009  template <typename T>
1010  __global__ void matrix_matrix_col_col_row_prod_TT_kernel(   // kernel name assumed, see note at the first kernel
1011  T alpha,
1012  const T * A,
1013  unsigned int A_row_start,
1014  unsigned int A_col_start,
1015  unsigned int A_row_inc,
1016  unsigned int A_col_inc,
1017  unsigned int A_row_size,
1018  unsigned int A_col_size,
1019  unsigned int A_internal_rows,
1020  unsigned int A_internal_cols,
1021  const T * B,
1022  unsigned int B_row_start,
1023  unsigned int B_col_start,
1024  unsigned int B_row_inc,
1025  unsigned int B_col_inc,
1026  unsigned int B_row_size,
1027  unsigned int B_col_size,
1028  unsigned int B_internal_rows,
1029  unsigned int B_internal_cols,
1030  T beta,
1031  T * C,
1032  unsigned int C_row_start,
1033  unsigned int C_col_start,
1034  unsigned int C_row_inc,
1035  unsigned int C_col_inc,
1036  unsigned int C_row_size,
1037  unsigned int C_col_size,
1038  unsigned int C_internal_rows,
1039  unsigned int C_internal_cols)
1040  {
1041 
1042  __shared__ T bufA[272];
1043  __shared__ T bufB[272];
1044 
1045  vcl_size_t block_size = 16;//get_local_size(0);
1046  vcl_size_t row_block_id = blockIdx.x;
1047  vcl_size_t col_block_id = blockIdx.y;
1048  vcl_size_t row_thread_id = threadIdx.x;
1049  vcl_size_t col_thread_id = threadIdx.y;
1050  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
1051  vcl_size_t aStep = block_size * A_row_inc;
1052  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
1053  vcl_size_t bStep = block_size * B_col_inc;
1054  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
1055  T Csub = 0;
1056  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
1057  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
1058 
1059  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1060  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1061  for (vcl_size_t block = 0;
1062  block < block_num;
1063  ++block)
1064  {
1065  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
1066  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
1067  __syncthreads();
1068  T * bufAptr = bufA + row_thread_id_times_block_size;
1069  T * bufBptr = bufB + col_thread_id_times_block_size;
1070  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1071  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1072  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1073  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1074  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1075  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1076  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1077  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1078  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1079  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1080  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1081  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1082  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1083  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1084  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1085  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1086  __syncthreads();
1087  aBegin += aStep;
1088  bBegin += bStep;
1089  }
1090  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
1091  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
1092  }
1093 
1094 
1095 
1097 
1098 
1099 
1100 
1101  // matrix-matrix multiplication C = A * B
1102  // matrix layouts: C...row_major, A...col_major, B...row_major
1103  template <typename T>
1104  __global__ void matrix_matrix_row_col_row_prod_AA_kernel(   // kernel name assumed, see note at the first kernel
1105  T alpha,
1106  const T * A,
1107  unsigned int A_row_start,
1108  unsigned int A_col_start,
1109  unsigned int A_row_inc,
1110  unsigned int A_col_inc,
1111  unsigned int A_row_size,
1112  unsigned int A_col_size,
1113  unsigned int A_internal_rows,
1114  unsigned int A_internal_cols,
1115  const T * B,
1116  unsigned int B_row_start,
1117  unsigned int B_col_start,
1118  unsigned int B_row_inc,
1119  unsigned int B_col_inc,
1120  unsigned int B_row_size,
1121  unsigned int B_col_size,
1122  unsigned int B_internal_rows,
1123  unsigned int B_internal_cols,
1124  T beta,
1125  T * C,
1126  unsigned int C_row_start,
1127  unsigned int C_col_start,
1128  unsigned int C_row_inc,
1129  unsigned int C_col_inc,
1130  unsigned int C_row_size,
1131  unsigned int C_col_size,
1132  unsigned int C_internal_rows,
1133  unsigned int C_internal_cols)
1134  {
1135 
1136  __shared__ T bufA[272];
1137  __shared__ T bufB[272];
1138 
1139  vcl_size_t block_size = 16;//get_local_size(0);
1140  vcl_size_t row_block_id = blockIdx.x;
1141  vcl_size_t col_block_id = blockIdx.y;
1142  vcl_size_t row_thread_id = threadIdx.x;
1143  vcl_size_t col_thread_id = threadIdx.y;
1144  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
1145  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
1146  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
1147  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
1148  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
1149  T Csub = 0;
1150  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
1151  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
1152 
1153  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1154  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1155  for (vcl_size_t block = 0;
1156  block < block_num;
1157  ++block)
1158  {
1159  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
1160  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
1161  __syncthreads();
1162  T * bufAptr = bufA + row_thread_id_times_block_size;
1163  T * bufBptr = bufB + col_thread_id_times_block_size;
1164  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1165  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1166  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1167  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1168  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1169  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1170  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1171  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1172  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1173  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1174  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1175  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1176  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1177  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1178  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1179  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1180  __syncthreads();
1181  aBegin += aStep;
1182  bBegin += bStep;
1183  }
1184  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1185  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
1186  }
1187 
1188  // matrix-matrix multiplication C = A * B^T
1189  // matrix layouts: C...row_major, A...col_major, B...row_major
1190  template <typename T>
1191  __global__ void matrix_matrix_row_col_row_prod_AT_kernel(   // kernel name assumed, see note at the first kernel
1192  T alpha,
1193  const T * A,
1194  unsigned int A_row_start,
1195  unsigned int A_col_start,
1196  unsigned int A_row_inc,
1197  unsigned int A_col_inc,
1198  unsigned int A_row_size,
1199  unsigned int A_col_size,
1200  unsigned int A_internal_rows,
1201  unsigned int A_internal_cols,
1202  const T * B,
1203  unsigned int B_row_start,
1204  unsigned int B_col_start,
1205  unsigned int B_row_inc,
1206  unsigned int B_col_inc,
1207  unsigned int B_row_size,
1208  unsigned int B_col_size,
1209  unsigned int B_internal_rows,
1210  unsigned int B_internal_cols,
1211  T beta,
1212  T * C,
1213  unsigned int C_row_start,
1214  unsigned int C_col_start,
1215  unsigned int C_row_inc,
1216  unsigned int C_col_inc,
1217  unsigned int C_row_size,
1218  unsigned int C_col_size,
1219  unsigned int C_internal_rows,
1220  unsigned int C_internal_cols)
1221  {
1222 
1223  __shared__ T bufA[272];
1224  __shared__ T bufB[272];
1225 
1226  vcl_size_t block_size = 16;//get_local_size(0);
1227  vcl_size_t row_block_id = blockIdx.x;
1228  vcl_size_t col_block_id = blockIdx.y;
1229  vcl_size_t row_thread_id = threadIdx.x;
1230  vcl_size_t col_thread_id = threadIdx.y;
1231  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
1232  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
1233  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
1234  vcl_size_t bStep = block_size * B_col_inc;
1235  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
1236  T Csub = 0;
1237  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
1238  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
1239 
1240  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1241  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1242  for (vcl_size_t block = 0;
1243  block < block_num;
1244  ++block)
1245  {
1246  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
1247  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
1248  __syncthreads();
1249  T * bufAptr = bufA + row_thread_id_times_block_size;
1250  T * bufBptr = bufB + col_thread_id_times_block_size;
1251  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1252  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1253  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1254  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1255  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1256  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1257  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1258  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1259  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1260  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1261  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1262  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1263  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1264  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1265  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1266  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1267  __syncthreads();
1268  aBegin += aStep;
1269  bBegin += bStep;
1270  }
1271  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
1272  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
1273  }
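      // The recurring constant 272 is simply the padded tile size used by bufA and bufB:
      //
      //   block_size * (block_size + 1) == 16 * 17 == 272
      //
      // and row_thread_id_times_block_size / col_thread_id_times_block_size are the matching
      // 17-element row strides into those buffers. The extra "+ 1" column is presumably the
      // usual padding trick for spreading one tile row across distinct shared-memory banks,
      // so that the unrolled dot product reads without bank conflicts.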
1274 
1275  // matrix-matrix multiplication C = A^T * B
1276  // matrix layouts: C...row_major, A...col_major, B...row_major
1277  template <typename T>
1279  T alpha,
1280  const T * A,
1281  unsigned int A_row_start,
1282  unsigned int A_col_start,
1283  unsigned int A_row_inc,
1284  unsigned int A_col_inc,
1285  unsigned int A_row_size,
1286  unsigned int A_col_size,
1287  unsigned int A_internal_rows,
1288  unsigned int A_internal_cols,
1289  const T * B,
1290  unsigned int B_row_start,
1291  unsigned int B_col_start,
1292  unsigned int B_row_inc,
1293  unsigned int B_col_inc,
1294  unsigned int B_row_size,
1295  unsigned int B_col_size,
1296  unsigned int B_internal_rows,
1297  unsigned int B_internal_cols,
1298  T beta,
1299  T * C,
1300  unsigned int C_row_start,
1301  unsigned int C_col_start,
1302  unsigned int C_row_inc,
1303  unsigned int C_col_inc,
1304  unsigned int C_row_size,
1305  unsigned int C_col_size,
1306  unsigned int C_internal_rows,
1307  unsigned int C_internal_cols)
1308  {
1309 
1310  __shared__ T bufA[272];
1311  __shared__ T bufB[272];
1312 
1313  vcl_size_t block_size = 16;//get_local_size(0);
1314  vcl_size_t row_block_id = blockIdx.x;
1315  vcl_size_t col_block_id = blockIdx.y;
1316  vcl_size_t row_thread_id = threadIdx.x;
1317  vcl_size_t col_thread_id = threadIdx.y;
1318  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
1319  vcl_size_t aStep = block_size * A_row_inc;
1320  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
1321  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
1322  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
1323  T Csub = 0;
1324  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
1325  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
1326 
1327  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1328  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1329  for (vcl_size_t block = 0;
1330  block < block_num;
1331  ++block)
1332  {
1333  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
1334  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
1335  __syncthreads();
1336  T * bufAptr = bufA + row_thread_id_times_block_size;
1337  T * bufBptr = bufB + col_thread_id_times_block_size;
1338  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1339  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1340  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1341  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1342  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1343  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1344  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1345  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1346  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1347  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1348  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1349  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1350  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1351  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1352  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1353  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1354  __syncthreads();
1355  aBegin += aStep;
1356  bBegin += bStep;
1357  }
1358  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1359  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
1360  }
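      // The sixteen unrolled accumulation statements used in every kernel of this file are
      // just a manually unrolled dot product over one shared-memory tile; written as a loop,
      // each block iteration computes
      //
      //   for (vcl_size_t k = 0; k < block_size; ++k)
      //     Csub += bufAptr[k] * bufBptr[k];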
1361 
1362  // matrix-matrix multiplication C = A^T * B^T
1363  // matrix layouts: C...row_major, A...col_major, B...row_major
1364  template <typename T>
1366  T alpha,
1367  const T * A,
1368  unsigned int A_row_start,
1369  unsigned int A_col_start,
1370  unsigned int A_row_inc,
1371  unsigned int A_col_inc,
1372  unsigned int A_row_size,
1373  unsigned int A_col_size,
1374  unsigned int A_internal_rows,
1375  unsigned int A_internal_cols,
1376  const T * B,
1377  unsigned int B_row_start,
1378  unsigned int B_col_start,
1379  unsigned int B_row_inc,
1380  unsigned int B_col_inc,
1381  unsigned int B_row_size,
1382  unsigned int B_col_size,
1383  unsigned int B_internal_rows,
1384  unsigned int B_internal_cols,
1385  T beta,
1386  T * C,
1387  unsigned int C_row_start,
1388  unsigned int C_col_start,
1389  unsigned int C_row_inc,
1390  unsigned int C_col_inc,
1391  unsigned int C_row_size,
1392  unsigned int C_col_size,
1393  unsigned int C_internal_rows,
1394  unsigned int C_internal_cols)
1395  {
1396 
1397  __shared__ T bufA[272];
1398  __shared__ T bufB[272];
1399 
1400  vcl_size_t block_size = 16;//get_local_size(0);
1401  vcl_size_t row_block_id = blockIdx.x;
1402  vcl_size_t col_block_id = blockIdx.y;
1403  vcl_size_t row_thread_id = threadIdx.x;
1404  vcl_size_t col_thread_id = threadIdx.y;
1405  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
1406  vcl_size_t aStep = block_size * A_row_inc;
1407  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
1408  vcl_size_t bStep = block_size * B_col_inc;
1409  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
1410  T Csub = 0;
1411  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
1412  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
1413 
1414  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1415  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1416  for (vcl_size_t block = 0;
1417  block < block_num;
1418  ++block)
1419  {
1420  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
1421  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
1422  __syncthreads();
1423  T * bufAptr = bufA + row_thread_id_times_block_size;
1424  T * bufBptr = bufB + col_thread_id_times_block_size;
1425  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1426  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1427  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1428  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1429  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1430  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1431  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1432  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1433  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1434  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1435  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1436  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1437  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1438  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1439  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1440  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1441  __syncthreads();
1442  aBegin += aStep;
1443  bBegin += bStep;
1444  }
1445  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
1446  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
1447  }
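      // Each kernel ends with the same guarded epilogue: threads mapped outside the result
      // matrix write nothing, and in-range threads perform C = alpha * op(A) * op(B) + beta * C.
      // The branch on beta avoids ever reading C when beta == 0, so an uninitialized result
      // buffer is handled correctly; schematically:
      //
      //   if (beta == 0)
      //     C[idx] = alpha * Csub;
      //   else
      //     C[idx] = alpha * Csub + beta * C[idx];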
1448 
1449 
1450 
1451 
1452 
1454 
1455 
1456 
1457 
1458 
1459 
1460  // matrix-matrix multiplication C = A * B
1461  // matrix layouts: C...col_major, A...row_major, B...col_major
1462  template <typename T>
1464  T alpha,
1465  const T * A,
1466  unsigned int A_row_start,
1467  unsigned int A_col_start,
1468  unsigned int A_row_inc,
1469  unsigned int A_col_inc,
1470  unsigned int A_row_size,
1471  unsigned int A_col_size,
1472  unsigned int A_internal_rows,
1473  unsigned int A_internal_cols,
1474  const T * B,
1475  unsigned int B_row_start,
1476  unsigned int B_col_start,
1477  unsigned int B_row_inc,
1478  unsigned int B_col_inc,
1479  unsigned int B_row_size,
1480  unsigned int B_col_size,
1481  unsigned int B_internal_rows,
1482  unsigned int B_internal_cols,
1483  T beta,
1484  T * C,
1485  unsigned int C_row_start,
1486  unsigned int C_col_start,
1487  unsigned int C_row_inc,
1488  unsigned int C_col_inc,
1489  unsigned int C_row_size,
1490  unsigned int C_col_size,
1491  unsigned int C_internal_rows,
1492  unsigned int C_internal_cols)
1493  {
1494 
1495  __shared__ T bufA[272];
1496  __shared__ T bufB[272];
1497 
1498  vcl_size_t block_size = 16;//get_local_size(0);
1499  vcl_size_t row_block_id = blockIdx.x;
1500  vcl_size_t col_block_id = blockIdx.y;
1501  vcl_size_t row_thread_id = threadIdx.x;
1502  vcl_size_t col_thread_id = threadIdx.y;
1503  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
1504  vcl_size_t aStep = block_size * A_col_inc;
1505  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
1506  vcl_size_t bStep = block_size * B_row_inc;
1507  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
1508  T Csub = 0;
1509  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
1510  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
1511 
1512  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1513  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1514  for (vcl_size_t block = 0;
1515  block < block_num;
1516  ++block)
1517  {
1518  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
1519  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
1520  __syncthreads();
1521  T * bufAptr = bufA + row_thread_id_times_block_size;
1522  T * bufBptr = bufB + col_thread_id_times_block_size;
1523  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1524  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1525  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1526  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1527  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1528  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1529  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1530  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1531  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1532  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1533  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1534  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1535  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1536  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1537  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1538  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1539  __syncthreads();
1540  aBegin += aStep;
1541  bBegin += bStep;
1542  }
1543  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1544  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
1545  }
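      // The address arithmetic in aBegin/bBegin (per-block base) and aOffset/bOffset
      // (per-thread offset) is in each case one of the two usual flattening formulas for a
      // strided sub-matrix view, evaluated for element (i, j):
      //
      //   column-major operand:  (i * row_inc + row_start) + (j * col_inc + col_start) * internal_rows
      //   row-major    operand:  (i * row_inc + row_start) * internal_cols + (j * col_inc + col_start)
      //
      // which is also exactly how the final write into C is formed above.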
1546 
1547  // matrix-matrix multiplication C = A * B^T
1548  // matrix layouts: C...col_major, A...row_major, B...col_major
1549  template <typename T>
1551  T alpha,
1552  const T * A,
1553  unsigned int A_row_start,
1554  unsigned int A_col_start,
1555  unsigned int A_row_inc,
1556  unsigned int A_col_inc,
1557  unsigned int A_row_size,
1558  unsigned int A_col_size,
1559  unsigned int A_internal_rows,
1560  unsigned int A_internal_cols,
1561  const T * B,
1562  unsigned int B_row_start,
1563  unsigned int B_col_start,
1564  unsigned int B_row_inc,
1565  unsigned int B_col_inc,
1566  unsigned int B_row_size,
1567  unsigned int B_col_size,
1568  unsigned int B_internal_rows,
1569  unsigned int B_internal_cols,
1570  T beta,
1571  T * C,
1572  unsigned int C_row_start,
1573  unsigned int C_col_start,
1574  unsigned int C_row_inc,
1575  unsigned int C_col_inc,
1576  unsigned int C_row_size,
1577  unsigned int C_col_size,
1578  unsigned int C_internal_rows,
1579  unsigned int C_internal_cols)
1580  {
1581 
1582  __shared__ T bufA[272];
1583  __shared__ T bufB[272];
1584 
1585  vcl_size_t block_size = 16;//get_local_size(0);
1586  vcl_size_t row_block_id = blockIdx.x;
1587  vcl_size_t col_block_id = blockIdx.y;
1588  vcl_size_t row_thread_id = threadIdx.x;
1589  vcl_size_t col_thread_id = threadIdx.y;
1590  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
1591  vcl_size_t aStep = block_size * A_col_inc;
1592  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
1593  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
1594  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
1595  T Csub = 0;
1596  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
1597  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
1598 
1599  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1600  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1601  for (vcl_size_t block = 0;
1602  block < block_num;
1603  ++block)
1604  {
1605  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
1606  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
1607  __syncthreads();
1608  T * bufAptr = bufA + row_thread_id_times_block_size;
1609  T * bufBptr = bufB + col_thread_id_times_block_size;
1610  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1611  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1612  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1613  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1614  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1615  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1616  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1617  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1618  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1619  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1620  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1621  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1622  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1623  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1624  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1625  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1626  __syncthreads();
1627  aBegin += aStep;
1628  bBegin += bStep;
1629  }
1630  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
1631  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
1632  }
1633 
1634  // matrix-matrix multiplication C = A^T * B
1635  // matrix layouts: C...col_major, A...row_major, B...col_major
1636  template <typename T>
1638  T alpha,
1639  const T * A,
1640  unsigned int A_row_start,
1641  unsigned int A_col_start,
1642  unsigned int A_row_inc,
1643  unsigned int A_col_inc,
1644  unsigned int A_row_size,
1645  unsigned int A_col_size,
1646  unsigned int A_internal_rows,
1647  unsigned int A_internal_cols,
1648  const T * B,
1649  unsigned int B_row_start,
1650  unsigned int B_col_start,
1651  unsigned int B_row_inc,
1652  unsigned int B_col_inc,
1653  unsigned int B_row_size,
1654  unsigned int B_col_size,
1655  unsigned int B_internal_rows,
1656  unsigned int B_internal_cols,
1657  T beta,
1658  T * C,
1659  unsigned int C_row_start,
1660  unsigned int C_col_start,
1661  unsigned int C_row_inc,
1662  unsigned int C_col_inc,
1663  unsigned int C_row_size,
1664  unsigned int C_col_size,
1665  unsigned int C_internal_rows,
1666  unsigned int C_internal_cols)
1667  {
1668 
1669  __shared__ T bufA[272];
1670  __shared__ T bufB[272];
1671 
1672  vcl_size_t block_size = 16;//get_local_size(0);
1673  vcl_size_t row_block_id = blockIdx.x;
1674  vcl_size_t col_block_id = blockIdx.y;
1675  vcl_size_t row_thread_id = threadIdx.x;
1676  vcl_size_t col_thread_id = threadIdx.y;
1677  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
1678  vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
1679  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
1680  vcl_size_t bStep = block_size * B_row_inc;
1681  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
1682  T Csub = 0;
1683  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
1684  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
1685 
1686  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1687  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1688  for (vcl_size_t block = 0;
1689  block < block_num;
1690  ++block)
1691  {
1692  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
1693  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
1694  __syncthreads();
1695  T * bufAptr = bufA + row_thread_id_times_block_size;
1696  T * bufBptr = bufB + col_thread_id_times_block_size;
1697  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1698  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1699  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1700  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1701  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1702  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1703  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1704  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1705  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1706  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1707  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1708  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1709  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1710  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1711  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1712  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1713  __syncthreads();
1714  aBegin += aStep;
1715  bBegin += bStep;
1716  }
1717  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1718  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
1719  }
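      // block_num is the usual ceiling division: the number of 16-wide tiles needed to cover
      // the reduction dimension (A_col_size for the C = A * B variants, A_row_size for the
      // A^T variants):
      //
      //   block_num = (K + block_size - 1) / block_size;   // == ceil(K / 16)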
1720 
1721  // matrix-matrix multiplication C = A^T * B^T
1722  // matrix layouts: C...col_major, A...row_major, B...col_major
1723  template <typename T>
1725  T alpha,
1726  const T * A,
1727  unsigned int A_row_start,
1728  unsigned int A_col_start,
1729  unsigned int A_row_inc,
1730  unsigned int A_col_inc,
1731  unsigned int A_row_size,
1732  unsigned int A_col_size,
1733  unsigned int A_internal_rows,
1734  unsigned int A_internal_cols,
1735  const T * B,
1736  unsigned int B_row_start,
1737  unsigned int B_col_start,
1738  unsigned int B_row_inc,
1739  unsigned int B_col_inc,
1740  unsigned int B_row_size,
1741  unsigned int B_col_size,
1742  unsigned int B_internal_rows,
1743  unsigned int B_internal_cols,
1744  T beta,
1745  T * C,
1746  unsigned int C_row_start,
1747  unsigned int C_col_start,
1748  unsigned int C_row_inc,
1749  unsigned int C_col_inc,
1750  unsigned int C_row_size,
1751  unsigned int C_col_size,
1752  unsigned int C_internal_rows,
1753  unsigned int C_internal_cols)
1754  {
1755 
1756  __shared__ T bufA[272];
1757  __shared__ T bufB[272];
1758 
1759  vcl_size_t block_size = 16;//get_local_size(0);
1760  vcl_size_t row_block_id = blockIdx.x;
1761  vcl_size_t col_block_id = blockIdx.y;
1762  vcl_size_t row_thread_id = threadIdx.x;
1763  vcl_size_t col_thread_id = threadIdx.y;
1764  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
1765  vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
1766  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
1767  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
1768  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
1769  T Csub = 0;
1770  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
1771  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
1772 
1773  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1774  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1775  for (vcl_size_t block = 0;
1776  block < block_num;
1777  ++block)
1778  {
1779  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
1780  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
1781  __syncthreads();
1782  T * bufAptr = bufA + row_thread_id_times_block_size;
1783  T * bufBptr = bufB + col_thread_id_times_block_size;
1784  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1785  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1786  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1787  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1788  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1789  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1790  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1791  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1792  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1793  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1794  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1795  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1796  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1797  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1798  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1799  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1800  __syncthreads();
1801  aBegin += aStep;
1802  bBegin += bStep;
1803  }
1804  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
1805  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
1806  }
1807 
1808 
1809 
1810 
1812 
1813 
1814 
1815 
1816  // matrix-matrix multiplication C = A * B
1817  // matrix layouts: C...row_major, A...row_major, B...col_major
1818  template <typename T>
1820  T alpha,
1821  const T * A,
1822  unsigned int A_row_start,
1823  unsigned int A_col_start,
1824  unsigned int A_row_inc,
1825  unsigned int A_col_inc,
1826  unsigned int A_row_size,
1827  unsigned int A_col_size,
1828  unsigned int A_internal_rows,
1829  unsigned int A_internal_cols,
1830  const T * B,
1831  unsigned int B_row_start,
1832  unsigned int B_col_start,
1833  unsigned int B_row_inc,
1834  unsigned int B_col_inc,
1835  unsigned int B_row_size,
1836  unsigned int B_col_size,
1837  unsigned int B_internal_rows,
1838  unsigned int B_internal_cols,
1839  T beta,
1840  T * C,
1841  unsigned int C_row_start,
1842  unsigned int C_col_start,
1843  unsigned int C_row_inc,
1844  unsigned int C_col_inc,
1845  unsigned int C_row_size,
1846  unsigned int C_col_size,
1847  unsigned int C_internal_rows,
1848  unsigned int C_internal_cols)
1849  {
1850 
1851  __shared__ T bufA[272];
1852  __shared__ T bufB[272];
1853 
1854  vcl_size_t block_size = 16;//get_local_size(0);
1855  vcl_size_t row_block_id = blockIdx.x;
1856  vcl_size_t col_block_id = blockIdx.y;
1857  vcl_size_t row_thread_id = threadIdx.x;
1858  vcl_size_t col_thread_id = threadIdx.y;
1859  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
1860  vcl_size_t aStep = block_size * A_col_inc;
1861  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
1862  vcl_size_t bStep = block_size * B_row_inc;
1863  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
1864  T Csub = 0;
1865  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
1866  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
1867 
1868  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1869  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1870  for (vcl_size_t block = 0;
1871  block < block_num;
1872  ++block)
1873  {
1874  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
1875  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
1876  __syncthreads();
1877  T * bufAptr = bufA + row_thread_id_times_block_size;
1878  T * bufBptr = bufB + col_thread_id_times_block_size;
1879  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1880  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1881  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1882  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1883  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1884  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1885  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1886  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1887  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1888  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1889  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1890  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1891  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1892  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1893  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1894  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1895  __syncthreads();
1896  aBegin += aStep;
1897  bBegin += bStep;
1898  }
1899  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1900  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
1901  }
1902 
1903  // matrix-matrix multiplication C = A * B^T
1904  // matrix layouts: C...row_major, A...row_major, B...col_major
1905  template <typename T>
1907  T alpha,
1908  const T * A,
1909  unsigned int A_row_start,
1910  unsigned int A_col_start,
1911  unsigned int A_row_inc,
1912  unsigned int A_col_inc,
1913  unsigned int A_row_size,
1914  unsigned int A_col_size,
1915  unsigned int A_internal_rows,
1916  unsigned int A_internal_cols,
1917  const T * B,
1918  unsigned int B_row_start,
1919  unsigned int B_col_start,
1920  unsigned int B_row_inc,
1921  unsigned int B_col_inc,
1922  unsigned int B_row_size,
1923  unsigned int B_col_size,
1924  unsigned int B_internal_rows,
1925  unsigned int B_internal_cols,
1926  T beta,
1927  T * C,
1928  unsigned int C_row_start,
1929  unsigned int C_col_start,
1930  unsigned int C_row_inc,
1931  unsigned int C_col_inc,
1932  unsigned int C_row_size,
1933  unsigned int C_col_size,
1934  unsigned int C_internal_rows,
1935  unsigned int C_internal_cols)
1936  {
1937 
1938  __shared__ T bufA[272];
1939  __shared__ T bufB[272];
1940 
1941  vcl_size_t block_size = 16;//get_local_size(0);
1942  vcl_size_t row_block_id = blockIdx.x;
1943  vcl_size_t col_block_id = blockIdx.y;
1944  vcl_size_t row_thread_id = threadIdx.x;
1945  vcl_size_t col_thread_id = threadIdx.y;
1946  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
1947  vcl_size_t aStep = block_size * A_col_inc;
1948  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
1949  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
1950  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
1951  T Csub = 0;
1952  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
1953  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
1954 
1955  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1956  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1957  for (vcl_size_t block = 0;
1958  block < block_num;
1959  ++block)
1960  {
1961  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
1962  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
1963  __syncthreads();
1964  T * bufAptr = bufA + row_thread_id_times_block_size;
1965  T * bufBptr = bufB + col_thread_id_times_block_size;
1966  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1967  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1968  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1969  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1970  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1971  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1972  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1973  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1974  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1975  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1976  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1977  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1978  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1979  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1980  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1981  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1982  __syncthreads();
1983  aBegin += aStep;
1984  bBegin += bStep;
1985  }
1986  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
1987  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
1988  }
1989 
1990  // matrix-matrix multiplication C = A^T * B
1991  // matrix layouts: C...row_major, A...row_major, B...col_major
1992  template <typename T>
1994  T alpha,
1995  const T * A,
1996  unsigned int A_row_start,
1997  unsigned int A_col_start,
1998  unsigned int A_row_inc,
1999  unsigned int A_col_inc,
2000  unsigned int A_row_size,
2001  unsigned int A_col_size,
2002  unsigned int A_internal_rows,
2003  unsigned int A_internal_cols,
2004  const T * B,
2005  unsigned int B_row_start,
2006  unsigned int B_col_start,
2007  unsigned int B_row_inc,
2008  unsigned int B_col_inc,
2009  unsigned int B_row_size,
2010  unsigned int B_col_size,
2011  unsigned int B_internal_rows,
2012  unsigned int B_internal_cols,
2013  T beta,
2014  T * C,
2015  unsigned int C_row_start,
2016  unsigned int C_col_start,
2017  unsigned int C_row_inc,
2018  unsigned int C_col_inc,
2019  unsigned int C_row_size,
2020  unsigned int C_col_size,
2021  unsigned int C_internal_rows,
2022  unsigned int C_internal_cols)
2023  {
2024 
2025  __shared__ T bufA[272];
2026  __shared__ T bufB[272];
2027 
2028  vcl_size_t block_size = 16;//get_local_size(0);
2029  vcl_size_t row_block_id = blockIdx.x;
2030  vcl_size_t col_block_id = blockIdx.y;
2031  vcl_size_t row_thread_id = threadIdx.x;
2032  vcl_size_t col_thread_id = threadIdx.y;
2033  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
2034  vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
2035  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
2036  vcl_size_t bStep = block_size * B_row_inc;
2037  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
2038  T Csub = 0;
2039  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2040  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
2041 
2042  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2043  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
2044  for (vcl_size_t block = 0;
2045  block < block_num;
2046  ++block)
2047  {
2048  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
2049  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
2050  __syncthreads();
2051  T * bufAptr = bufA + row_thread_id_times_block_size;
2052  T * bufBptr = bufB + col_thread_id_times_block_size;
2053  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2054  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2055  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2056  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2057  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2058  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2059  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2060  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2061  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2062  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2063  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2064  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2065  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2066  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2067  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2068  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2069  __syncthreads();
2070  aBegin += aStep;
2071  bBegin += bStep;
2072  }
2073  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
2074  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
2075  }
2076 
2077  // matrix-matrix multiplication C = A^T * B^T
2078  // matrix layouts: C...row_major, A...row_major, B...col_major
2079  template <typename T>
2081  T alpha,
2082  const T * A,
2083  unsigned int A_row_start,
2084  unsigned int A_col_start,
2085  unsigned int A_row_inc,
2086  unsigned int A_col_inc,
2087  unsigned int A_row_size,
2088  unsigned int A_col_size,
2089  unsigned int A_internal_rows,
2090  unsigned int A_internal_cols,
2091  const T * B,
2092  unsigned int B_row_start,
2093  unsigned int B_col_start,
2094  unsigned int B_row_inc,
2095  unsigned int B_col_inc,
2096  unsigned int B_row_size,
2097  unsigned int B_col_size,
2098  unsigned int B_internal_rows,
2099  unsigned int B_internal_cols,
2100  T beta,
2101  T * C,
2102  unsigned int C_row_start,
2103  unsigned int C_col_start,
2104  unsigned int C_row_inc,
2105  unsigned int C_col_inc,
2106  unsigned int C_row_size,
2107  unsigned int C_col_size,
2108  unsigned int C_internal_rows,
2109  unsigned int C_internal_cols)
2110  {
2111 
2112  __shared__ T bufA[272];
2113  __shared__ T bufB[272];
2114 
2115  vcl_size_t block_size = 16;//get_local_size(0);
2116  vcl_size_t row_block_id = blockIdx.x;
2117  vcl_size_t col_block_id = blockIdx.y;
2118  vcl_size_t row_thread_id = threadIdx.x;
2119  vcl_size_t col_thread_id = threadIdx.y;
2120  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
2121  vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
2122  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
2123  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
2124  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
2125  T Csub = 0;
2126  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2127  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
2128 
2129  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2130  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
2131  for (vcl_size_t block = 0;
2132  block < block_num;
2133  ++block)
2134  {
2135  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
2136  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
2137  __syncthreads();
2138  T * bufAptr = bufA + row_thread_id_times_block_size;
2139  T * bufBptr = bufB + col_thread_id_times_block_size;
2140  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2141  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2142  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2143  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2144  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2145  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2146  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2147  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2148  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2149  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2150  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2151  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2152  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2153  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2154  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2155  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2156  __syncthreads();
2157  aBegin += aStep;
2158  bBegin += bStep;
2159  }
2160  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
2161  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
2162  }
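      // Note on the guarded tile loads: whenever a thread's source index falls outside the
      // operand (the last, partially filled tile in either direction), a zero is staged into
      // shared memory instead, so the fully unrolled 16-term dot product never needs a
      // special case and the padding cannot change the accumulated result:
      //
      //   buf[dst] = in_range ? X[base + offset] : 0;   // zero padding keeps Csub exact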
2163 
2164 
2165 
2166 
2167 
2169 
2170 
2171 
2172 
2173 
2174 
2175  // matrix-matrix multiplication C = A * B
2176  // matrix layouts: C...col_major, A...row_major, B...row_major
2177  template <typename T>
2179  T alpha,
2180  const T * A,
2181  unsigned int A_row_start,
2182  unsigned int A_col_start,
2183  unsigned int A_row_inc,
2184  unsigned int A_col_inc,
2185  unsigned int A_row_size,
2186  unsigned int A_col_size,
2187  unsigned int A_internal_rows,
2188  unsigned int A_internal_cols,
2189  const T * B,
2190  unsigned int B_row_start,
2191  unsigned int B_col_start,
2192  unsigned int B_row_inc,
2193  unsigned int B_col_inc,
2194  unsigned int B_row_size,
2195  unsigned int B_col_size,
2196  unsigned int B_internal_rows,
2197  unsigned int B_internal_cols,
2198  T beta,
2199  T * C,
2200  unsigned int C_row_start,
2201  unsigned int C_col_start,
2202  unsigned int C_row_inc,
2203  unsigned int C_col_inc,
2204  unsigned int C_row_size,
2205  unsigned int C_col_size,
2206  unsigned int C_internal_rows,
2207  unsigned int C_internal_cols)
2208  {
2209 
2210  __shared__ T bufA[272];
2211  __shared__ T bufB[272];
2212 
2213  vcl_size_t block_size = 16;//get_local_size(0);
2214  vcl_size_t row_block_id = blockIdx.x;
2215  vcl_size_t col_block_id = blockIdx.y;
2216  vcl_size_t row_thread_id = threadIdx.x;
2217  vcl_size_t col_thread_id = threadIdx.y;
2218  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
2219  vcl_size_t aStep = block_size * A_col_inc;
2220  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
2221  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
2222  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
2223  T Csub = 0;
2224  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2225  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
2226 
2227  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2228  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
2229  for (vcl_size_t block = 0;
2230  block < block_num;
2231  ++block)
2232  {
2233  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
2234  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
2235  __syncthreads();
2236  T * bufAptr = bufA + row_thread_id_times_block_size;
2237  T * bufBptr = bufB + col_thread_id_times_block_size;
2238  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2239  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2240  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2241  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2242  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2243  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2244  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2245  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2246  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2247  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2248  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2249  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2250  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2251  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2252  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2253  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2254  __syncthreads();
2255  aBegin += aStep;
2256  bBegin += bStep;
2257  }
2258  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
2259  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
2260  }
2261 
2262  // matrix-matrix multiplication C = A * B^T
2263  // matrix layouts: C...col_major, A...row_major, B...row_major
2264  template <typename T>
2266  T alpha,
2267  const T * A,
2268  unsigned int A_row_start,
2269  unsigned int A_col_start,
2270  unsigned int A_row_inc,
2271  unsigned int A_col_inc,
2272  unsigned int A_row_size,
2273  unsigned int A_col_size,
2274  unsigned int A_internal_rows,
2275  unsigned int A_internal_cols,
2276  const T * B,
2277  unsigned int B_row_start,
2278  unsigned int B_col_start,
2279  unsigned int B_row_inc,
2280  unsigned int B_col_inc,
2281  unsigned int B_row_size,
2282  unsigned int B_col_size,
2283  unsigned int B_internal_rows,
2284  unsigned int B_internal_cols,
2285  T beta,
2286  T * C,
2287  unsigned int C_row_start,
2288  unsigned int C_col_start,
2289  unsigned int C_row_inc,
2290  unsigned int C_col_inc,
2291  unsigned int C_row_size,
2292  unsigned int C_col_size,
2293  unsigned int C_internal_rows,
2294  unsigned int C_internal_cols)
2295  {
2296 
2297  __shared__ T bufA[272];
2298  __shared__ T bufB[272];
2299 
2300  vcl_size_t block_size = 16;//get_local_size(0);
2301  vcl_size_t row_block_id = blockIdx.x;
2302  vcl_size_t col_block_id = blockIdx.y;
2303  vcl_size_t row_thread_id = threadIdx.x;
2304  vcl_size_t col_thread_id = threadIdx.y;
2305  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
2306  vcl_size_t aStep = block_size * A_col_inc;
2307  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
2308  vcl_size_t bStep = block_size * B_col_inc;
2309  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
2310  T Csub = 0;
2311  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2312  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
2313 
2314  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2315  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
2316  for (vcl_size_t block = 0;
2317  block < block_num;
2318  ++block)
2319  {
2320  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
2321  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
2322  __syncthreads();
2323  T * bufAptr = bufA + row_thread_id_times_block_size;
2324  T * bufBptr = bufB + col_thread_id_times_block_size;
2325  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2326  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2327  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2328  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2329  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2330  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2331  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2332  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2333  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2334  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2335  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2336  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2337  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2338  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2339  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2340  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2341  __syncthreads();
2342  aBegin += aStep;
2343  bBegin += bStep;
2344  }
2345  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
2346  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
2347  }
2348 
2349  // matrix-matrix multiplication C = A^T * B
2350  // matrix layouts: C...col_major, A...row_major, B...row_major
2351  template <typename T>
2353  T alpha,
2354  const T * A,
2355  unsigned int A_row_start,
2356  unsigned int A_col_start,
2357  unsigned int A_row_inc,
2358  unsigned int A_col_inc,
2359  unsigned int A_row_size,
2360  unsigned int A_col_size,
2361  unsigned int A_internal_rows,
2362  unsigned int A_internal_cols,
2363  const T * B,
2364  unsigned int B_row_start,
2365  unsigned int B_col_start,
2366  unsigned int B_row_inc,
2367  unsigned int B_col_inc,
2368  unsigned int B_row_size,
2369  unsigned int B_col_size,
2370  unsigned int B_internal_rows,
2371  unsigned int B_internal_cols,
2372  T beta,
2373  T * C,
2374  unsigned int C_row_start,
2375  unsigned int C_col_start,
2376  unsigned int C_row_inc,
2377  unsigned int C_col_inc,
2378  unsigned int C_row_size,
2379  unsigned int C_col_size,
2380  unsigned int C_internal_rows,
2381  unsigned int C_internal_cols)
2382  {
2383 
2384  __shared__ T bufA[272];
2385  __shared__ T bufB[272];
2386 
2387  vcl_size_t block_size = 16;//get_local_size(0);
2388  vcl_size_t row_block_id = blockIdx.x;
2389  vcl_size_t col_block_id = blockIdx.y;
2390  vcl_size_t row_thread_id = threadIdx.x;
2391  vcl_size_t col_thread_id = threadIdx.y;
2392  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
2393  vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
2394  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
2395  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
2396  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
2397  T Csub = 0;
2398  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2399  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
2400 
2401  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2402  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
2403  for (vcl_size_t block = 0;
2404  block < block_num;
2405  ++block)
2406  {
2407  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
2408  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
2409  __syncthreads();
2410  T * bufAptr = bufA + row_thread_id_times_block_size;
2411  T * bufBptr = bufB + col_thread_id_times_block_size;
2412  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2413  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2414  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2415  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2416  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2417  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2418  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2419  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2420  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2421  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2422  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2423  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2424  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2425  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2426  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2427  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2428  __syncthreads();
2429  aBegin += aStep;
2430  bBegin += bStep;
2431  }
2432  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
2433  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
2434  }
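// Added note: in the kernel above (C = A^T * B with C column-major, A and B row-major)
// the reduction runs over A_row_size. Thread (i, j), with i = blockIdx.x*16 + threadIdx.x
// and j = blockIdx.y*16 + threadIdx.y (valid for i < A_col_size, j < B_col_size), accumulates
//   Csub = sum_k A(k, i) * B(k, j)   for 0 <= k < A_row_size,
// and writes C(i, j) = alpha * Csub + beta * C(i, j). When beta == 0 the old value of C
// is never read, so C may be left uninitialized in that case.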
2435 
2436  // matrix-matrix multiplication C = A^T * B^T
2437  // matrix layouts: C...col_major, A...row_major, B...row_major
2438  template <typename T>
2439  __global__ void matrix_matrix_col_row_row_prod_TT_kernel(
2440  T alpha,
2441  const T * A,
2442  unsigned int A_row_start,
2443  unsigned int A_col_start,
2444  unsigned int A_row_inc,
2445  unsigned int A_col_inc,
2446  unsigned int A_row_size,
2447  unsigned int A_col_size,
2448  unsigned int A_internal_rows,
2449  unsigned int A_internal_cols,
2450  const T * B,
2451  unsigned int B_row_start,
2452  unsigned int B_col_start,
2453  unsigned int B_row_inc,
2454  unsigned int B_col_inc,
2455  unsigned int B_row_size,
2456  unsigned int B_col_size,
2457  unsigned int B_internal_rows,
2458  unsigned int B_internal_cols,
2459  T beta,
2460  T * C,
2461  unsigned int C_row_start,
2462  unsigned int C_col_start,
2463  unsigned int C_row_inc,
2464  unsigned int C_col_inc,
2465  unsigned int C_row_size,
2466  unsigned int C_col_size,
2467  unsigned int C_internal_rows,
2468  unsigned int C_internal_cols)
2469  {
2470 
2471  __shared__ T bufA[272];
2472  __shared__ T bufB[272];
2473 
2474  vcl_size_t block_size = 16;//get_local_size(0);
2475  vcl_size_t row_block_id = blockIdx.x;
2476  vcl_size_t col_block_id = blockIdx.y;
2477  vcl_size_t row_thread_id = threadIdx.x;
2478  vcl_size_t col_thread_id = threadIdx.y;
2479  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
2480  vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
2481  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
2482  vcl_size_t bStep = block_size * B_col_inc;
2483  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
2484  T Csub = 0;
2485  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2486  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
2487 
2488  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2489  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
2490  for (vcl_size_t block = 0;
2491  block < block_num;
2492  ++block)
2493  {
2494  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
2495  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
2496  __syncthreads();
2497  T * bufAptr = bufA + row_thread_id_times_block_size;
2498  T * bufBptr = bufB + col_thread_id_times_block_size;
2499  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2500  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2501  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2502  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2503  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2504  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2505  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2506  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2507  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2508  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2509  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2510  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2511  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2512  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2513  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2514  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2515  __syncthreads();
2516  aBegin += aStep;
2517  bBegin += bStep;
2518  }
2519  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
2520  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
2521  }
2522 
2523 
2524 
2525 
2526 
2528 
2529 
2530 
2531 
2532  // matrix-matrix multiplication C = A * B
2533  // matrix layouts: C...row_major, A...row_major, B...row_major
2534  template <typename T>
2535  __global__ void matrix_matrix_row_row_row_prod_AA_kernel(
2536  T alpha,
2537  const T * A,
2538  unsigned int A_row_start,
2539  unsigned int A_col_start,
2540  unsigned int A_row_inc,
2541  unsigned int A_col_inc,
2542  unsigned int A_row_size,
2543  unsigned int A_col_size,
2544  unsigned int A_internal_rows,
2545  unsigned int A_internal_cols,
2546  const T * B,
2547  unsigned int B_row_start,
2548  unsigned int B_col_start,
2549  unsigned int B_row_inc,
2550  unsigned int B_col_inc,
2551  unsigned int B_row_size,
2552  unsigned int B_col_size,
2553  unsigned int B_internal_rows,
2554  unsigned int B_internal_cols,
2555  T beta,
2556  T * C,
2557  unsigned int C_row_start,
2558  unsigned int C_col_start,
2559  unsigned int C_row_inc,
2560  unsigned int C_col_inc,
2561  unsigned int C_row_size,
2562  unsigned int C_col_size,
2563  unsigned int C_internal_rows,
2564  unsigned int C_internal_cols)
2565  {
2566 
2567  __shared__ T bufA[272];
2568  __shared__ T bufB[272];
2569 
2570  vcl_size_t block_size = 16;//get_local_size(0);
2571  vcl_size_t row_block_id = blockIdx.x;
2572  vcl_size_t col_block_id = blockIdx.y;
2573  vcl_size_t row_thread_id = threadIdx.x;
2574  vcl_size_t col_thread_id = threadIdx.y;
2575  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
2576  vcl_size_t aStep = block_size * A_col_inc;
2577  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
2578  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
2579  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
2580  T Csub = 0;
2581  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2582  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
2583 
2584  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2585  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
2586  for (vcl_size_t block = 0;
2587  block < block_num;
2588  ++block)
2589  {
2590  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
2591  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
2592  __syncthreads();
2593  T * bufAptr = bufA + row_thread_id_times_block_size;
2594  T * bufBptr = bufB + col_thread_id_times_block_size;
2595  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2596  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2597  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2598  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2599  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2600  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2601  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2602  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2603  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2604  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2605  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2606  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2607  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2608  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2609  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2610  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2611  __syncthreads();
2612  aBegin += aStep;
2613  bBegin += bStep;
2614  }
2615  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
2616  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
2617  }
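// Illustrative host-side launch (added sketch; not part of this header, and not how
// ViennaCL's own host wrappers are organised). It shows how the C = A * B row-major
// kernel above could be invoked for plain contiguous matrices: A is M x K, B is K x N,
// C is M x N, all starts are 0, all increments are 1, and the internal (padded) sizes
// equal the logical sizes. The 16x16 thread block matches the hard-coded block_size.
template <typename NumericT>
void launch_row_major_prod_AA(NumericT alpha, const NumericT * A, unsigned int M, unsigned int K,
                              const NumericT * B, unsigned int N,
                              NumericT beta, NumericT * C)
{
  dim3 threads(16, 16);                                    // one 16x16 tile of C per block
  dim3 grid((M + 15) / 16, (N + 15) / 16);
  matrix_matrix_row_row_row_prod_AA_kernel<<<grid, threads>>>(
        alpha,
        A, 0, 0, 1, 1, M, K, M, K,                         // A: starts, increments, sizes, internal sizes
        B, 0, 0, 1, 1, K, N, K, N,                         // B: likewise
        beta,
        C, 0, 0, 1, 1, M, N, M, N);                        // C: likewise
}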
2618 
2619  // matrix-matrix multiplication C = A * B^T
2620  // matrix layouts: C...row_major, A...row_major, B...row_major
2621  template <typename T>
2622  __global__ void matrix_matrix_row_row_row_prod_AT_kernel(
2623  T alpha,
2624  const T * A,
2625  unsigned int A_row_start,
2626  unsigned int A_col_start,
2627  unsigned int A_row_inc,
2628  unsigned int A_col_inc,
2629  unsigned int A_row_size,
2630  unsigned int A_col_size,
2631  unsigned int A_internal_rows,
2632  unsigned int A_internal_cols,
2633  const T * B,
2634  unsigned int B_row_start,
2635  unsigned int B_col_start,
2636  unsigned int B_row_inc,
2637  unsigned int B_col_inc,
2638  unsigned int B_row_size,
2639  unsigned int B_col_size,
2640  unsigned int B_internal_rows,
2641  unsigned int B_internal_cols,
2642  T beta,
2643  T * C,
2644  unsigned int C_row_start,
2645  unsigned int C_col_start,
2646  unsigned int C_row_inc,
2647  unsigned int C_col_inc,
2648  unsigned int C_row_size,
2649  unsigned int C_col_size,
2650  unsigned int C_internal_rows,
2651  unsigned int C_internal_cols)
2652  {
2653 
2654  __shared__ T bufA[272];
2655  __shared__ T bufB[272];
2656 
2657  vcl_size_t block_size = 16;//get_local_size(0);
2658  vcl_size_t row_block_id = blockIdx.x;
2659  vcl_size_t col_block_id = blockIdx.y;
2660  vcl_size_t row_thread_id = threadIdx.x;
2661  vcl_size_t col_thread_id = threadIdx.y;
2662  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
2663  vcl_size_t aStep = block_size * A_col_inc;
2664  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
2665  vcl_size_t bStep = block_size * B_col_inc;
2666  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
2667  T Csub = 0;
2668  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2669  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
2670 
2671  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2672  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
2673  for (vcl_size_t block = 0;
2674  block < block_num;
2675  ++block)
2676  {
2677  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
2678  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
2679  __syncthreads();
2680  T * bufAptr = bufA + row_thread_id_times_block_size;
2681  T * bufBptr = bufB + col_thread_id_times_block_size;
2682  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2683  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2684  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2685  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2686  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2687  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2688  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2689  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2690  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2691  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2692  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2693  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2694  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2695  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2696  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2697  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2698  __syncthreads();
2699  aBegin += aStep;
2700  bBegin += bStep;
2701  }
2702  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
2703  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
2704  }
2705 
2706  // matrix-matrix multiplication C = A^T * B
2707  // matrix layouts: C...row_major, A...row_major, B...row_major
2708  template <typename T>
2709  __global__ void matrix_matrix_row_row_row_prod_TA_kernel(
2710  T alpha,
2711  const T * A,
2712  unsigned int A_row_start,
2713  unsigned int A_col_start,
2714  unsigned int A_row_inc,
2715  unsigned int A_col_inc,
2716  unsigned int A_row_size,
2717  unsigned int A_col_size,
2718  unsigned int A_internal_rows,
2719  unsigned int A_internal_cols,
2720  const T * B,
2721  unsigned int B_row_start,
2722  unsigned int B_col_start,
2723  unsigned int B_row_inc,
2724  unsigned int B_col_inc,
2725  unsigned int B_row_size,
2726  unsigned int B_col_size,
2727  unsigned int B_internal_rows,
2728  unsigned int B_internal_cols,
2729  T beta,
2730  T * C,
2731  unsigned int C_row_start,
2732  unsigned int C_col_start,
2733  unsigned int C_row_inc,
2734  unsigned int C_col_inc,
2735  unsigned int C_row_size,
2736  unsigned int C_col_size,
2737  unsigned int C_internal_rows,
2738  unsigned int C_internal_cols)
2739  {
2740 
2741  __shared__ T bufA[272];
2742  __shared__ T bufB[272];
2743 
2744  vcl_size_t block_size = 16;//get_local_size(0);
2745  vcl_size_t row_block_id = blockIdx.x;
2746  vcl_size_t col_block_id = blockIdx.y;
2747  vcl_size_t row_thread_id = threadIdx.x;
2748  vcl_size_t col_thread_id = threadIdx.y;
2749  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
2750  vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
2751  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
2752  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
2753  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
2754  T Csub = 0;
2755  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2756  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
2757 
2758  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2759  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
2760  for (vcl_size_t block = 0;
2761  block < block_num;
2762  ++block)
2763  {
2764  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
2765  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
2766  __syncthreads();
2767  T * bufAptr = bufA + row_thread_id_times_block_size;
2768  T * bufBptr = bufB + col_thread_id_times_block_size;
2769  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2770  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2771  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2772  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2773  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2774  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2775  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2776  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2777  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2778  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2779  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2780  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2781  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2782  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2783  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2784  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2785  __syncthreads();
2786  aBegin += aStep;
2787  bBegin += bStep;
2788  }
2789  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
2790  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
2791  }
2792 
2793  // matrix-matrix multiplication C = A^T * B^T
2794  // matrix layouts: C...row_major, A...row_major, B...row_major
2795  template <typename T>
2796  __global__ void matrix_matrix_row_row_row_prod_TT_kernel(
2797  T alpha,
2798  const T * A,
2799  unsigned int A_row_start,
2800  unsigned int A_col_start,
2801  unsigned int A_row_inc,
2802  unsigned int A_col_inc,
2803  unsigned int A_row_size,
2804  unsigned int A_col_size,
2805  unsigned int A_internal_rows,
2806  unsigned int A_internal_cols,
2807  const T * B,
2808  unsigned int B_row_start,
2809  unsigned int B_col_start,
2810  unsigned int B_row_inc,
2811  unsigned int B_col_inc,
2812  unsigned int B_row_size,
2813  unsigned int B_col_size,
2814  unsigned int B_internal_rows,
2815  unsigned int B_internal_cols,
2816  T beta,
2817  T * C,
2818  unsigned int C_row_start,
2819  unsigned int C_col_start,
2820  unsigned int C_row_inc,
2821  unsigned int C_col_inc,
2822  unsigned int C_row_size,
2823  unsigned int C_col_size,
2824  unsigned int C_internal_rows,
2825  unsigned int C_internal_cols)
2826  {
2827 
2828  __shared__ T bufA[272];
2829  __shared__ T bufB[272];
2830 
2831  vcl_size_t block_size = 16;//get_local_size(0);
2832  vcl_size_t row_block_id = blockIdx.x;
2833  vcl_size_t col_block_id = blockIdx.y;
2834  vcl_size_t row_thread_id = threadIdx.x;
2835  vcl_size_t col_thread_id = threadIdx.y;
2836  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
2837  vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
2838  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
2839  vcl_size_t bStep = block_size * B_col_inc;
2840  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
2841  T Csub = 0;
2842  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2843  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
2844 
2845  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2846  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
2847  for (vcl_size_t block = 0;
2848  block < block_num;
2849  ++block)
2850  {
2851  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
2852  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
2853  __syncthreads();
2854  T * bufAptr = bufA + row_thread_id_times_block_size;
2855  T * bufBptr = bufB + col_thread_id_times_block_size;
2856  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2857  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2858  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2859  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2860  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2861  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2862  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2863  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2864  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2865  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2866  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2867  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2868  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2869  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2870  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2871  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2872  __syncthreads();
2873  aBegin += aStep;
2874  bBegin += bStep;
2875  }
2876  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
2877  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
2878  }
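// Added reference sketch (not ViennaCL code): a plain CPU loop spelling out what the
// C = A^T * B^T row-major kernel above computes for contiguous matrices. Here A is
// K x M, B is N x K and C is M x N, so i < A_col_size, j < B_row_size, and the reduction
// length K equals A_row_size == B_col_size; the function name and layout assumptions are
// purely illustrative.
template <typename NumericT>
void reference_prod_TT_row_major(NumericT alpha, const NumericT * A, unsigned int K, unsigned int M,
                                 const NumericT * B, unsigned int N,
                                 NumericT beta, NumericT * C)
{
  for (unsigned int i = 0; i < M; ++i)
    for (unsigned int j = 0; j < N; ++j)
    {
      NumericT sum = 0;
      for (unsigned int k = 0; k < K; ++k)
        sum += A[k * M + i] * B[j * K + k];                // A(k,i) * B(j,k)
      C[i * N + j] = (beta == 0) ? alpha * sum             // mirror the kernel: C is not read when beta == 0
                                 : alpha * sum + beta * C[i * N + j];
    }
}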
2879 
2880 
2881  } // namespace cuda
2882  } //namespace linalg
2883 } //namespace viennacl
2884 
2885 
2886 #endif