1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_COL_HPP_
2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_COL_HPP_
// Two-operand fragment: the visible loops store B scaled by alpha into A.
// NOTE(review): this extract begins inside a parameter list; the declarator line,
// the pointer/scalar parameters (A, B, alpha) and all braces are not visible here.
// Visible parameters: start offsets, strides (inc), logical sizes and padded
// (internal) sizes of A, an option bit-field, and the same layout data for B.
40 unsigned int A_start1,
unsigned int A_start2,
41 unsigned int A_inc1,
unsigned int A_inc2,
42 unsigned int A_size1,
unsigned int A_size2,
43 unsigned int A_internal_size1,
unsigned int A_internal_size2,
46 unsigned int options2,
48 unsigned int B_start1,
unsigned int B_start2,
49 unsigned int B_inc1,
unsigned int B_inc2,
50 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Bit 0 of options2 guards a statement that is not visible in this extract
// (presumably a sign flip of alpha -- TODO confirm).
53 if (options2 & (1 << 0))
// Because threadIdx.x < blockDim.x, the quotient equals blockIdx.x and the
// remainder equals threadIdx.x: rows start at the block index, columns at the
// thread index.
// NOTE(review): the loops below step columns by gridDim.x and rows by blockDim.x.
// With this mapping, coverage of every element for arbitrary matrix sizes is only
// guaranteed when gridDim.x == blockDim.x. Other fragments in this file use '%'
// for row_gid and '/' for col_gid -- confirm which mapping is intended here.
56 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
57 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Bit 1 of options2 selects the first loop nest (B divided by alpha); the second
// loop nest (B multiplied by alpha) is presumably the else branch, whose keyword
// is not visible.
59 if (options2 & (1 << 1))
// Column-major addressing: the column index is scaled by the padded row count
// (internal_size1), the row index is added unscaled.
61 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
62 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
63 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha;
67 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
68 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
69 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha;
// Two-operand fragment: the visible loops store B scaled by alpha into A.
// NOTE(review): the declarator line, the pointer/scalar parameters (A, B, alpha)
// and all braces are not visible in this extract.
// Visible parameters: start offsets, strides (inc), logical sizes and padded
// (internal) sizes of A, an option bit-field, and the same layout data for B.
77 unsigned int A_start1,
unsigned int A_start2,
78 unsigned int A_inc1,
unsigned int A_inc2,
79 unsigned int A_size1,
unsigned int A_size2,
80 unsigned int A_internal_size1,
unsigned int A_internal_size2,
83 unsigned int options2,
85 unsigned int B_start1,
unsigned int B_start2,
86 unsigned int B_inc1,
unsigned int B_inc2,
87 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Bit 0 of options2 guards a statement that is not visible in this extract
// (presumably a sign flip of alpha -- TODO confirm).
90 if (options2 & (1 << 0))
// Because threadIdx.x < blockDim.x, the remainder equals threadIdx.x and the
// quotient equals blockIdx.x: rows are spread over the threads of a block and
// columns over the blocks, which matches the loop strides below.
93 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
94 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
// Bit 1 of options2 selects the first loop nest (B divided by alpha); the second
// loop nest (B multiplied by alpha) is presumably the else branch, whose keyword
// is not visible.
96 if (options2 & (1 << 1))
// Column-major addressing: the column index is scaled by the padded row count
// (internal_size1), the row index is added unscaled.
98 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
99 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
100 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha;
104 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
105 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
106 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha;
// Three-operand fragment: the visible loops store (B scaled by alpha) plus
// (C scaled by beta) into A, overwriting A.
// NOTE(review): the declarator line, the pointer/scalar parameters (A, B, C,
// alpha, beta), all braces and the else keywords are not visible in this extract.
116 template <
typename T>
119 unsigned int A_start1,
unsigned int A_start2,
120 unsigned int A_inc1,
unsigned int A_inc2,
121 unsigned int A_size1,
unsigned int A_size2,
122 unsigned int A_internal_size1,
unsigned int A_internal_size2,
125 unsigned int options2,
127 unsigned int B_start1,
unsigned int B_start2,
128 unsigned int B_inc1,
unsigned int B_inc2,
129 unsigned int B_internal_size1,
unsigned int B_internal_size2,
132 unsigned int options3,
134 unsigned int C_start1,
unsigned int C_start2,
135 unsigned int C_inc1,
unsigned int C_inc2,
136 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Bit 0 of options2 and of options3 guard statements that are not visible here
// (presumably sign flips of alpha and beta -- TODO confirm).
139 if (options2 & (1 << 0))
143 if (options3 & (1 << 0))
// Because threadIdx.x < blockDim.x, the remainder equals threadIdx.x and the
// quotient equals blockIdx.x: rows are spread over threads, columns over blocks.
146 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
147 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
// Bit 1 of options2 apparently chooses division vs. multiplication by alpha and
// bit 1 of options3 does the same for beta. The four loop nests below appear in
// the order (/alpha, /beta), (/alpha, *beta), (*alpha, /beta), (*alpha, *beta).
149 if (options2 & (1 << 1))
151 if (options3 & (1 << 1))
// Column-major addressing for all three operands: column index times the padded
// row count (internal_size1) plus the row index.
153 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
154 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
155 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
156 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
157 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
161 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
162 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
163 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
164 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
165 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
170 if (options3 & (1 << 1))
172 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
173 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
174 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
175 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
176 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
180 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
181 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
182 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
183 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
184 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
// Three-operand fragment: the visible loops store (B scaled by alpha) plus
// (C scaled by beta) into A, overwriting A.
// NOTE(review): the declarator line, the pointer/scalar parameters (A, B, C,
// alpha, beta), all braces and the else keywords are not visible in this extract.
// The visible text matches neighbouring fragments; the difference is presumably
// in how alpha/beta are passed -- TODO confirm against the full file.
191 template <
typename T>
194 unsigned int A_start1,
unsigned int A_start2,
195 unsigned int A_inc1,
unsigned int A_inc2,
196 unsigned int A_size1,
unsigned int A_size2,
197 unsigned int A_internal_size1,
unsigned int A_internal_size2,
200 unsigned int options2,
202 unsigned int B_start1,
unsigned int B_start2,
203 unsigned int B_inc1,
unsigned int B_inc2,
204 unsigned int B_internal_size1,
unsigned int B_internal_size2,
207 unsigned int options3,
209 unsigned int C_start1,
unsigned int C_start2,
210 unsigned int C_inc1,
unsigned int C_inc2,
211 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Bit 0 of options2 and of options3 guard statements that are not visible here
// (presumably sign flips of alpha and beta -- TODO confirm).
214 if (options2 & (1 << 0))
218 if (options3 & (1 << 0))
// Because threadIdx.x < blockDim.x, the remainder equals threadIdx.x and the
// quotient equals blockIdx.x: rows are spread over threads, columns over blocks.
221 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
222 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
// Bit 1 of options2 apparently chooses division vs. multiplication by alpha and
// bit 1 of options3 does the same for beta. The four loop nests below appear in
// the order (/alpha, /beta), (/alpha, *beta), (*alpha, /beta), (*alpha, *beta).
224 if (options2 & (1 << 1))
226 if (options3 & (1 << 1))
// Column-major addressing for all three operands: column index times the padded
// row count (internal_size1) plus the row index.
228 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
229 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
230 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
231 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
232 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
236 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
237 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
238 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
239 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
240 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
245 if (options3 & (1 << 1))
247 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
248 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
249 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
250 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
251 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
255 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
256 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
257 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
258 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
259 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
// Three-operand fragment: the visible loops store (B scaled by alpha) plus
// (C scaled by beta) into A, overwriting A.
// NOTE(review): the declarator line, the pointer/scalar parameters (A, B, C,
// alpha, beta), all braces and the else keywords are not visible in this extract.
// The visible text matches neighbouring fragments; the difference is presumably
// in how alpha/beta are passed -- TODO confirm against the full file.
265 template <
typename T>
268 unsigned int A_start1,
unsigned int A_start2,
269 unsigned int A_inc1,
unsigned int A_inc2,
270 unsigned int A_size1,
unsigned int A_size2,
271 unsigned int A_internal_size1,
unsigned int A_internal_size2,
274 unsigned int options2,
276 unsigned int B_start1,
unsigned int B_start2,
277 unsigned int B_inc1,
unsigned int B_inc2,
278 unsigned int B_internal_size1,
unsigned int B_internal_size2,
281 unsigned int options3,
283 unsigned int C_start1,
unsigned int C_start2,
284 unsigned int C_inc1,
unsigned int C_inc2,
285 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Bit 0 of options2 and of options3 guard statements that are not visible here
// (presumably sign flips of alpha and beta -- TODO confirm).
288 if (options2 & (1 << 0))
292 if (options3 & (1 << 0))
// Because threadIdx.x < blockDim.x, the remainder equals threadIdx.x and the
// quotient equals blockIdx.x: rows are spread over threads, columns over blocks.
295 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
296 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
// Bit 1 of options2 apparently chooses division vs. multiplication by alpha and
// bit 1 of options3 does the same for beta. The four loop nests below appear in
// the order (/alpha, /beta), (/alpha, *beta), (*alpha, /beta), (*alpha, *beta).
298 if (options2 & (1 << 1))
300 if (options3 & (1 << 1))
// Column-major addressing for all three operands: column index times the padded
// row count (internal_size1) plus the row index.
302 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
303 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
304 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
305 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
306 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
310 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
311 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
312 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
313 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
314 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
319 if (options3 & (1 << 1))
321 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
322 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
323 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
324 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
325 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
329 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
330 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
331 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
332 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
333 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
// Three-operand fragment: the visible loops store (B scaled by alpha) plus
// (C scaled by beta) into A, overwriting A.
// NOTE(review): the declarator line, the pointer/scalar parameters (A, B, C,
// alpha, beta), all braces and the else keywords are not visible in this extract.
// The visible text matches neighbouring fragments; the difference is presumably
// in how alpha/beta are passed -- TODO confirm against the full file.
340 template <
typename T>
343 unsigned int A_start1,
unsigned int A_start2,
344 unsigned int A_inc1,
unsigned int A_inc2,
345 unsigned int A_size1,
unsigned int A_size2,
346 unsigned int A_internal_size1,
unsigned int A_internal_size2,
349 unsigned int options2,
351 unsigned int B_start1,
unsigned int B_start2,
352 unsigned int B_inc1,
unsigned int B_inc2,
353 unsigned int B_internal_size1,
unsigned int B_internal_size2,
356 unsigned int options3,
358 unsigned int C_start1,
unsigned int C_start2,
359 unsigned int C_inc1,
unsigned int C_inc2,
360 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Bit 0 of options2 and of options3 guard statements that are not visible here
// (presumably sign flips of alpha and beta -- TODO confirm).
363 if (options2 & (1 << 0))
367 if (options3 & (1 << 0))
// Because threadIdx.x < blockDim.x, the remainder equals threadIdx.x and the
// quotient equals blockIdx.x: rows are spread over threads, columns over blocks.
370 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
371 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
// Bit 1 of options2 apparently chooses division vs. multiplication by alpha and
// bit 1 of options3 does the same for beta. The four loop nests below appear in
// the order (/alpha, /beta), (/alpha, *beta), (*alpha, /beta), (*alpha, *beta).
373 if (options2 & (1 << 1))
375 if (options3 & (1 << 1))
// Column-major addressing for all three operands: column index times the padded
// row count (internal_size1) plus the row index.
377 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
378 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
379 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
380 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
381 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
385 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
386 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
387 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
388 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
389 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
394 if (options3 & (1 << 1))
396 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
397 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
398 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
399 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
400 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
404 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
405 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
406 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
407 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
408 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
// Three-operand accumulating fragment: the visible loops add (B scaled by alpha)
// plus (C scaled by beta) onto the existing contents of A ('+=').
// NOTE(review): the declarator line, the pointer/scalar parameters (A, B, C,
// alpha, beta), all braces and the else keywords are not visible in this extract.
419 template <
typename T>
422 unsigned int A_start1,
unsigned int A_start2,
423 unsigned int A_inc1,
unsigned int A_inc2,
424 unsigned int A_size1,
unsigned int A_size2,
425 unsigned int A_internal_size1,
unsigned int A_internal_size2,
428 unsigned int options2,
430 unsigned int B_start1,
unsigned int B_start2,
431 unsigned int B_inc1,
unsigned int B_inc2,
432 unsigned int B_internal_size1,
unsigned int B_internal_size2,
435 unsigned int options3,
437 unsigned int C_start1,
unsigned int C_start2,
438 unsigned int C_inc1,
unsigned int C_inc2,
439 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Bit 0 of options2 and of options3 guard statements that are not visible here
// (presumably sign flips of alpha and beta -- TODO confirm).
442 if (options2 & (1 << 0))
446 if (options3 & (1 << 0))
// Because threadIdx.x < blockDim.x, the remainder equals threadIdx.x and the
// quotient equals blockIdx.x: rows are spread over threads, columns over blocks.
449 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
450 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
// Bit 1 of options2 apparently chooses division vs. multiplication by alpha and
// bit 1 of options3 does the same for beta. The four loop nests below appear in
// the order (/alpha, /beta), (/alpha, *beta), (*alpha, /beta), (*alpha, *beta).
452 if (options2 & (1 << 1))
454 if (options3 & (1 << 1))
// Column-major addressing for all three operands: column index times the padded
// row count (internal_size1) plus the row index.
456 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
457 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
458 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
459 += B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
460 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
464 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
465 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
466 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
467 += B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
468 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
473 if (options3 & (1 << 1))
475 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
476 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
477 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
478 += B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
479 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
483 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
484 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
485 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
486 += B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
487 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
// Three-operand fragment: the visible loops store (B scaled by alpha) plus
// (C scaled by beta) into A with plain assignment ('=').
// NOTE(review): the declarator line, the pointer/scalar parameters (A, B, C,
// alpha, beta), all braces and the else keywords are not visible in this extract.
// NOTE(review): the fragment immediately preceding this one has the same visible
// parameter list but accumulates with '+='. If this fragment belongs to the same
// family, the '=' here would discard the previous contents of A -- confirm
// against the full file which operator is intended.
494 template <
typename T>
497 unsigned int A_start1,
unsigned int A_start2,
498 unsigned int A_inc1,
unsigned int A_inc2,
499 unsigned int A_size1,
unsigned int A_size2,
500 unsigned int A_internal_size1,
unsigned int A_internal_size2,
503 unsigned int options2,
505 unsigned int B_start1,
unsigned int B_start2,
506 unsigned int B_inc1,
unsigned int B_inc2,
507 unsigned int B_internal_size1,
unsigned int B_internal_size2,
510 unsigned int options3,
512 unsigned int C_start1,
unsigned int C_start2,
513 unsigned int C_inc1,
unsigned int C_inc2,
514 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Bit 0 of options2 and of options3 guard statements that are not visible here
// (presumably sign flips of alpha and beta -- TODO confirm).
517 if (options2 & (1 << 0))
521 if (options3 & (1 << 0))
// Because threadIdx.x < blockDim.x, the remainder equals threadIdx.x and the
// quotient equals blockIdx.x: rows are spread over threads, columns over blocks.
524 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
525 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
// Bit 1 of options2 apparently chooses division vs. multiplication by alpha and
// bit 1 of options3 does the same for beta. The four loop nests below appear in
// the order (/alpha, /beta), (/alpha, *beta), (*alpha, /beta), (*alpha, *beta).
527 if (options2 & (1 << 1))
529 if (options3 & (1 << 1))
// Column-major addressing for all three operands: column index times the padded
// row count (internal_size1) plus the row index.
531 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
532 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
533 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
534 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
535 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
539 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
540 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
541 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
542 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
543 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
548 if (options3 & (1 << 1))
550 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
551 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
552 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
553 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
554 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
558 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
559 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
560 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
561 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
562 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
// Three-operand fragment: the visible loops store (B scaled by alpha) plus
// (C scaled by beta) into A with plain assignment ('=').
// NOTE(review): the declarator line, the pointer/scalar parameters (A, B, C,
// alpha, beta), all braces and the else keywords are not visible in this extract.
// NOTE(review): an earlier fragment in this file with the same visible parameter
// list accumulates with '+='. If this fragment belongs to that accumulating
// family, the '=' here would discard the previous contents of A -- confirm
// against the full file which operator is intended.
568 template <
typename T>
571 unsigned int A_start1,
unsigned int A_start2,
572 unsigned int A_inc1,
unsigned int A_inc2,
573 unsigned int A_size1,
unsigned int A_size2,
574 unsigned int A_internal_size1,
unsigned int A_internal_size2,
577 unsigned int options2,
579 unsigned int B_start1,
unsigned int B_start2,
580 unsigned int B_inc1,
unsigned int B_inc2,
581 unsigned int B_internal_size1,
unsigned int B_internal_size2,
584 unsigned int options3,
586 unsigned int C_start1,
unsigned int C_start2,
587 unsigned int C_inc1,
unsigned int C_inc2,
588 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Bit 0 of options2 and of options3 guard statements that are not visible here
// (presumably sign flips of alpha and beta -- TODO confirm).
591 if (options2 & (1 << 0))
595 if (options3 & (1 << 0))
// Because threadIdx.x < blockDim.x, the remainder equals threadIdx.x and the
// quotient equals blockIdx.x: rows are spread over threads, columns over blocks.
598 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
599 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
// Bit 1 of options2 apparently chooses division vs. multiplication by alpha and
// bit 1 of options3 does the same for beta. The four loop nests below appear in
// the order (/alpha, /beta), (/alpha, *beta), (*alpha, /beta), (*alpha, *beta).
601 if (options2 & (1 << 1))
603 if (options3 & (1 << 1))
// Column-major addressing for all three operands: column index times the padded
// row count (internal_size1) plus the row index.
605 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
606 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
607 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
608 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
609 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
613 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
614 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
615 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
616 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
617 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
622 if (options3 & (1 << 1))
624 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
625 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
626 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
627 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
628 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
632 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
633 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
634 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
635 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
636 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
// Three-operand fragment: the visible loops store (B scaled by alpha) plus
// (C scaled by beta) into A with plain assignment ('=').
// NOTE(review): the declarator line, the pointer/scalar parameters (A, B, C,
// alpha, beta), all braces and the else keywords are not visible in this extract.
// NOTE(review): an earlier fragment in this file with the same visible parameter
// list accumulates with '+='. If this fragment belongs to that accumulating
// family, the '=' here would discard the previous contents of A -- confirm
// against the full file which operator is intended.
643 template <
typename T>
646 unsigned int A_start1,
unsigned int A_start2,
647 unsigned int A_inc1,
unsigned int A_inc2,
648 unsigned int A_size1,
unsigned int A_size2,
649 unsigned int A_internal_size1,
unsigned int A_internal_size2,
652 unsigned int options2,
654 unsigned int B_start1,
unsigned int B_start2,
655 unsigned int B_inc1,
unsigned int B_inc2,
656 unsigned int B_internal_size1,
unsigned int B_internal_size2,
659 unsigned int options3,
661 unsigned int C_start1,
unsigned int C_start2,
662 unsigned int C_inc1,
unsigned int C_inc2,
663 unsigned int C_internal_size1,
unsigned int C_internal_size2)
// Bit 0 of options2 and of options3 guard statements that are not visible here
// (presumably sign flips of alpha and beta -- TODO confirm).
666 if (options2 & (1 << 0))
670 if (options3 & (1 << 0))
// Because threadIdx.x < blockDim.x, the remainder equals threadIdx.x and the
// quotient equals blockIdx.x: rows are spread over threads, columns over blocks.
673 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
674 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
// Bit 1 of options2 apparently chooses division vs. multiplication by alpha and
// bit 1 of options3 does the same for beta. The four loop nests below appear in
// the order (/alpha, /beta), (/alpha, *beta), (*alpha, /beta), (*alpha, *beta).
676 if (options2 & (1 << 1))
678 if (options3 & (1 << 1))
// Column-major addressing for all three operands: column index times the padded
// row count (internal_size1) plus the row index.
680 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
681 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
682 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
683 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
684 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
688 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
689 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
690 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
691 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
692 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
697 if (options3 & (1 << 1))
699 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
700 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
701 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
702 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
703 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
707 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
708 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
709 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
710 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
711 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
// Fill fragment: the visible loop writes the value alpha to every element
// (row < A_size1, col < A_size2) of the strided sub-matrix A.
// NOTE(review): the declarator line, the A and alpha parameters and all braces
// are not visible in this extract.
722 template <
typename T>
725 unsigned int A_start1,
unsigned int A_start2,
726 unsigned int A_inc1,
unsigned int A_inc2,
727 unsigned int A_size1,
unsigned int A_size2,
728 unsigned int A_internal_size1,
unsigned int A_internal_size2,
// Because threadIdx.x < blockDim.x, the remainder equals threadIdx.x and the
// quotient equals blockIdx.x: rows are spread over threads, columns over blocks.
731 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
732 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
// Column-major addressing: column index times the padded row count
// (internal_size1) plus the row index.
734 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
735 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
736 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = alpha;
// Diagonal fill fragment: the visible loop writes alpha to the entries whose row
// and column index are both 'row', i.e. the diagonal of the strided sub-matrix A.
// NOTE(review): the declarator line, the A and alpha parameters and all braces
// are not visible in this extract.
// NOTE(review): the loop is bounded by A_size1 only; A_size2 is not read in the
// visible code, so a matrix with fewer columns than rows would be indexed past
// its last column -- confirm callers only pass suitable sizes.
740 template <
typename T>
743 unsigned int A_start1,
unsigned int A_start2,
744 unsigned int A_inc1,
unsigned int A_inc2,
745 unsigned int A_size1,
unsigned int A_size2,
746 unsigned int A_internal_size1,
unsigned int A_internal_size2,
// Flat global thread index; the loop advances by the total number of launched
// threads (blockDim.x * gridDim.x).
749 unsigned int gid = (blockIdx.x * blockDim.x + threadIdx.x);
751 for (
unsigned int row = gid;
row < A_size1;
row += blockDim.x * gridDim.x)
752 A[(
row * A_inc1 + A_start1) + (
row * A_inc2 + A_start2) * A_internal_size1] = alpha;
// Element-wise binary fragment: depending on op_type, A receives pow(B, C),
// B / C, or B * C for each element.
// NOTE(review): the declarator line, the A/B/C pointer parameters, all braces
// and the condition guarding the first (pow) branch are not visible here; only
// the 'op_type == 1' (division) and 'op_type == 0' (multiplication) tests are.
759 template <
typename T>
762 unsigned int A_start1,
unsigned int A_start2,
763 unsigned int A_inc1,
unsigned int A_inc2,
764 unsigned int A_size1,
unsigned int A_size2,
765 unsigned int A_internal_size1,
unsigned int A_internal_size2,
768 unsigned int B_start1,
unsigned int B_start2,
769 unsigned int B_inc1,
unsigned int B_inc2,
770 unsigned int B_internal_size1,
unsigned int B_internal_size2,
773 unsigned int C_start1,
unsigned int C_start2,
774 unsigned int C_inc1,
unsigned int C_inc2,
775 unsigned int C_internal_size1,
unsigned int C_internal_size2,
777 unsigned int op_type)
// Because threadIdx.x < blockDim.x, the remainder equals threadIdx.x and the
// quotient equals blockIdx.x: rows are spread over threads, columns over blocks.
779 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
780 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
// First branch (condition not visible): element-wise power, B raised to C.
// All operands use column-major addressing (column index times internal_size1).
784 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
785 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
786 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
787 = pow(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1],
788 C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]);
// op_type == 1: element-wise division B / C.
790 else if (op_type == 1)
792 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
793 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
794 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
795 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
796 / C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
// op_type == 0: element-wise product B * C.
798 else if (op_type == 0)
800 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
801 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
802 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
803 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
804 * C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
// Element-wise binary fragment without a power branch: A receives B / C or
// B * C for each element, selected by op_type.
// NOTE(review): the declarator line, the A/B/C pointer parameters, all braces
// and the condition guarding the first (division) branch are not visible here;
// only the 'op_type == 0' (multiplication) test is.
808 template <
typename T>
811 unsigned int A_start1,
unsigned int A_start2,
812 unsigned int A_inc1,
unsigned int A_inc2,
813 unsigned int A_size1,
unsigned int A_size2,
814 unsigned int A_internal_size1,
unsigned int A_internal_size2,
817 unsigned int B_start1,
unsigned int B_start2,
818 unsigned int B_inc1,
unsigned int B_inc2,
819 unsigned int B_internal_size1,
unsigned int B_internal_size2,
822 unsigned int C_start1,
unsigned int C_start2,
823 unsigned int C_inc1,
unsigned int C_inc2,
824 unsigned int C_internal_size1,
unsigned int C_internal_size2,
826 unsigned int op_type)
// Because threadIdx.x < blockDim.x, the remainder equals threadIdx.x and the
// quotient equals blockIdx.x: rows are spread over threads, columns over blocks.
828 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
829 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
// First branch (condition not visible): element-wise division B / C.
// All operands use column-major addressing (column index times internal_size1).
833 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
834 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
835 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
836 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
837 / C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
// op_type == 0: element-wise product B * C.
839 else if (op_type == 0)
841 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
842 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
843 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
844 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
845 * C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
// Element-wise unary fragment: A receives abs(B) for each element.
// NOTE(review): the declarator line, the A/B pointer parameters and all braces
// are not visible in this extract.
855 template <
typename T>
858 unsigned int A_start1,
unsigned int A_start2,
859 unsigned int A_inc1,
unsigned int A_inc2,
860 unsigned int A_size1,
unsigned int A_size2,
861 unsigned int A_internal_size1,
unsigned int A_internal_size2,
864 unsigned int B_start1,
unsigned int B_start2,
865 unsigned int B_inc1,
unsigned int B_inc2,
866 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Because threadIdx.x < blockDim.x, the quotient equals blockIdx.x and the
// remainder equals threadIdx.x: rows start at the block index, columns at the
// thread index.
// NOTE(review): the loops step columns by gridDim.x and rows by blockDim.x, so
// with this mapping coverage of every element for arbitrary sizes is only
// guaranteed when gridDim.x == blockDim.x. Other fragments in this file use '%'
// for row_gid and '/' for col_gid -- confirm which mapping is intended.
868 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
869 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Column-major addressing: column index times the padded row count
// (internal_size1) plus the row index.
871 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
872 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
873 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = abs(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
// Element-wise unary fragment: A receives acos(B) for each element.
// NOTE(review): the declarator line, the A/B pointer parameters and all braces
// are not visible in this extract.
878 template <
typename T>
881 unsigned int A_start1,
unsigned int A_start2,
882 unsigned int A_inc1,
unsigned int A_inc2,
883 unsigned int A_size1,
unsigned int A_size2,
884 unsigned int A_internal_size1,
unsigned int A_internal_size2,
887 unsigned int B_start1,
unsigned int B_start2,
888 unsigned int B_inc1,
unsigned int B_inc2,
889 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Because threadIdx.x < blockDim.x, the quotient equals blockIdx.x and the
// remainder equals threadIdx.x: rows start at the block index, columns at the
// thread index.
// NOTE(review): the loops step columns by gridDim.x and rows by blockDim.x, so
// with this mapping coverage of every element for arbitrary sizes is only
// guaranteed when gridDim.x == blockDim.x. Other fragments in this file use '%'
// for row_gid and '/' for col_gid -- confirm which mapping is intended.
891 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
892 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Column-major addressing: column index times the padded row count
// (internal_size1) plus the row index.
894 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
895 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
896 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = acos(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
// Element-wise unary fragment: A receives asin(B) for each element.
// NOTE(review): the declarator line, the A/B pointer parameters and all braces
// are not visible in this extract.
901 template <
typename T>
904 unsigned int A_start1,
unsigned int A_start2,
905 unsigned int A_inc1,
unsigned int A_inc2,
906 unsigned int A_size1,
unsigned int A_size2,
907 unsigned int A_internal_size1,
unsigned int A_internal_size2,
910 unsigned int B_start1,
unsigned int B_start2,
911 unsigned int B_inc1,
unsigned int B_inc2,
912 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Because threadIdx.x < blockDim.x, the quotient equals blockIdx.x and the
// remainder equals threadIdx.x: rows start at the block index, columns at the
// thread index.
// NOTE(review): the loops step columns by gridDim.x and rows by blockDim.x, so
// with this mapping coverage of every element for arbitrary sizes is only
// guaranteed when gridDim.x == blockDim.x. Other fragments in this file use '%'
// for row_gid and '/' for col_gid -- confirm which mapping is intended.
914 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
915 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Column-major addressing: column index times the padded row count
// (internal_size1) plus the row index.
917 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
918 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
919 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = asin(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
// Element-wise unary fragment: A receives atan(B) for each element.
// NOTE(review): the declarator line, the A/B pointer parameters and all braces
// are not visible in this extract.
924 template <
typename T>
927 unsigned int A_start1,
unsigned int A_start2,
928 unsigned int A_inc1,
unsigned int A_inc2,
929 unsigned int A_size1,
unsigned int A_size2,
930 unsigned int A_internal_size1,
unsigned int A_internal_size2,
933 unsigned int B_start1,
unsigned int B_start2,
934 unsigned int B_inc1,
unsigned int B_inc2,
935 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Because threadIdx.x < blockDim.x, the quotient equals blockIdx.x and the
// remainder equals threadIdx.x: rows start at the block index, columns at the
// thread index.
// NOTE(review): the loops step columns by gridDim.x and rows by blockDim.x, so
// with this mapping coverage of every element for arbitrary sizes is only
// guaranteed when gridDim.x == blockDim.x. Other fragments in this file use '%'
// for row_gid and '/' for col_gid -- confirm which mapping is intended.
937 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
938 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Column-major addressing: column index times the padded row count
// (internal_size1) plus the row index.
940 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
941 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
942 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = atan(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
// Element-wise unary fragment: A receives ceil(B) for each element.
// NOTE(review): the declarator line, the A/B pointer parameters and all braces
// are not visible in this extract.
947 template <
typename T>
950 unsigned int A_start1,
unsigned int A_start2,
951 unsigned int A_inc1,
unsigned int A_inc2,
952 unsigned int A_size1,
unsigned int A_size2,
953 unsigned int A_internal_size1,
unsigned int A_internal_size2,
956 unsigned int B_start1,
unsigned int B_start2,
957 unsigned int B_inc1,
unsigned int B_inc2,
958 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Because threadIdx.x < blockDim.x, the quotient equals blockIdx.x and the
// remainder equals threadIdx.x: rows start at the block index, columns at the
// thread index.
// NOTE(review): the loops step columns by gridDim.x and rows by blockDim.x, so
// with this mapping coverage of every element for arbitrary sizes is only
// guaranteed when gridDim.x == blockDim.x. Other fragments in this file use '%'
// for row_gid and '/' for col_gid -- confirm which mapping is intended.
960 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
961 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Column-major addressing: column index times the padded row count
// (internal_size1) plus the row index.
963 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
964 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
965 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = ceil(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
// Element-wise unary fragment: A receives cos(B) for each element.
// NOTE(review): the declarator line, the A/B pointer parameters and all braces
// are not visible in this extract.
970 template <
typename T>
973 unsigned int A_start1,
unsigned int A_start2,
974 unsigned int A_inc1,
unsigned int A_inc2,
975 unsigned int A_size1,
unsigned int A_size2,
976 unsigned int A_internal_size1,
unsigned int A_internal_size2,
979 unsigned int B_start1,
unsigned int B_start2,
980 unsigned int B_inc1,
unsigned int B_inc2,
981 unsigned int B_internal_size1,
unsigned int B_internal_size2)
// Because threadIdx.x < blockDim.x, the quotient equals blockIdx.x and the
// remainder equals threadIdx.x: rows start at the block index, columns at the
// thread index.
// NOTE(review): the loops step columns by gridDim.x and rows by blockDim.x, so
// with this mapping coverage of every element for arbitrary sizes is only
// guaranteed when gridDim.x == blockDim.x. Other fragments in this file use '%'
// for row_gid and '/' for col_gid -- confirm which mapping is intended.
983 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
984 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Column-major addressing: column index times the padded row count
// (internal_size1) plus the row index.
986 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
987 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
988 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = cos(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
/** @brief Element-wise hyperbolic cosine on a column-major matrix: A(i,j) = cosh(B(i,j)).
*
* Entry (i,j) of A lives at (i * A_inc1 + A_start1) + (j * A_inc2 + A_start2) * A_internal_size1;
* B is addressed the same way through its own start/inc/internal_size1 values.
* A_internal_size2 and B_internal_size2 are accepted but never read.
*
* Work distribution (1D launch): block b handles columns b, b + gridDim.x, ...;
* within a column, thread t handles rows t, t + blockDim.x, ... Start index and
* step of each loop refer to the same dimension, so every entry is visited
* exactly once for any grid/block size, and neighbouring threads touch
* neighbouring rows of one column.
*/
template <typename T>
__global__ void matrix_col_element_cosh_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // row_gid == threadIdx.x and col_gid == blockIdx.x; each must match the step of its loop below.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = cosh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
/** @brief Element-wise exponential on a column-major matrix: A(i,j) = exp(B(i,j)).
*
* Entry (i,j) of A lives at (i * A_inc1 + A_start1) + (j * A_inc2 + A_start2) * A_internal_size1;
* B is addressed the same way through its own start/inc/internal_size1 values.
* A_internal_size2 and B_internal_size2 are accepted but never read.
*
* Work distribution (1D launch): block b handles columns b, b + gridDim.x, ...;
* within a column, thread t handles rows t, t + blockDim.x, ... Start index and
* step of each loop refer to the same dimension, so every entry is visited
* exactly once for any grid/block size, and neighbouring threads touch
* neighbouring rows of one column.
*/
template <typename T>
__global__ void matrix_col_element_exp_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // row_gid == threadIdx.x and col_gid == blockIdx.x; each must match the step of its loop below.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = exp(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
/** @brief Element-wise absolute value on a column-major matrix: A(i,j) = fabs(B(i,j)).
*
* Entry (i,j) of A lives at (i * A_inc1 + A_start1) + (j * A_inc2 + A_start2) * A_internal_size1;
* B is addressed the same way through its own start/inc/internal_size1 values.
* A_internal_size2 and B_internal_size2 are accepted but never read.
*
* Work distribution (1D launch): block b handles columns b, b + gridDim.x, ...;
* within a column, thread t handles rows t, t + blockDim.x, ... Start index and
* step of each loop refer to the same dimension, so every entry is visited
* exactly once for any grid/block size, and neighbouring threads touch
* neighbouring rows of one column.
*/
template <typename T>
__global__ void matrix_col_element_fabs_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // row_gid == threadIdx.x and col_gid == blockIdx.x; each must match the step of its loop below.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = fabs(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
/** @brief Element-wise floor on a column-major matrix: A(i,j) = floor(B(i,j)).
*
* Entry (i,j) of A lives at (i * A_inc1 + A_start1) + (j * A_inc2 + A_start2) * A_internal_size1;
* B is addressed the same way through its own start/inc/internal_size1 values.
* A_internal_size2 and B_internal_size2 are accepted but never read.
*
* Work distribution (1D launch): block b handles columns b, b + gridDim.x, ...;
* within a column, thread t handles rows t, t + blockDim.x, ... Start index and
* step of each loop refer to the same dimension, so every entry is visited
* exactly once for any grid/block size, and neighbouring threads touch
* neighbouring rows of one column.
*/
template <typename T>
__global__ void matrix_col_element_floor_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // row_gid == threadIdx.x and col_gid == blockIdx.x; each must match the step of its loop below.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = floor(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
/** @brief Element-wise natural logarithm on a column-major matrix: A(i,j) = log(B(i,j)).
*
* Entry (i,j) of A lives at (i * A_inc1 + A_start1) + (j * A_inc2 + A_start2) * A_internal_size1;
* B is addressed the same way through its own start/inc/internal_size1 values.
* A_internal_size2 and B_internal_size2 are accepted but never read.
*
* Work distribution (1D launch): block b handles columns b, b + gridDim.x, ...;
* within a column, thread t handles rows t, t + blockDim.x, ... Start index and
* step of each loop refer to the same dimension, so every entry is visited
* exactly once for any grid/block size, and neighbouring threads touch
* neighbouring rows of one column.
*/
template <typename T>
__global__ void matrix_col_element_log_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // row_gid == threadIdx.x and col_gid == blockIdx.x; each must match the step of its loop below.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = log(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
/** @brief Element-wise base-10 logarithm on a column-major matrix: A(i,j) = log10(B(i,j)).
*
* Entry (i,j) of A lives at (i * A_inc1 + A_start1) + (j * A_inc2 + A_start2) * A_internal_size1;
* B is addressed the same way through its own start/inc/internal_size1 values.
* A_internal_size2 and B_internal_size2 are accepted but never read.
*
* Work distribution (1D launch): block b handles columns b, b + gridDim.x, ...;
* within a column, thread t handles rows t, t + blockDim.x, ... Start index and
* step of each loop refer to the same dimension, so every entry is visited
* exactly once for any grid/block size, and neighbouring threads touch
* neighbouring rows of one column.
*/
template <typename T>
__global__ void matrix_col_element_log10_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // row_gid == threadIdx.x and col_gid == blockIdx.x; each must match the step of its loop below.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = log10(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
/** @brief Element-wise sine on a column-major matrix: A(i,j) = sin(B(i,j)).
*
* Entry (i,j) of A lives at (i * A_inc1 + A_start1) + (j * A_inc2 + A_start2) * A_internal_size1;
* B is addressed the same way through its own start/inc/internal_size1 values.
* A_internal_size2 and B_internal_size2 are accepted but never read.
*
* Work distribution (1D launch): block b handles columns b, b + gridDim.x, ...;
* within a column, thread t handles rows t, t + blockDim.x, ... Start index and
* step of each loop refer to the same dimension, so every entry is visited
* exactly once for any grid/block size, and neighbouring threads touch
* neighbouring rows of one column.
*/
template <typename T>
__global__ void matrix_col_element_sin_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // row_gid == threadIdx.x and col_gid == blockIdx.x; each must match the step of its loop below.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = sin(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
/** @brief Element-wise hyperbolic sine on a column-major matrix: A(i,j) = sinh(B(i,j)).
*
* Entry (i,j) of A lives at (i * A_inc1 + A_start1) + (j * A_inc2 + A_start2) * A_internal_size1;
* B is addressed the same way through its own start/inc/internal_size1 values.
* A_internal_size2 and B_internal_size2 are accepted but never read.
*
* Work distribution (1D launch): block b handles columns b, b + gridDim.x, ...;
* within a column, thread t handles rows t, t + blockDim.x, ... Start index and
* step of each loop refer to the same dimension, so every entry is visited
* exactly once for any grid/block size, and neighbouring threads touch
* neighbouring rows of one column.
*/
template <typename T>
__global__ void matrix_col_element_sinh_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // row_gid == threadIdx.x and col_gid == blockIdx.x; each must match the step of its loop below.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = sinh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
/** @brief Element-wise square root on a column-major matrix: A(i,j) = sqrt(B(i,j)).
*
* Entry (i,j) of A lives at (i * A_inc1 + A_start1) + (j * A_inc2 + A_start2) * A_internal_size1;
* B is addressed the same way through its own start/inc/internal_size1 values.
* A_internal_size2 and B_internal_size2 are accepted but never read.
*
* Work distribution (1D launch): block b handles columns b, b + gridDim.x, ...;
* within a column, thread t handles rows t, t + blockDim.x, ... Start index and
* step of each loop refer to the same dimension, so every entry is visited
* exactly once for any grid/block size, and neighbouring threads touch
* neighbouring rows of one column.
*/
template <typename T>
__global__ void matrix_col_element_sqrt_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // row_gid == threadIdx.x and col_gid == blockIdx.x; each must match the step of its loop below.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = sqrt(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
/** @brief Element-wise tangent on a column-major matrix: A(i,j) = tan(B(i,j)).
*
* Entry (i,j) of A lives at (i * A_inc1 + A_start1) + (j * A_inc2 + A_start2) * A_internal_size1;
* B is addressed the same way through its own start/inc/internal_size1 values.
* A_internal_size2 and B_internal_size2 are accepted but never read.
*
* Work distribution (1D launch): block b handles columns b, b + gridDim.x, ...;
* within a column, thread t handles rows t, t + blockDim.x, ... Start index and
* step of each loop refer to the same dimension, so every entry is visited
* exactly once for any grid/block size, and neighbouring threads touch
* neighbouring rows of one column.
*/
template <typename T>
__global__ void matrix_col_element_tan_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // row_gid == threadIdx.x and col_gid == blockIdx.x; each must match the step of its loop below.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = tan(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
/** @brief Element-wise hyperbolic tangent on a column-major matrix: A(i,j) = tanh(B(i,j)).
*
* Entry (i,j) of A lives at (i * A_inc1 + A_start1) + (j * A_inc2 + A_start2) * A_internal_size1;
* B is addressed the same way through its own start/inc/internal_size1 values.
* A_internal_size2 and B_internal_size2 are accepted but never read.
*
* Work distribution (1D launch): block b handles columns b, b + gridDim.x, ...;
* within a column, thread t handles rows t, t + blockDim.x, ... Start index and
* step of each loop refer to the same dimension, so every entry is visited
* exactly once for any grid/block size, and neighbouring threads touch
* neighbouring rows of one column.
*/
template <typename T>
__global__ void matrix_col_element_tanh_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // row_gid == threadIdx.x and col_gid == blockIdx.x; each must match the step of its loop below.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = tanh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
/** @brief Matrix-vector product for a column-major matrix: result(i) = sum_j A(i,j) * v(j).
*
* Entry (i,j) of A is read from (i * A_row_inc + A_row_start) + (j * A_col_inc + A_col_start) * A_internal_rows,
* v(j) from v[v_start + v_inc * j], and result(i) is written to result[i * result_inc + result_start].
* A_internal_cols, v_size and result_size are accepted but never read.
*
* Each thread computes complete rows: it starts at its global thread index and
* advances by the total number of launched threads (gridDim.x * blockDim.x),
* so any 1D launch configuration covers all A_row_size rows.
*/
template <typename T>
__global__ void vec_mul_col_kernel(
          const T * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const T * v,
          unsigned int v_start,
          unsigned int v_inc,
          unsigned int v_size,
          T * result,
          unsigned int result_start,
          unsigned int result_inc,
          unsigned int result_size)
{
  const unsigned int total_threads = gridDim.x * blockDim.x;

  for (unsigned int row = blockIdx.x * blockDim.x + threadIdx.x; row < A_row_size; row += total_threads)
  {
    // Offset of this row inside a column; identical for every column, so computed once.
    const unsigned int row_offset = row * A_row_inc + A_row_start;

    T dot_prod = 0;   // accumulator must restart for every row
    for (unsigned int col = 0; col < A_col_size; ++col)
      dot_prod += A[row_offset + (col * A_col_inc + A_col_start) * A_internal_rows] * v[v_start + v_inc * col];

    result[row * result_inc + result_start] = dot_prod;
  }
}
// Transposed matrix-vector product kernel (listed as trans_vec_mul_col_kernel,
// definition line 1282, in the declaration index at the end of this listing).
// For every index `row` below A_col_size, the products
//   A[(row * A_col_inc + A_col_start) * A_internal_rows + col * A_row_inc + A_row_start] * v[v_start + v_inc * col]
// are accumulated over col < A_row_size; the loop variable `row` therefore
// selects a column of the stored matrix and `col` a row within that column.
// NOTE(review): this listing omits the signature line, the A / v / v_inc /
// result parameters, the braces, the declaration of dot_prod and the
// statements between the accumulation and the final store (original lines
// 1312-1316 and 1318-1320) - recover them from the original file before
// relying on this text.
1281 template <
typename T>
1284 unsigned int A_row_start,
1285 unsigned int A_col_start,
1286 unsigned int A_row_inc,
1287 unsigned int A_col_inc,
1288 unsigned int A_row_size,
1289 unsigned int A_col_size,
1290 unsigned int A_internal_rows,
1291 unsigned int A_internal_cols,
1293 unsigned int v_start,
1295 unsigned int v_size,
1297 unsigned int result_start,
1298 unsigned int result_inc,
1299 unsigned int result_size)
// Block-shared scratch array with a fixed 128 entries, indexed by lid (= threadIdx.x) below.
// NOTE(review): presumably requires blockDim.x <= 128 - confirm against the launch site.
1301 __shared__ T work[128];
// For a 1D launch these evaluate to row_gid == blockIdx.x and col_gid == threadIdx.x.
1303 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1304 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1305 unsigned int lid = threadIdx.x;
// One output entry per iteration; the index advances by gridDim.x each time.
1307 for (
unsigned int row = row_gid;
row < A_col_size;
row += gridDim.x)
// Each thread sums every blockDim.x-th product, starting at col_gid.
1310 for (
unsigned int col = col_gid; col < A_row_size; col += blockDim.x)
1311 dot_prod += A[(
row * A_col_inc + A_col_start) * A_internal_rows + col * A_row_inc + A_row_start] * v[v_start + v_inc * col];
// Pairwise combination of entries of the shared array.
// NOTE(review): the loop that defines `stride`, the store of dot_prod into
// work[], and any barrier around this line are not visible in this listing.
1317 work[lid] += work[lid+
stride];
// Entry 0 of the shared array is stored as the result for this index.
// NOTE(review): the guard selecting which thread performs the store, if any, is not visible here.
1321 result[
row * result_inc + result_start] = work[0];
// Rank-1 update kernel (listed as scaled_rank1_update_col_kernel, definition
// line 1339, in the declaration index at the end of this listing):
//   A(row, col) += alpha * vec1[row * inc1 + start1] * vec2[col * inc2 + start2]
// where A(row, col) is stored at
//   (row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1.
// NOTE(review): this listing omits the signature line, the A / val / vec1 /
// vec2 parameters, the braces, the declaration of alpha (presumably
// initialised from the `val` parameter shown in the index - confirm) and the
// statement controlled by the first `if` (original line 1361).
1338 template <
typename T>
1341 unsigned int A_start1,
unsigned int A_start2,
1342 unsigned int A_inc1,
unsigned int A_inc2,
1343 unsigned int A_size1,
unsigned int A_size2,
1344 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1347 unsigned int options2,
// Bit 0 of options2: the controlled statement is not visible in this listing.
1360 if (options2 & (1 << 0))
// Bit 1 of options2: use the reciprocal of alpha as the scaling factor.
1362 if (options2 & (1 << 1))
1363 alpha = ((T)(1)) / alpha;
// For a 1D launch these evaluate to row_gid == blockIdx.x and col_gid == threadIdx.x.
1365 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1366 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Rows start at row_gid and advance by gridDim.x.
1368 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
// Row-dependent factor, computed once per row and reused for every column.
1370 T tmp = alpha * vec1[
row * inc1 +
start1];
// Columns start at col_gid and advance by blockDim.x.
1371 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1372 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2];
// Second rank-1 update kernel. Its visible statements are the same as those of
// the kernel that precedes it in this listing:
//   A(row, col) += alpha * vec1[row * inc1 + start1] * vec2[col * inc2 + start2]
// where A(row, col) is stored at
//   (row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1.
// NOTE(review): neither the name nor the full signature of this kernel is
// visible (the declaration index only lists the definition at line 1339);
// presumably an overload that receives the scaling factor in a different
// form - confirm against the original file. The braces, the declaration of
// alpha and the statement controlled by the first `if` (original line 1401)
// are also missing from this listing.
1378 template <
typename T>
1381 unsigned int A_start1,
unsigned int A_start2,
1382 unsigned int A_inc1,
unsigned int A_inc2,
1383 unsigned int A_size1,
unsigned int A_size2,
1384 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1387 unsigned int options2,
// Bit 0 of options2: the controlled statement is not visible in this listing.
1400 if (options2 & (1 << 0))
// Bit 1 of options2: use the reciprocal of alpha as the scaling factor.
1402 if (options2 & (1 << 1))
1403 alpha = ((T)(1)) / alpha;
// For a 1D launch these evaluate to row_gid == blockIdx.x and col_gid == threadIdx.x.
1405 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1406 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
// Rows start at row_gid and advance by gridDim.x.
1408 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
// Row-dependent factor, computed once per row and reused for every column.
1410 T tmp = alpha * vec1[
row * inc1 +
start1];
// Columns start at col_gid and advance by blockDim.x.
1411 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1412 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2];
__global__ void scaled_rank1_update_col_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, T val, unsigned int options2, const T *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const T *vec2, unsigned int start2, unsigned int inc2, unsigned int size2)
Definition: matrix_operations_col.hpp:1339
__global__ void matrix_col_element_log_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:1086
__global__ void matrix_col_element_abs_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:856
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:216
__global__ void matrix_col_diagonal_assign_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, T alpha)
Definition: matrix_operations_col.hpp:741
__global__ void matrix_col_element_atan_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:925
__global__ void matrix_col_element_acos_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:879
__global__ void matrix_col_element_floor_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:1063
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:46
result_of::size_type< T >::type start1(T const &obj)
Definition: start.hpp:64
__global__ void matrix_col_element_cos_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:971
__global__ void ambm_m_col_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, T fac2, unsigned int options2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, T fac3, unsigned int options3, const T *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
Definition: matrix_operations_col.hpp:420
__global__ void matrix_col_element_asin_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:902
__global__ void matrix_col_element_exp_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:1017
__global__ void matrix_col_element_ceil_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:948
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:245
__global__ void ambm_col_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, T fac2, unsigned int options2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, T fac3, unsigned int options3, const T *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
Definition: matrix_operations_col.hpp:117
__global__ void matrix_col_element_tanh_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:1224
result_of::size_type< T >::type start2(T const &obj)
Definition: start.hpp:83
__global__ void am_col_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, T fac2, unsigned int options2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:38
__global__ void vec_mul_col_kernel(const T *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const T *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, T *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
Definition: matrix_operations_col.hpp:1251
__global__ void matrix_col_element_fabs_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:1040
__global__ void matrix_col_element_tan_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:1201
__global__ void element_op_col_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const T *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
Definition: matrix_operations_col.hpp:760
__global__ void element_op_int_col_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const T *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
Definition: matrix_operations_col.hpp:809
__global__ void trans_vec_mul_col_kernel(const T *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const T *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, T *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
Definition: matrix_operations_col.hpp:1282
__global__ void matrix_col_element_log10_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:1109
__global__ void matrix_col_element_cosh_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:994
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
Definition: matrix.hpp:910
__global__ void matrix_col_element_sinh_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:1155
__global__ void matrix_col_element_sin_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:1132
void dot_prod(const MatrixType &A, unsigned int beg_ind, ScalarType &res)
Dot product of a particular column of matrix A with itself, starting at a certain index beg_ind...
Definition: qr.hpp:154
__global__ void matrix_col_element_sqrt_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_col.hpp:1178
__global__ void matrix_col_assign_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, T alpha)
Definition: matrix_operations_col.hpp:723