1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_PROD_HPP_
2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_PROD_HPP_
38 template<
typename NumericT>
42 unsigned int A_row_start,
43 unsigned int A_col_start,
44 unsigned int A_row_inc,
45 unsigned int A_col_inc,
46 unsigned int A_row_size,
47 unsigned int A_col_size,
48 unsigned int A_internal_rows,
49 unsigned int A_internal_cols,
51 unsigned int B_row_start,
52 unsigned int B_col_start,
53 unsigned int B_row_inc,
54 unsigned int B_col_inc,
55 unsigned int B_row_size,
56 unsigned int B_col_size,
57 unsigned int B_internal_rows,
58 unsigned int B_internal_cols,
61 unsigned int C_row_start,
62 unsigned int C_col_start,
63 unsigned int C_row_inc,
64 unsigned int C_col_inc,
65 unsigned int C_row_size,
66 unsigned int C_col_size,
67 unsigned int C_internal_rows,
68 unsigned int C_internal_cols)
79 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
80 vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
81 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
83 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
85 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
86 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
88 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
89 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
94 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
95 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
97 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
98 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
99 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
100 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
101 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
102 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
103 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
104 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
105 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
106 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
107 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
108 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
109 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
110 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
111 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
112 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
113 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
114 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
119 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
120 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
125 template<
typename NumericT>
129 unsigned int A_row_start,
130 unsigned int A_col_start,
131 unsigned int A_row_inc,
132 unsigned int A_col_inc,
133 unsigned int A_row_size,
134 unsigned int A_col_size,
135 unsigned int A_internal_rows,
136 unsigned int A_internal_cols,
138 unsigned int B_row_start,
139 unsigned int B_col_start,
140 unsigned int B_row_inc,
141 unsigned int B_col_inc,
142 unsigned int B_row_size,
143 unsigned int B_col_size,
144 unsigned int B_internal_rows,
145 unsigned int B_internal_cols,
148 unsigned int C_row_start,
149 unsigned int C_col_start,
150 unsigned int C_row_inc,
151 unsigned int C_col_inc,
152 unsigned int C_row_size,
153 unsigned int C_col_size,
154 unsigned int C_internal_rows,
155 unsigned int C_internal_cols)
166 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
167 vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
168 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
169 vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
170 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
172 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
173 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
175 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
176 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
181 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
182 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
184 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
185 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
186 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
187 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
188 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
189 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
190 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
191 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
192 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
193 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
194 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
195 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
196 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
197 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
198 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
199 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
200 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
201 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
206 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
207 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
212 template<
typename NumericT>
216 unsigned int A_row_start,
217 unsigned int A_col_start,
218 unsigned int A_row_inc,
219 unsigned int A_col_inc,
220 unsigned int A_row_size,
221 unsigned int A_col_size,
222 unsigned int A_internal_rows,
223 unsigned int A_internal_cols,
225 unsigned int B_row_start,
226 unsigned int B_col_start,
227 unsigned int B_row_inc,
228 unsigned int B_col_inc,
229 unsigned int B_row_size,
230 unsigned int B_col_size,
231 unsigned int B_internal_rows,
232 unsigned int B_internal_cols,
235 unsigned int C_row_start,
236 unsigned int C_col_start,
237 unsigned int C_row_inc,
238 unsigned int C_col_inc,
239 unsigned int C_row_size,
240 unsigned int C_col_size,
241 unsigned int C_internal_rows,
242 unsigned int C_internal_cols)
253 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
255 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
257 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
259 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
260 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
262 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
263 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
268 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
269 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
271 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
272 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
273 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
274 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
275 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
276 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
277 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
278 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
279 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
280 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
281 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
282 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
283 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
284 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
285 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
286 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
287 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
288 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
293 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
294 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
299 template<
typename NumericT>
303 unsigned int A_row_start,
304 unsigned int A_col_start,
305 unsigned int A_row_inc,
306 unsigned int A_col_inc,
307 unsigned int A_row_size,
308 unsigned int A_col_size,
309 unsigned int A_internal_rows,
310 unsigned int A_internal_cols,
312 unsigned int B_row_start,
313 unsigned int B_col_start,
314 unsigned int B_row_inc,
315 unsigned int B_col_inc,
316 unsigned int B_row_size,
317 unsigned int B_col_size,
318 unsigned int B_internal_rows,
319 unsigned int B_internal_cols,
322 unsigned int C_row_start,
323 unsigned int C_col_start,
324 unsigned int C_row_inc,
325 unsigned int C_col_inc,
326 unsigned int C_row_size,
327 unsigned int C_col_size,
328 unsigned int C_internal_rows,
329 unsigned int C_internal_cols)
340 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
342 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
343 vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
344 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
346 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
347 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
349 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
350 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
355 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
356 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
358 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
359 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
360 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
361 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
362 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
363 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
364 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
365 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
366 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
367 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
368 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
369 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
370 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
371 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
372 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
373 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
374 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
375 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
380 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
381 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
393 template<
typename NumericT>
397 unsigned int A_row_start,
398 unsigned int A_col_start,
399 unsigned int A_row_inc,
400 unsigned int A_col_inc,
401 unsigned int A_row_size,
402 unsigned int A_col_size,
403 unsigned int A_internal_rows,
404 unsigned int A_internal_cols,
406 unsigned int B_row_start,
407 unsigned int B_col_start,
408 unsigned int B_row_inc,
409 unsigned int B_col_inc,
410 unsigned int B_row_size,
411 unsigned int B_col_size,
412 unsigned int B_internal_rows,
413 unsigned int B_internal_cols,
416 unsigned int C_row_start,
417 unsigned int C_col_start,
418 unsigned int C_row_inc,
419 unsigned int C_col_inc,
420 unsigned int C_row_size,
421 unsigned int C_col_size,
422 unsigned int C_internal_rows,
423 unsigned int C_internal_cols)
434 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
435 vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
436 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
438 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
440 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
441 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
443 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
444 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
449 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
450 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
452 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
453 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
454 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
455 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
456 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
457 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
458 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
459 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
460 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
461 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
462 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
463 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
464 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
465 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
466 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
467 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
468 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
469 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
474 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
475 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
480 template<
typename NumericT>
484 unsigned int A_row_start,
485 unsigned int A_col_start,
486 unsigned int A_row_inc,
487 unsigned int A_col_inc,
488 unsigned int A_row_size,
489 unsigned int A_col_size,
490 unsigned int A_internal_rows,
491 unsigned int A_internal_cols,
493 unsigned int B_row_start,
494 unsigned int B_col_start,
495 unsigned int B_row_inc,
496 unsigned int B_col_inc,
497 unsigned int B_row_size,
498 unsigned int B_col_size,
499 unsigned int B_internal_rows,
500 unsigned int B_internal_cols,
503 unsigned int C_row_start,
504 unsigned int C_col_start,
505 unsigned int C_row_inc,
506 unsigned int C_col_inc,
507 unsigned int C_row_size,
508 unsigned int C_col_size,
509 unsigned int C_internal_rows,
510 unsigned int C_internal_cols)
521 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
522 vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
523 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
524 vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
525 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
527 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
528 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
530 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
531 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
536 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
537 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
539 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
540 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
541 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
542 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
543 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
544 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
545 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
546 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
547 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
548 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
549 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
550 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
551 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
552 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
553 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
554 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
555 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
556 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
561 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
562 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
567 template<
typename NumericT>
571 unsigned int A_row_start,
572 unsigned int A_col_start,
573 unsigned int A_row_inc,
574 unsigned int A_col_inc,
575 unsigned int A_row_size,
576 unsigned int A_col_size,
577 unsigned int A_internal_rows,
578 unsigned int A_internal_cols,
580 unsigned int B_row_start,
581 unsigned int B_col_start,
582 unsigned int B_row_inc,
583 unsigned int B_col_inc,
584 unsigned int B_row_size,
585 unsigned int B_col_size,
586 unsigned int B_internal_rows,
587 unsigned int B_internal_cols,
590 unsigned int C_row_start,
591 unsigned int C_col_start,
592 unsigned int C_row_inc,
593 unsigned int C_col_inc,
594 unsigned int C_row_size,
595 unsigned int C_col_size,
596 unsigned int C_internal_rows,
597 unsigned int C_internal_cols)
608 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
610 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
612 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
614 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
615 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
617 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
618 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
623 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
624 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
626 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
627 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
628 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
629 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
630 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
631 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
632 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
633 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
634 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
635 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
636 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
637 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
638 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
639 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
640 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
641 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
642 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
643 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
648 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
649 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
654 template<
typename NumericT>
658 unsigned int A_row_start,
659 unsigned int A_col_start,
660 unsigned int A_row_inc,
661 unsigned int A_col_inc,
662 unsigned int A_row_size,
663 unsigned int A_col_size,
664 unsigned int A_internal_rows,
665 unsigned int A_internal_cols,
667 unsigned int B_row_start,
668 unsigned int B_col_start,
669 unsigned int B_row_inc,
670 unsigned int B_col_inc,
671 unsigned int B_row_size,
672 unsigned int B_col_size,
673 unsigned int B_internal_rows,
674 unsigned int B_internal_cols,
677 unsigned int C_row_start,
678 unsigned int C_col_start,
679 unsigned int C_row_inc,
680 unsigned int C_col_inc,
681 unsigned int C_row_size,
682 unsigned int C_col_size,
683 unsigned int C_internal_rows,
684 unsigned int C_internal_cols)
695 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
697 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
698 vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
699 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
701 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
702 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
704 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
705 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
710 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
711 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
713 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
714 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
715 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
716 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
717 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
718 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
719 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
720 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
721 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
722 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
723 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
724 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
725 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
726 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
727 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
728 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
729 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
730 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
735 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
736 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
749 template<
typename NumericT>
753 unsigned int A_row_start,
754 unsigned int A_col_start,
755 unsigned int A_row_inc,
756 unsigned int A_col_inc,
757 unsigned int A_row_size,
758 unsigned int A_col_size,
759 unsigned int A_internal_rows,
760 unsigned int A_internal_cols,
762 unsigned int B_row_start,
763 unsigned int B_col_start,
764 unsigned int B_row_inc,
765 unsigned int B_col_inc,
766 unsigned int B_row_size,
767 unsigned int B_col_size,
768 unsigned int B_internal_rows,
769 unsigned int B_internal_cols,
772 unsigned int C_row_start,
773 unsigned int C_col_start,
774 unsigned int C_row_inc,
775 unsigned int C_col_inc,
776 unsigned int C_row_size,
777 unsigned int C_col_size,
778 unsigned int C_internal_rows,
779 unsigned int C_internal_cols)
790 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
791 vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
792 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
793 vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
794 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
796 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
797 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
799 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
800 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
805 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
806 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
808 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
809 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
810 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
811 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
812 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
813 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
814 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
815 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
816 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
817 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
818 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
819 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
820 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
821 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
822 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
823 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
824 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
825 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
830 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
831 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
836 template<
typename NumericT>
840 unsigned int A_row_start,
841 unsigned int A_col_start,
842 unsigned int A_row_inc,
843 unsigned int A_col_inc,
844 unsigned int A_row_size,
845 unsigned int A_col_size,
846 unsigned int A_internal_rows,
847 unsigned int A_internal_cols,
849 unsigned int B_row_start,
850 unsigned int B_col_start,
851 unsigned int B_row_inc,
852 unsigned int B_col_inc,
853 unsigned int B_row_size,
854 unsigned int B_col_size,
855 unsigned int B_internal_rows,
856 unsigned int B_internal_cols,
859 unsigned int C_row_start,
860 unsigned int C_col_start,
861 unsigned int C_row_inc,
862 unsigned int C_col_inc,
863 unsigned int C_row_size,
864 unsigned int C_col_size,
865 unsigned int C_internal_rows,
866 unsigned int C_internal_cols)
877 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
878 vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
879 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
881 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
883 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
884 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
886 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
887 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
892 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
893 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
895 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
896 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
897 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
898 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
899 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
900 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
901 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
902 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
903 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
904 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
905 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
906 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
907 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
908 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
909 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
910 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
911 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
912 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
917 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
918 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
923 template<
typename NumericT>
927 unsigned int A_row_start,
928 unsigned int A_col_start,
929 unsigned int A_row_inc,
930 unsigned int A_col_inc,
931 unsigned int A_row_size,
932 unsigned int A_col_size,
933 unsigned int A_internal_rows,
934 unsigned int A_internal_cols,
936 unsigned int B_row_start,
937 unsigned int B_col_start,
938 unsigned int B_row_inc,
939 unsigned int B_col_inc,
940 unsigned int B_row_size,
941 unsigned int B_col_size,
942 unsigned int B_internal_rows,
943 unsigned int B_internal_cols,
946 unsigned int C_row_start,
947 unsigned int C_col_start,
948 unsigned int C_row_inc,
949 unsigned int C_col_inc,
950 unsigned int C_row_size,
951 unsigned int C_col_size,
952 unsigned int C_internal_rows,
953 unsigned int C_internal_cols)
964 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
966 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
967 vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
968 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
970 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
971 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
973 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
974 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
979 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
980 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
982 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
983 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
984 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
985 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
986 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
987 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
988 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
989 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
990 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
991 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
992 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
993 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
994 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
995 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
996 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
997 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
998 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
999 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1004 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1005 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
1010 template<
typename NumericT>
1014 unsigned int A_row_start,
1015 unsigned int A_col_start,
1016 unsigned int A_row_inc,
1017 unsigned int A_col_inc,
1018 unsigned int A_row_size,
1019 unsigned int A_col_size,
1020 unsigned int A_internal_rows,
1021 unsigned int A_internal_cols,
1023 unsigned int B_row_start,
1024 unsigned int B_col_start,
1025 unsigned int B_row_inc,
1026 unsigned int B_col_inc,
1027 unsigned int B_row_size,
1028 unsigned int B_col_size,
1029 unsigned int B_internal_rows,
1030 unsigned int B_internal_cols,
1033 unsigned int C_row_start,
1034 unsigned int C_col_start,
1035 unsigned int C_row_inc,
1036 unsigned int C_col_inc,
1037 unsigned int C_row_size,
1038 unsigned int C_col_size,
1039 unsigned int C_internal_rows,
1040 unsigned int C_internal_cols)
1051 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
1053 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
1055 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
1057 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
1058 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
1060 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1061 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1066 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
1067 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
1069 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1070 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
1071 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1072 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1073 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1074 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1075 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1076 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1077 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1078 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1079 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1080 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1081 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1082 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1083 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1084 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1085 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1086 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1091 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
1092 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
1104 template<
typename NumericT>
1108 unsigned int A_row_start,
1109 unsigned int A_col_start,
1110 unsigned int A_row_inc,
1111 unsigned int A_col_inc,
1112 unsigned int A_row_size,
1113 unsigned int A_col_size,
1114 unsigned int A_internal_rows,
1115 unsigned int A_internal_cols,
1117 unsigned int B_row_start,
1118 unsigned int B_col_start,
1119 unsigned int B_row_inc,
1120 unsigned int B_col_inc,
1121 unsigned int B_row_size,
1122 unsigned int B_col_size,
1123 unsigned int B_internal_rows,
1124 unsigned int B_internal_cols,
1127 unsigned int C_row_start,
1128 unsigned int C_col_start,
1129 unsigned int C_row_inc,
1130 unsigned int C_col_inc,
1131 unsigned int C_row_size,
1132 unsigned int C_col_size,
1133 unsigned int C_internal_rows,
1134 unsigned int C_internal_cols)
1145 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
1146 vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
1147 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
1148 vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
1149 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
1151 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
1152 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
1154 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1155 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1160 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
1161 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
1163 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1164 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
1165 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1166 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1167 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1168 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1169 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1170 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1171 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1172 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1173 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1174 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1175 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1176 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1177 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1178 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1179 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1180 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1185 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1186 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
1191 template<
typename NumericT>
1195 unsigned int A_row_start,
1196 unsigned int A_col_start,
1197 unsigned int A_row_inc,
1198 unsigned int A_col_inc,
1199 unsigned int A_row_size,
1200 unsigned int A_col_size,
1201 unsigned int A_internal_rows,
1202 unsigned int A_internal_cols,
1204 unsigned int B_row_start,
1205 unsigned int B_col_start,
1206 unsigned int B_row_inc,
1207 unsigned int B_col_inc,
1208 unsigned int B_row_size,
1209 unsigned int B_col_size,
1210 unsigned int B_internal_rows,
1211 unsigned int B_internal_cols,
1214 unsigned int C_row_start,
1215 unsigned int C_col_start,
1216 unsigned int C_row_inc,
1217 unsigned int C_col_inc,
1218 unsigned int C_row_size,
1219 unsigned int C_col_size,
1220 unsigned int C_internal_rows,
1221 unsigned int C_internal_cols)
1232 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
1233 vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
1234 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
1236 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
1238 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
1239 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
1241 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1242 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1247 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
1248 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
1250 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1251 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
1252 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1253 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1254 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1255 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1256 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1257 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1258 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1259 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1260 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1261 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1262 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1263 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1264 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1265 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1266 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1267 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1272 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
1273 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
1278 template<
typename NumericT>
1282 unsigned int A_row_start,
1283 unsigned int A_col_start,
1284 unsigned int A_row_inc,
1285 unsigned int A_col_inc,
1286 unsigned int A_row_size,
1287 unsigned int A_col_size,
1288 unsigned int A_internal_rows,
1289 unsigned int A_internal_cols,
1291 unsigned int B_row_start,
1292 unsigned int B_col_start,
1293 unsigned int B_row_inc,
1294 unsigned int B_col_inc,
1295 unsigned int B_row_size,
1296 unsigned int B_col_size,
1297 unsigned int B_internal_rows,
1298 unsigned int B_internal_cols,
1301 unsigned int C_row_start,
1302 unsigned int C_col_start,
1303 unsigned int C_row_inc,
1304 unsigned int C_col_inc,
1305 unsigned int C_row_size,
1306 unsigned int C_col_size,
1307 unsigned int C_internal_rows,
1308 unsigned int C_internal_cols)
1319 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
1321 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
1322 vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
1323 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
1325 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
1326 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
1328 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1329 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1334 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
1335 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
1337 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1338 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
1339 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1340 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1341 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1342 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1343 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1344 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1345 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1346 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1347 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1348 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1349 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1350 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1351 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1352 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1353 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1354 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1359 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1360 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
1365 template<
typename NumericT>
1369 unsigned int A_row_start,
1370 unsigned int A_col_start,
1371 unsigned int A_row_inc,
1372 unsigned int A_col_inc,
1373 unsigned int A_row_size,
1374 unsigned int A_col_size,
1375 unsigned int A_internal_rows,
1376 unsigned int A_internal_cols,
1378 unsigned int B_row_start,
1379 unsigned int B_col_start,
1380 unsigned int B_row_inc,
1381 unsigned int B_col_inc,
1382 unsigned int B_row_size,
1383 unsigned int B_col_size,
1384 unsigned int B_internal_rows,
1385 unsigned int B_internal_cols,
1388 unsigned int C_row_start,
1389 unsigned int C_col_start,
1390 unsigned int C_row_inc,
1391 unsigned int C_col_inc,
1392 unsigned int C_row_size,
1393 unsigned int C_col_size,
1394 unsigned int C_internal_rows,
1395 unsigned int C_internal_cols)
1406 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
1408 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
1410 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
1412 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
1413 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
1415 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1416 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1421 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
1422 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
1424 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1425 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
1426 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1427 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1428 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1429 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1430 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1431 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1432 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1433 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1434 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1435 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1436 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1437 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1438 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1439 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1440 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1441 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1446 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
1447 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
1463 template<
typename NumericT>
1467 unsigned int A_row_start,
1468 unsigned int A_col_start,
1469 unsigned int A_row_inc,
1470 unsigned int A_col_inc,
1471 unsigned int A_row_size,
1472 unsigned int A_col_size,
1473 unsigned int A_internal_rows,
1474 unsigned int A_internal_cols,
1476 unsigned int B_row_start,
1477 unsigned int B_col_start,
1478 unsigned int B_row_inc,
1479 unsigned int B_col_inc,
1480 unsigned int B_row_size,
1481 unsigned int B_col_size,
1482 unsigned int B_internal_rows,
1483 unsigned int B_internal_cols,
1486 unsigned int C_row_start,
1487 unsigned int C_col_start,
1488 unsigned int C_row_inc,
1489 unsigned int C_col_inc,
1490 unsigned int C_row_size,
1491 unsigned int C_col_size,
1492 unsigned int C_internal_rows,
1493 unsigned int C_internal_cols)
1504 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
1506 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
1508 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
1510 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
1511 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
1513 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1514 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1519 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
1520 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
1522 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1523 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
1524 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1525 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1526 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1527 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1528 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1529 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1530 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1531 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1532 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1533 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1534 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1535 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1536 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1537 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1538 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1539 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1544 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1545 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
1550 template<
typename NumericT>
1554 unsigned int A_row_start,
1555 unsigned int A_col_start,
1556 unsigned int A_row_inc,
1557 unsigned int A_col_inc,
1558 unsigned int A_row_size,
1559 unsigned int A_col_size,
1560 unsigned int A_internal_rows,
1561 unsigned int A_internal_cols,
1563 unsigned int B_row_start,
1564 unsigned int B_col_start,
1565 unsigned int B_row_inc,
1566 unsigned int B_col_inc,
1567 unsigned int B_row_size,
1568 unsigned int B_col_size,
1569 unsigned int B_internal_rows,
1570 unsigned int B_internal_cols,
1573 unsigned int C_row_start,
1574 unsigned int C_col_start,
1575 unsigned int C_row_inc,
1576 unsigned int C_col_inc,
1577 unsigned int C_row_size,
1578 unsigned int C_col_size,
1579 unsigned int C_internal_rows,
1580 unsigned int C_internal_cols)
1591 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
1593 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
1594 vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
1595 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
1597 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
1598 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
1600 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1601 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1606 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
1607 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
1609 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1610 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
1611 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1612 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1613 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1614 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1615 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1616 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1617 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1618 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1619 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1620 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1621 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1622 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1623 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1624 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1625 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1626 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1631 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
1632 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
1637 template<
typename NumericT>
1641 unsigned int A_row_start,
1642 unsigned int A_col_start,
1643 unsigned int A_row_inc,
1644 unsigned int A_col_inc,
1645 unsigned int A_row_size,
1646 unsigned int A_col_size,
1647 unsigned int A_internal_rows,
1648 unsigned int A_internal_cols,
1650 unsigned int B_row_start,
1651 unsigned int B_col_start,
1652 unsigned int B_row_inc,
1653 unsigned int B_col_inc,
1654 unsigned int B_row_size,
1655 unsigned int B_col_size,
1656 unsigned int B_internal_rows,
1657 unsigned int B_internal_cols,
1660 unsigned int C_row_start,
1661 unsigned int C_col_start,
1662 unsigned int C_row_inc,
1663 unsigned int C_col_inc,
1664 unsigned int C_row_size,
1665 unsigned int C_col_size,
1666 unsigned int C_internal_rows,
1667 unsigned int C_internal_cols)
1678 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
1679 vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
1680 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
1682 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
1684 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
1685 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
1687 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1688 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1693 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
1694 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
1696 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1697 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
1698 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1699 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1700 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1701 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1702 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1703 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1704 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1705 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1706 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1707 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1708 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1709 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1710 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1711 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1712 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1713 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1718 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1719 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
1724 template<
typename NumericT>
1728 unsigned int A_row_start,
1729 unsigned int A_col_start,
1730 unsigned int A_row_inc,
1731 unsigned int A_col_inc,
1732 unsigned int A_row_size,
1733 unsigned int A_col_size,
1734 unsigned int A_internal_rows,
1735 unsigned int A_internal_cols,
1737 unsigned int B_row_start,
1738 unsigned int B_col_start,
1739 unsigned int B_row_inc,
1740 unsigned int B_col_inc,
1741 unsigned int B_row_size,
1742 unsigned int B_col_size,
1743 unsigned int B_internal_rows,
1744 unsigned int B_internal_cols,
1747 unsigned int C_row_start,
1748 unsigned int C_col_start,
1749 unsigned int C_row_inc,
1750 unsigned int C_col_inc,
1751 unsigned int C_row_size,
1752 unsigned int C_col_size,
1753 unsigned int C_internal_rows,
1754 unsigned int C_internal_cols)
1765 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
1766 vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
1767 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
1768 vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
1769 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
1771 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
1772 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
1774 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1775 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1780 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
1781 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
1783 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1784 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
1785 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1786 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1787 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1788 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1789 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1790 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1791 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1792 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1793 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1794 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1795 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1796 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1797 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1798 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1799 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1800 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1805 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
1806 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
1819 template<
typename NumericT>
1823 unsigned int A_row_start,
1824 unsigned int A_col_start,
1825 unsigned int A_row_inc,
1826 unsigned int A_col_inc,
1827 unsigned int A_row_size,
1828 unsigned int A_col_size,
1829 unsigned int A_internal_rows,
1830 unsigned int A_internal_cols,
1832 unsigned int B_row_start,
1833 unsigned int B_col_start,
1834 unsigned int B_row_inc,
1835 unsigned int B_col_inc,
1836 unsigned int B_row_size,
1837 unsigned int B_col_size,
1838 unsigned int B_internal_rows,
1839 unsigned int B_internal_cols,
1842 unsigned int C_row_start,
1843 unsigned int C_col_start,
1844 unsigned int C_row_inc,
1845 unsigned int C_col_inc,
1846 unsigned int C_row_size,
1847 unsigned int C_col_size,
1848 unsigned int C_internal_rows,
1849 unsigned int C_internal_cols)
1860 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
1862 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
1864 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
1866 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
1867 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
1869 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1870 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1875 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
1876 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
1878 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1879 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
1880 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1881 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1882 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1883 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1884 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1885 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1886 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1887 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1888 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1889 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1890 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1891 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1892 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1893 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1894 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1895 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1900 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1901 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
1906 template<
typename NumericT>
1910 unsigned int A_row_start,
1911 unsigned int A_col_start,
1912 unsigned int A_row_inc,
1913 unsigned int A_col_inc,
1914 unsigned int A_row_size,
1915 unsigned int A_col_size,
1916 unsigned int A_internal_rows,
1917 unsigned int A_internal_cols,
1919 unsigned int B_row_start,
1920 unsigned int B_col_start,
1921 unsigned int B_row_inc,
1922 unsigned int B_col_inc,
1923 unsigned int B_row_size,
1924 unsigned int B_col_size,
1925 unsigned int B_internal_rows,
1926 unsigned int B_internal_cols,
1929 unsigned int C_row_start,
1930 unsigned int C_col_start,
1931 unsigned int C_row_inc,
1932 unsigned int C_col_inc,
1933 unsigned int C_row_size,
1934 unsigned int C_col_size,
1935 unsigned int C_internal_rows,
1936 unsigned int C_internal_cols)
1947 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
1949 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
1950 vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
1951 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
1953 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
1954 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
1956 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1957 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1962 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
1963 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
1965 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1966 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
1967 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1968 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1969 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1970 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1971 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1972 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1973 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1974 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1975 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1976 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1977 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1978 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1979 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1980 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1981 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1982 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1987 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
1988 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
1993 template<
typename NumericT>
1997 unsigned int A_row_start,
1998 unsigned int A_col_start,
1999 unsigned int A_row_inc,
2000 unsigned int A_col_inc,
2001 unsigned int A_row_size,
2002 unsigned int A_col_size,
2003 unsigned int A_internal_rows,
2004 unsigned int A_internal_cols,
2006 unsigned int B_row_start,
2007 unsigned int B_col_start,
2008 unsigned int B_row_inc,
2009 unsigned int B_col_inc,
2010 unsigned int B_row_size,
2011 unsigned int B_col_size,
2012 unsigned int B_internal_rows,
2013 unsigned int B_internal_cols,
2016 unsigned int C_row_start,
2017 unsigned int C_col_start,
2018 unsigned int C_row_inc,
2019 unsigned int C_col_inc,
2020 unsigned int C_row_size,
2021 unsigned int C_col_size,
2022 unsigned int C_internal_rows,
2023 unsigned int C_internal_cols)
2034 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
2035 vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
2036 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
2038 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
2040 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2041 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
2043 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2044 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
2049 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
2050 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
2052 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
2053 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
2054 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2055 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2056 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2057 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2058 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2059 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2060 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2061 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2062 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2063 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2064 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2065 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2066 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2067 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2068 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2069 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2074 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
2075 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
2080 template<
typename NumericT>
2084 unsigned int A_row_start,
2085 unsigned int A_col_start,
2086 unsigned int A_row_inc,
2087 unsigned int A_col_inc,
2088 unsigned int A_row_size,
2089 unsigned int A_col_size,
2090 unsigned int A_internal_rows,
2091 unsigned int A_internal_cols,
2093 unsigned int B_row_start,
2094 unsigned int B_col_start,
2095 unsigned int B_row_inc,
2096 unsigned int B_col_inc,
2097 unsigned int B_row_size,
2098 unsigned int B_col_size,
2099 unsigned int B_internal_rows,
2100 unsigned int B_internal_cols,
2103 unsigned int C_row_start,
2104 unsigned int C_col_start,
2105 unsigned int C_row_inc,
2106 unsigned int C_col_inc,
2107 unsigned int C_row_size,
2108 unsigned int C_col_size,
2109 unsigned int C_internal_rows,
2110 unsigned int C_internal_cols)
2121 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
2122 vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
2123 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
2124 vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
2125 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
2127 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2128 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
2130 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2131 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
2136 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
2137 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
2139 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
2140 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
2141 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2142 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2143 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2144 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2145 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2146 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2147 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2148 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2149 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2150 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2151 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2152 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2153 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2154 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2155 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2156 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2161 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
2162 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
2178 template<
typename NumericT>
2182 unsigned int A_row_start,
2183 unsigned int A_col_start,
2184 unsigned int A_row_inc,
2185 unsigned int A_col_inc,
2186 unsigned int A_row_size,
2187 unsigned int A_col_size,
2188 unsigned int A_internal_rows,
2189 unsigned int A_internal_cols,
2191 unsigned int B_row_start,
2192 unsigned int B_col_start,
2193 unsigned int B_row_inc,
2194 unsigned int B_col_inc,
2195 unsigned int B_row_size,
2196 unsigned int B_col_size,
2197 unsigned int B_internal_rows,
2198 unsigned int B_internal_cols,
2201 unsigned int C_row_start,
2202 unsigned int C_col_start,
2203 unsigned int C_row_inc,
2204 unsigned int C_col_inc,
2205 unsigned int C_row_size,
2206 unsigned int C_col_size,
2207 unsigned int C_internal_rows,
2208 unsigned int C_internal_cols)
2219 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
2221 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
2222 vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
2223 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
2225 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2226 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
2228 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2229 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
2234 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
2235 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
2237 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
2238 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
2239 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2240 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2241 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2242 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2243 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2244 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2245 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2246 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2247 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2248 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2249 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2250 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2251 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2252 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2253 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2254 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2259 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
2260 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
2265 template<
typename NumericT>
2269 unsigned int A_row_start,
2270 unsigned int A_col_start,
2271 unsigned int A_row_inc,
2272 unsigned int A_col_inc,
2273 unsigned int A_row_size,
2274 unsigned int A_col_size,
2275 unsigned int A_internal_rows,
2276 unsigned int A_internal_cols,
2278 unsigned int B_row_start,
2279 unsigned int B_col_start,
2280 unsigned int B_row_inc,
2281 unsigned int B_col_inc,
2282 unsigned int B_row_size,
2283 unsigned int B_col_size,
2284 unsigned int B_internal_rows,
2285 unsigned int B_internal_cols,
2288 unsigned int C_row_start,
2289 unsigned int C_col_start,
2290 unsigned int C_row_inc,
2291 unsigned int C_col_inc,
2292 unsigned int C_row_size,
2293 unsigned int C_col_size,
2294 unsigned int C_internal_rows,
2295 unsigned int C_internal_cols)
2306 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
2308 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
2310 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
2312 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2313 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
2315 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2316 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
2321 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
2322 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
2324 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
2325 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
2326 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2327 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2328 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2329 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2330 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2331 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2332 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2333 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2334 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2335 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2336 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2337 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2338 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2339 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2340 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2341 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2346 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
2347 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
2352 template<
typename NumericT>
2356 unsigned int A_row_start,
2357 unsigned int A_col_start,
2358 unsigned int A_row_inc,
2359 unsigned int A_col_inc,
2360 unsigned int A_row_size,
2361 unsigned int A_col_size,
2362 unsigned int A_internal_rows,
2363 unsigned int A_internal_cols,
2365 unsigned int B_row_start,
2366 unsigned int B_col_start,
2367 unsigned int B_row_inc,
2368 unsigned int B_col_inc,
2369 unsigned int B_row_size,
2370 unsigned int B_col_size,
2371 unsigned int B_internal_rows,
2372 unsigned int B_internal_cols,
2375 unsigned int C_row_start,
2376 unsigned int C_col_start,
2377 unsigned int C_row_inc,
2378 unsigned int C_col_inc,
2379 unsigned int C_row_size,
2380 unsigned int C_col_size,
2381 unsigned int C_internal_rows,
2382 unsigned int C_internal_cols)
2393 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
2394 vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
2395 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
2396 vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
2397 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
2399 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2400 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
2402 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2403 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
2408 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
2409 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
2411 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
2412 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
2413 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2414 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2415 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2416 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2417 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2418 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2419 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2420 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2421 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2422 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2423 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2424 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2425 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2426 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2427 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2428 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2433 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
2434 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
2439 template<
typename NumericT>
2443 unsigned int A_row_start,
2444 unsigned int A_col_start,
2445 unsigned int A_row_inc,
2446 unsigned int A_col_inc,
2447 unsigned int A_row_size,
2448 unsigned int A_col_size,
2449 unsigned int A_internal_rows,
2450 unsigned int A_internal_cols,
2452 unsigned int B_row_start,
2453 unsigned int B_col_start,
2454 unsigned int B_row_inc,
2455 unsigned int B_col_inc,
2456 unsigned int B_row_size,
2457 unsigned int B_col_size,
2458 unsigned int B_internal_rows,
2459 unsigned int B_internal_cols,
2462 unsigned int C_row_start,
2463 unsigned int C_col_start,
2464 unsigned int C_row_inc,
2465 unsigned int C_col_inc,
2466 unsigned int C_row_size,
2467 unsigned int C_col_size,
2468 unsigned int C_internal_rows,
2469 unsigned int C_internal_cols)
2480 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
2481 vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
2482 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
2484 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
2486 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2487 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
2489 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2490 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
2495 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
2496 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
2498 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
2499 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
2500 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2501 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2502 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2503 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2504 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2505 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2506 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2507 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2508 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2509 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2510 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2511 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2512 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2513 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2514 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2515 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2520 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
2521 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
2535 template<
typename NumericT>
2539 unsigned int A_row_start,
2540 unsigned int A_col_start,
2541 unsigned int A_row_inc,
2542 unsigned int A_col_inc,
2543 unsigned int A_row_size,
2544 unsigned int A_col_size,
2545 unsigned int A_internal_rows,
2546 unsigned int A_internal_cols,
2548 unsigned int B_row_start,
2549 unsigned int B_col_start,
2550 unsigned int B_row_inc,
2551 unsigned int B_col_inc,
2552 unsigned int B_row_size,
2553 unsigned int B_col_size,
2554 unsigned int B_internal_rows,
2555 unsigned int B_internal_cols,
2558 unsigned int C_row_start,
2559 unsigned int C_col_start,
2560 unsigned int C_row_inc,
2561 unsigned int C_col_inc,
2562 unsigned int C_row_size,
2563 unsigned int C_col_size,
2564 unsigned int C_internal_rows,
2565 unsigned int C_internal_cols)
2576 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
2578 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
2579 vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
2580 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
2582 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2583 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
2585 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2586 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
2591 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
2592 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
2594 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
2595 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
2596 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2597 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2598 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2599 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2600 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2601 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2602 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2603 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2604 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2605 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2606 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2607 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2608 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2609 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2610 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2611 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2616 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
2617 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
2622 template<
typename NumericT>
2626 unsigned int A_row_start,
2627 unsigned int A_col_start,
2628 unsigned int A_row_inc,
2629 unsigned int A_col_inc,
2630 unsigned int A_row_size,
2631 unsigned int A_col_size,
2632 unsigned int A_internal_rows,
2633 unsigned int A_internal_cols,
2635 unsigned int B_row_start,
2636 unsigned int B_col_start,
2637 unsigned int B_row_inc,
2638 unsigned int B_col_inc,
2639 unsigned int B_row_size,
2640 unsigned int B_col_size,
2641 unsigned int B_internal_rows,
2642 unsigned int B_internal_cols,
2645 unsigned int C_row_start,
2646 unsigned int C_col_start,
2647 unsigned int C_row_inc,
2648 unsigned int C_col_inc,
2649 unsigned int C_row_size,
2650 unsigned int C_col_size,
2651 unsigned int C_internal_rows,
2652 unsigned int C_internal_cols)
2663 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
2665 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
2667 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
2669 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2670 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
2672 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2673 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
2678 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
2679 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
2681 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
2682 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
2683 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2684 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2685 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2686 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2687 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2688 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2689 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2690 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2691 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2692 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2693 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2694 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2695 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2696 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2697 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2698 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2703 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
2704 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
2709 template<
typename NumericT>
2713 unsigned int A_row_start,
2714 unsigned int A_col_start,
2715 unsigned int A_row_inc,
2716 unsigned int A_col_inc,
2717 unsigned int A_row_size,
2718 unsigned int A_col_size,
2719 unsigned int A_internal_rows,
2720 unsigned int A_internal_cols,
2722 unsigned int B_row_start,
2723 unsigned int B_col_start,
2724 unsigned int B_row_inc,
2725 unsigned int B_col_inc,
2726 unsigned int B_row_size,
2727 unsigned int B_col_size,
2728 unsigned int B_internal_rows,
2729 unsigned int B_internal_cols,
2732 unsigned int C_row_start,
2733 unsigned int C_col_start,
2734 unsigned int C_row_inc,
2735 unsigned int C_col_inc,
2736 unsigned int C_row_size,
2737 unsigned int C_col_size,
2738 unsigned int C_internal_rows,
2739 unsigned int C_internal_cols)
2750 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
2751 vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
2752 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
2753 vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
2754 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
2756 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2757 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
2759 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2760 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
2765 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
2766 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
2768 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
2769 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
2770 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2771 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2772 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2773 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2774 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2775 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2776 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2777 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2778 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2779 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2780 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2781 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2782 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2783 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2784 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2785 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2790 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
2791 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
2796 template<
typename NumericT>
2800 unsigned int A_row_start,
2801 unsigned int A_col_start,
2802 unsigned int A_row_inc,
2803 unsigned int A_col_inc,
2804 unsigned int A_row_size,
2805 unsigned int A_col_size,
2806 unsigned int A_internal_rows,
2807 unsigned int A_internal_cols,
2809 unsigned int B_row_start,
2810 unsigned int B_col_start,
2811 unsigned int B_row_inc,
2812 unsigned int B_col_inc,
2813 unsigned int B_row_size,
2814 unsigned int B_col_size,
2815 unsigned int B_internal_rows,
2816 unsigned int B_internal_cols,
2819 unsigned int C_row_start,
2820 unsigned int C_col_start,
2821 unsigned int C_row_inc,
2822 unsigned int C_col_inc,
2823 unsigned int C_row_size,
2824 unsigned int C_col_size,
2825 unsigned int C_internal_rows,
2826 unsigned int C_internal_cols)
2837 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
2838 vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
2839 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
2841 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
2843 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2844 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
2846 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2847 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
2852 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
2853 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
2855 NumericT * bufAptr = bufA + row_thread_id_times_block_size;
2856 NumericT * bufBptr = bufB + col_thread_id_times_block_size;
2857 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2858 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2859 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2860 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2861 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2862 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2863 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2864 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2865 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2866 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2867 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2868 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2869 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2870 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2871 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2872 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2877 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
2878 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
__global__ void matrix_matrix_col_col_row_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_col_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_col_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_row_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_row_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_row_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_col_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_col_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_col_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_col_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_row_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
This file provides the forward declarations for the main types used within ViennaCL.
__global__ void matrix_matrix_col_row_col_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_row_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_row_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_col_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_col_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
__global__ void matrix_matrix_col_col_row_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_row_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_col_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_col_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_row_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_row_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_col_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_row_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_col_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_row_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_col_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_row_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_col_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_col_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_row_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_row_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)