ViennaCL - The Vienna Computing Library  1.7.0
Free open-source GPU-accelerated linear algebra and solver library.
matrix_operations_prod.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_PROD_HPP_
2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_PROD_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2015, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
27 #include "viennacl/forwards.h"
28 
29 namespace viennacl
30 {
31 namespace linalg
32 {
33 namespace cuda
34 {
35 
36 // matrix-matrix multiplication C = A * B
37 // matrix layouts: C...col_major, A...col_major, B...col_major
// Tiled GEMM kernel body: computes C = alpha * A * B + beta * C with C, A, B
// all column-major.  Each thread block produces one 16x16 tile of C; each
// thread accumulates exactly one element of that tile.  The *_start/*_inc
// parameters select a sub-matrix (range/slice) inside a padded allocation of
// *_internal_rows x *_internal_cols elements.
// NOTE(review): block_size is hard-coded to 16 below while the final store
// indexes via blockDim, so the launch must use a 16x16 thread block -- TODO
// confirm against the launch site.
38 template<typename NumericT>
// NOTE(review): the '__global__ void <kernel_name>(' signature line is not
// visible in this extracted view; the parameter list below belongs to it.
40  NumericT alpha,
41  const NumericT * A,
42  unsigned int A_row_start,
43  unsigned int A_col_start,
44  unsigned int A_row_inc,
45  unsigned int A_col_inc,
46  unsigned int A_row_size,
47  unsigned int A_col_size,
48  unsigned int A_internal_rows,
49  unsigned int A_internal_cols,
50  const NumericT * B,
51  unsigned int B_row_start,
52  unsigned int B_col_start,
53  unsigned int B_row_inc,
54  unsigned int B_col_inc,
55  unsigned int B_row_size,
56  unsigned int B_col_size,
57  unsigned int B_internal_rows,
58  unsigned int B_internal_cols,
59  NumericT beta,
60  NumericT * C,
61  unsigned int C_row_start,
62  unsigned int C_col_start,
63  unsigned int C_row_inc,
64  unsigned int C_col_inc,
65  unsigned int C_row_size,
66  unsigned int C_col_size,
67  unsigned int C_internal_rows,
68  unsigned int C_internal_cols)
69 {
70 
// 272 = 16 * 17: each 16-element tile row is stored with a one-element pad
// (stride block_size + 1, see below), presumably to avoid shared-memory
// bank conflicts on the transposed tile accesses -- TODO confirm.
71  __shared__ NumericT bufA[272];
72  __shared__ NumericT bufB[272];
73 
74  vcl_size_t block_size = 16;//get_local_size(0);
75  vcl_size_t row_block_id = blockIdx.x;
76  vcl_size_t col_block_id = blockIdx.y;
77  vcl_size_t row_thread_id = threadIdx.x;
78  vcl_size_t col_thread_id = threadIdx.y;
// Linear offset of this block's first tile of A (column-major addressing:
// row index + column index * internal_rows).
79  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
// Advance A by 16 columns per tile-loop iteration.
80  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
81  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
// Advance B by 16 rows per tile-loop iteration.
82  vcl_size_t bStep = block_size * B_row_inc;
// Number of 16-wide tiles covering the reduction dimension (columns of A).
83  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
84  NumericT Csub = 0;
// Per-thread offset of its element inside the current 16x16 tile.
85  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
86  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
87 
// Padded row strides into the 16x17 shared-memory tiles.
88  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
89  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
90  for (vcl_size_t block = 0;
91  block < block_num;
92  ++block)
93  {
// Stage one tile of A and one tile of B into shared memory; out-of-range
// elements are loaded as 0 so the unrolled inner product below needs no
// bounds checks.  Note bufB is written transposed relative to bufA so
// both buffers are later read with unit stride.
94  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
95  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// Tiles must be fully written before any thread reads them.
96  __syncthreads();
97  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
98  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
// Manually unrolled dot product over the 16-element tile row.
99  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
100  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
101  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
102  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
103  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
104  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
105  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
106  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
107  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
108  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
109  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
110  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
111  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
112  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
113  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
114  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Barrier before the shared tiles are overwritten in the next iteration.
115  __syncthreads();
116  aBegin += aStep;
117  bBegin += bStep;
118  }
// Guarded column-major store of one element of C.  The beta == 0 path never
// reads the old C value, so C may be uninitialized in that case.
119  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
120  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
121 }
122 
123 // matrix-matrix multiplication C = A * B^T
124 // matrix layouts: C...col_major, A...col_major, B...col_major
// Tiled GEMM kernel body: computes C = alpha * A * B^T + beta * C with C, A,
// B all column-major.  One 16x16 tile of C per thread block, one element per
// thread.  Reduction runs over the columns of A (= columns of B, since B is
// used transposed): block_num is derived from A_col_size and bStep walks B by
// 16 columns per iteration.
// NOTE(review): requires a 16x16 thread block (block_size hard-coded, final
// store uses blockDim) -- TODO confirm against the launch site.
125 template<typename NumericT>
// NOTE(review): the '__global__ void <kernel_name>(' signature line is not
// visible in this extracted view; the parameter list below belongs to it.
127  NumericT alpha,
128  const NumericT * A,
129  unsigned int A_row_start,
130  unsigned int A_col_start,
131  unsigned int A_row_inc,
132  unsigned int A_col_inc,
133  unsigned int A_row_size,
134  unsigned int A_col_size,
135  unsigned int A_internal_rows,
136  unsigned int A_internal_cols,
137  const NumericT * B,
138  unsigned int B_row_start,
139  unsigned int B_col_start,
140  unsigned int B_row_inc,
141  unsigned int B_col_inc,
142  unsigned int B_row_size,
143  unsigned int B_col_size,
144  unsigned int B_internal_rows,
145  unsigned int B_internal_cols,
146  NumericT beta,
147  NumericT * C,
148  unsigned int C_row_start,
149  unsigned int C_col_start,
150  unsigned int C_row_inc,
151  unsigned int C_col_inc,
152  unsigned int C_row_size,
153  unsigned int C_col_size,
154  unsigned int C_internal_rows,
155  unsigned int C_internal_cols)
156 {
157 
// 16x17 tiles (272 = 16 * 17); the +1 pad matches the (block_size + 1)
// strides below, presumably to avoid shared-memory bank conflicts.
158  __shared__ NumericT bufA[272];
159  __shared__ NumericT bufB[272];
160 
161  vcl_size_t block_size = 16;//get_local_size(0);
162  vcl_size_t row_block_id = blockIdx.x;
163  vcl_size_t col_block_id = blockIdx.y;
164  vcl_size_t row_thread_id = threadIdx.x;
165  vcl_size_t col_thread_id = threadIdx.y;
// Column-major offsets of this block's first tiles; col_block_id selects a
// 16-row band of B (a 16-column band of B^T).
166  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
167  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
168  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
169  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
// Tiles covering the reduction dimension (columns of A).
170  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
171  NumericT Csub = 0;
172  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
173  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
174 
// Padded row strides into the 16x17 shared-memory tiles.
175  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
176  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
177  for (vcl_size_t block = 0;
178  block < block_num;
179  ++block)
180  {
// Stage the tiles; out-of-range elements become 0 so the unrolled loop
// below needs no bounds checks.
181  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
182  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// Tiles must be fully written before any thread reads them.
183  __syncthreads();
184  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
185  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
// Manually unrolled dot product over the 16-element tile row.
186  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
187  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
188  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
189  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
190  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
191  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
192  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
193  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
194  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
195  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
196  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
197  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
198  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
199  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
200  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
201  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Barrier before the shared tiles are overwritten in the next iteration.
202  __syncthreads();
203  aBegin += aStep;
204  bBegin += bStep;
205  }
// Guarded column-major store; C columns are bounded by B_row_size because B
// enters transposed.  The beta == 0 path never reads the old C value.
206  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
207  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
208 }
209 
210 // matrix-matrix multiplication C = A^T * B
211 // matrix layouts: C...col_major, A...col_major, B...col_major
// Tiled GEMM kernel body: computes C = alpha * A^T * B + beta * C with C, A,
// B all column-major.  One 16x16 tile of C per thread block, one element per
// thread.  Reduction runs over the rows of A (A enters transposed):
// block_num is derived from A_row_size and aStep walks A by 16 rows.
// NOTE(review): requires a 16x16 thread block (block_size hard-coded, final
// store uses blockDim) -- TODO confirm against the launch site.
212 template<typename NumericT>
// NOTE(review): the '__global__ void <kernel_name>(' signature line is not
// visible in this extracted view; the parameter list below belongs to it.
214  NumericT alpha,
215  const NumericT * A,
216  unsigned int A_row_start,
217  unsigned int A_col_start,
218  unsigned int A_row_inc,
219  unsigned int A_col_inc,
220  unsigned int A_row_size,
221  unsigned int A_col_size,
222  unsigned int A_internal_rows,
223  unsigned int A_internal_cols,
224  const NumericT * B,
225  unsigned int B_row_start,
226  unsigned int B_col_start,
227  unsigned int B_row_inc,
228  unsigned int B_col_inc,
229  unsigned int B_row_size,
230  unsigned int B_col_size,
231  unsigned int B_internal_rows,
232  unsigned int B_internal_cols,
233  NumericT beta,
234  NumericT * C,
235  unsigned int C_row_start,
236  unsigned int C_col_start,
237  unsigned int C_row_inc,
238  unsigned int C_col_inc,
239  unsigned int C_row_size,
240  unsigned int C_col_size,
241  unsigned int C_internal_rows,
242  unsigned int C_internal_cols)
243 {
244 
// 16x17 tiles (272 = 16 * 17); the +1 pad matches the (block_size + 1)
// strides below, presumably to avoid shared-memory bank conflicts.
245  __shared__ NumericT bufA[272];
246  __shared__ NumericT bufB[272];
247 
248  vcl_size_t block_size = 16;//get_local_size(0);
249  vcl_size_t row_block_id = blockIdx.x;
250  vcl_size_t col_block_id = blockIdx.y;
251  vcl_size_t row_thread_id = threadIdx.x;
252  vcl_size_t col_thread_id = threadIdx.y;
// row_block_id selects a 16-column band of A (a 16-row band of A^T).
253  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
254  vcl_size_t aStep = block_size * A_row_inc;
255  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
256  vcl_size_t bStep = block_size * B_row_inc;
// Tiles covering the reduction dimension (rows of A).
257  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
258  NumericT Csub = 0;
259  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
260  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
261 
// Padded row strides into the 16x17 shared-memory tiles.
262  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
263  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
264  for (vcl_size_t block = 0;
265  block < block_num;
266  ++block)
267  {
// Stage the tiles; out-of-range elements become 0 so the unrolled loop
// below needs no bounds checks.
268  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
269  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// Tiles must be fully written before any thread reads them.
270  __syncthreads();
271  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
272  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
// Manually unrolled dot product over the 16-element tile row.
273  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
274  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
275  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
276  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
277  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
278  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
279  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
280  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
281  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
282  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
283  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
284  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
285  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
286  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
287  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
288  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Barrier before the shared tiles are overwritten in the next iteration.
289  __syncthreads();
290  aBegin += aStep;
291  bBegin += bStep;
292  }
// Guarded column-major store; C rows are bounded by A_col_size because A
// enters transposed.  The beta == 0 path never reads the old C value.
293  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
294  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
295 }
296 
297 // matrix-matrix multiplication C = A^T * B^T
298 // matrix layouts: C...col_major, A...col_major, B...col_major
// Tiled GEMM kernel body: computes C = alpha * A^T * B^T + beta * C with C,
// A, B all column-major.  One 16x16 tile of C per thread block, one element
// per thread.  Both operands enter transposed: the reduction runs over the
// rows of A (block_num from A_row_size) and B is walked by 16 columns per
// iteration (bStep).
// NOTE(review): requires a 16x16 thread block (block_size hard-coded, final
// store uses blockDim) -- TODO confirm against the launch site.
299 template<typename NumericT>
// NOTE(review): the '__global__ void <kernel_name>(' signature line is not
// visible in this extracted view; the parameter list below belongs to it.
301  NumericT alpha,
302  const NumericT * A,
303  unsigned int A_row_start,
304  unsigned int A_col_start,
305  unsigned int A_row_inc,
306  unsigned int A_col_inc,
307  unsigned int A_row_size,
308  unsigned int A_col_size,
309  unsigned int A_internal_rows,
310  unsigned int A_internal_cols,
311  const NumericT * B,
312  unsigned int B_row_start,
313  unsigned int B_col_start,
314  unsigned int B_row_inc,
315  unsigned int B_col_inc,
316  unsigned int B_row_size,
317  unsigned int B_col_size,
318  unsigned int B_internal_rows,
319  unsigned int B_internal_cols,
320  NumericT beta,
321  NumericT * C,
322  unsigned int C_row_start,
323  unsigned int C_col_start,
324  unsigned int C_row_inc,
325  unsigned int C_col_inc,
326  unsigned int C_row_size,
327  unsigned int C_col_size,
328  unsigned int C_internal_rows,
329  unsigned int C_internal_cols)
330 {
331 
// 16x17 tiles (272 = 16 * 17); the +1 pad matches the (block_size + 1)
// strides below, presumably to avoid shared-memory bank conflicts.
332  __shared__ NumericT bufA[272];
333  __shared__ NumericT bufB[272];
334 
335  vcl_size_t block_size = 16;//get_local_size(0);
336  vcl_size_t row_block_id = blockIdx.x;
337  vcl_size_t col_block_id = blockIdx.y;
338  vcl_size_t row_thread_id = threadIdx.x;
339  vcl_size_t col_thread_id = threadIdx.y;
// row_block_id selects a 16-column band of A; col_block_id selects a
// 16-row band of B (both operands are used transposed).
340  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
341  vcl_size_t aStep = block_size * A_row_inc;
342  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
343  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
// Tiles covering the reduction dimension (rows of A).
344  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
345  NumericT Csub = 0;
346  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
347  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
348 
// Padded row strides into the 16x17 shared-memory tiles.
349  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
350  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
351  for (vcl_size_t block = 0;
352  block < block_num;
353  ++block)
354  {
// Stage the tiles; out-of-range elements become 0 so the unrolled loop
// below needs no bounds checks.
355  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
356  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// Tiles must be fully written before any thread reads them.
357  __syncthreads();
358  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
359  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
// Manually unrolled dot product over the 16-element tile row.
360  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
361  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
362  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
363  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
364  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
365  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
366  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
367  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
368  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
369  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
370  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
371  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
372  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
373  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
374  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
375  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Barrier before the shared tiles are overwritten in the next iteration.
376  __syncthreads();
377  aBegin += aStep;
378  bBegin += bStep;
379  }
// Guarded column-major store; bounds use A_col_size x B_row_size because
// both operands enter transposed.  beta == 0 never reads the old C value.
380  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
381  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
382 }
383 
384 
385 
387 
388 
389 
390 
391 // matrix-matrix multiplication C = A * B
392 // matrix layouts: C...row_major, A...col_major, B...col_major
// Tiled GEMM kernel body: computes C = alpha * A * B + beta * C where C is
// ROW-major and A, B are column-major.  Identical tiling/accumulation to the
// all-column-major A*B kernel above; only the final store differs, using
// row-major addressing (row * C_internal_cols + col).
// NOTE(review): requires a 16x16 thread block (block_size hard-coded, final
// store uses blockDim) -- TODO confirm against the launch site.
393 template<typename NumericT>
// NOTE(review): the '__global__ void <kernel_name>(' signature line is not
// visible in this extracted view; the parameter list below belongs to it.
395  NumericT alpha,
396  const NumericT * A,
397  unsigned int A_row_start,
398  unsigned int A_col_start,
399  unsigned int A_row_inc,
400  unsigned int A_col_inc,
401  unsigned int A_row_size,
402  unsigned int A_col_size,
403  unsigned int A_internal_rows,
404  unsigned int A_internal_cols,
405  const NumericT * B,
406  unsigned int B_row_start,
407  unsigned int B_col_start,
408  unsigned int B_row_inc,
409  unsigned int B_col_inc,
410  unsigned int B_row_size,
411  unsigned int B_col_size,
412  unsigned int B_internal_rows,
413  unsigned int B_internal_cols,
414  NumericT beta,
415  NumericT * C,
416  unsigned int C_row_start,
417  unsigned int C_col_start,
418  unsigned int C_row_inc,
419  unsigned int C_col_inc,
420  unsigned int C_row_size,
421  unsigned int C_col_size,
422  unsigned int C_internal_rows,
423  unsigned int C_internal_cols)
424 {
425 
// 16x17 tiles (272 = 16 * 17); the +1 pad matches the (block_size + 1)
// strides below, presumably to avoid shared-memory bank conflicts.
426  __shared__ NumericT bufA[272];
427  __shared__ NumericT bufB[272];
428 
429  vcl_size_t block_size = 16;//get_local_size(0);
430  vcl_size_t row_block_id = blockIdx.x;
431  vcl_size_t col_block_id = blockIdx.y;
432  vcl_size_t row_thread_id = threadIdx.x;
433  vcl_size_t col_thread_id = threadIdx.y;
// Column-major offsets into A and B (operands untransposed).
434  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
435  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
436  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
437  vcl_size_t bStep = block_size * B_row_inc;
// Tiles covering the reduction dimension (columns of A).
438  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
439  NumericT Csub = 0;
440  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
441  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
442 
// Padded row strides into the 16x17 shared-memory tiles.
443  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
444  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
445  for (vcl_size_t block = 0;
446  block < block_num;
447  ++block)
448  {
// Stage the tiles; out-of-range elements become 0 so the unrolled loop
// below needs no bounds checks.
449  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
450  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// Tiles must be fully written before any thread reads them.
451  __syncthreads();
452  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
453  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
// Manually unrolled dot product over the 16-element tile row.
454  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
455  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
456  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
457  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
458  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
459  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
460  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
461  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
462  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
463  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
464  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
465  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
466  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
467  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
468  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
469  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Barrier before the shared tiles are overwritten in the next iteration.
470  __syncthreads();
471  aBegin += aStep;
472  bBegin += bStep;
473  }
// Guarded ROW-major store of one element of C.  The beta == 0 path never
// reads the old C value.
474  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
475  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
476 }
477 
478 // matrix-matrix multiplication C = A * B^T
479 // matrix layouts: C...row_major, A...col_major, B...col_major
// Tiled GEMM kernel body: computes C = alpha * A * B^T + beta * C where C is
// ROW-major and A, B are column-major.  Same tiling/accumulation as the
// column-major-C A*B^T kernel above; only the final store differs, using
// row-major addressing (row * C_internal_cols + col).
// NOTE(review): requires a 16x16 thread block (block_size hard-coded, final
// store uses blockDim) -- TODO confirm against the launch site.
480 template<typename NumericT>
// NOTE(review): the '__global__ void <kernel_name>(' signature line is not
// visible in this extracted view; the parameter list below belongs to it.
482  NumericT alpha,
483  const NumericT * A,
484  unsigned int A_row_start,
485  unsigned int A_col_start,
486  unsigned int A_row_inc,
487  unsigned int A_col_inc,
488  unsigned int A_row_size,
489  unsigned int A_col_size,
490  unsigned int A_internal_rows,
491  unsigned int A_internal_cols,
492  const NumericT * B,
493  unsigned int B_row_start,
494  unsigned int B_col_start,
495  unsigned int B_row_inc,
496  unsigned int B_col_inc,
497  unsigned int B_row_size,
498  unsigned int B_col_size,
499  unsigned int B_internal_rows,
500  unsigned int B_internal_cols,
501  NumericT beta,
502  NumericT * C,
503  unsigned int C_row_start,
504  unsigned int C_col_start,
505  unsigned int C_row_inc,
506  unsigned int C_col_inc,
507  unsigned int C_row_size,
508  unsigned int C_col_size,
509  unsigned int C_internal_rows,
510  unsigned int C_internal_cols)
511 {
512 
// 16x17 tiles (272 = 16 * 17); the +1 pad matches the (block_size + 1)
// strides below, presumably to avoid shared-memory bank conflicts.
513  __shared__ NumericT bufA[272];
514  __shared__ NumericT bufB[272];
515 
516  vcl_size_t block_size = 16;//get_local_size(0);
517  vcl_size_t row_block_id = blockIdx.x;
518  vcl_size_t col_block_id = blockIdx.y;
519  vcl_size_t row_thread_id = threadIdx.x;
520  vcl_size_t col_thread_id = threadIdx.y;
// col_block_id selects a 16-row band of B (a 16-column band of B^T);
// bStep advances B by 16 columns per iteration.
521  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
522  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
523  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
524  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
// Tiles covering the reduction dimension (columns of A).
525  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
526  NumericT Csub = 0;
527  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
528  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
529 
// Padded row strides into the 16x17 shared-memory tiles.
530  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
531  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
532  for (vcl_size_t block = 0;
533  block < block_num;
534  ++block)
535  {
// Stage the tiles; out-of-range elements become 0 so the unrolled loop
// below needs no bounds checks.
536  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
537  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// Tiles must be fully written before any thread reads them.
538  __syncthreads();
539  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
540  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
// Manually unrolled dot product over the 16-element tile row.
541  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
542  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
543  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
544  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
545  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
546  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
547  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
548  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
549  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
550  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
551  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
552  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
553  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
554  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
555  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
556  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Barrier before the shared tiles are overwritten in the next iteration.
557  __syncthreads();
558  aBegin += aStep;
559  bBegin += bStep;
560  }
// Guarded ROW-major store; C columns bounded by B_row_size because B enters
// transposed.  The beta == 0 path never reads the old C value.
561  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
562  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
563 }
564 
565 // matrix-matrix multiplication C = A^T * B
566 // matrix layouts: C...row_major, A...col_major, B...col_major
// Tiled GEMM kernel body: computes C = alpha * A^T * B + beta * C where C is
// ROW-major and A, B are column-major.  Same tiling/accumulation as the
// column-major-C A^T*B kernel above; only the final store differs, using
// row-major addressing (row * C_internal_cols + col).
// NOTE(review): requires a 16x16 thread block (block_size hard-coded, final
// store uses blockDim) -- TODO confirm against the launch site.
567 template<typename NumericT>
// NOTE(review): the '__global__ void <kernel_name>(' signature line is not
// visible in this extracted view; the parameter list below belongs to it.
569  NumericT alpha,
570  const NumericT * A,
571  unsigned int A_row_start,
572  unsigned int A_col_start,
573  unsigned int A_row_inc,
574  unsigned int A_col_inc,
575  unsigned int A_row_size,
576  unsigned int A_col_size,
577  unsigned int A_internal_rows,
578  unsigned int A_internal_cols,
579  const NumericT * B,
580  unsigned int B_row_start,
581  unsigned int B_col_start,
582  unsigned int B_row_inc,
583  unsigned int B_col_inc,
584  unsigned int B_row_size,
585  unsigned int B_col_size,
586  unsigned int B_internal_rows,
587  unsigned int B_internal_cols,
588  NumericT beta,
589  NumericT * C,
590  unsigned int C_row_start,
591  unsigned int C_col_start,
592  unsigned int C_row_inc,
593  unsigned int C_col_inc,
594  unsigned int C_row_size,
595  unsigned int C_col_size,
596  unsigned int C_internal_rows,
597  unsigned int C_internal_cols)
598 {
599 
// 16x17 tiles (272 = 16 * 17); the +1 pad matches the (block_size + 1)
// strides below, presumably to avoid shared-memory bank conflicts.
600  __shared__ NumericT bufA[272];
601  __shared__ NumericT bufB[272];
602 
603  vcl_size_t block_size = 16;//get_local_size(0);
604  vcl_size_t row_block_id = blockIdx.x;
605  vcl_size_t col_block_id = blockIdx.y;
606  vcl_size_t row_thread_id = threadIdx.x;
607  vcl_size_t col_thread_id = threadIdx.y;
// row_block_id selects a 16-column band of A (A is used transposed);
// aStep advances A by 16 rows per iteration.
608  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
609  vcl_size_t aStep = block_size * A_row_inc;
610  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
611  vcl_size_t bStep = block_size * B_row_inc;
// Tiles covering the reduction dimension (rows of A).
612  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
613  NumericT Csub = 0;
614  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
615  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
616 
// Padded row strides into the 16x17 shared-memory tiles.
617  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
618  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
619  for (vcl_size_t block = 0;
620  block < block_num;
621  ++block)
622  {
// Stage the tiles; out-of-range elements become 0 so the unrolled loop
// below needs no bounds checks.
623  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
624  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// Tiles must be fully written before any thread reads them.
625  __syncthreads();
626  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
627  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
// Manually unrolled dot product over the 16-element tile row.
628  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
629  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
630  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
631  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
632  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
633  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
634  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
635  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
636  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
637  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
638  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
639  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
640  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
641  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
642  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
643  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Barrier before the shared tiles are overwritten in the next iteration.
644  __syncthreads();
645  aBegin += aStep;
646  bBegin += bStep;
647  }
// Guarded ROW-major store; C rows bounded by A_col_size because A enters
// transposed.  The beta == 0 path never reads the old C value.
648  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
649  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
650 }
651 
652 // matrix-matrix multiplication C = A^T * B^T
653 // matrix layouts: C...row_major, A...col_major, B...col_major
//
// NOTE(review): the `__global__ void ...(` declaration line (listing line 655,
// which carries the kernel name) is missing from this extraction; the parameter
// list below belongs to that signature -- restore it from the original header.
// One thread computes one entry of C; a 16x16 thread block is assumed
// (block_size is hard-coded to 16 below) -- confirm at the launch site.
654 template<typename NumericT>
656  NumericT alpha,
657  const NumericT * A,
658  unsigned int A_row_start,
659  unsigned int A_col_start,
660  unsigned int A_row_inc,
661  unsigned int A_col_inc,
662  unsigned int A_row_size,
663  unsigned int A_col_size,
664  unsigned int A_internal_rows,
665  unsigned int A_internal_cols,
666  const NumericT * B,
667  unsigned int B_row_start,
668  unsigned int B_col_start,
669  unsigned int B_row_inc,
670  unsigned int B_col_inc,
671  unsigned int B_row_size,
672  unsigned int B_col_size,
673  unsigned int B_internal_rows,
674  unsigned int B_internal_cols,
675  NumericT beta,
676  NumericT * C,
677  unsigned int C_row_start,
678  unsigned int C_col_start,
679  unsigned int C_row_inc,
680  unsigned int C_col_inc,
681  unsigned int C_row_size,
682  unsigned int C_col_size,
683  unsigned int C_internal_rows,
684  unsigned int C_internal_cols)
685 {
686 
// 272 = 16 x 17: each 16x16 tile is stored with one padding column (row stride
// block_size + 1 below) so strided accesses avoid shared-memory bank conflicts.
687  __shared__ NumericT bufA[272];
688  __shared__ NumericT bufB[272];
689 
690  vcl_size_t block_size = 16;//get_local_size(0); // fixed tile edge; must match blockDim.x == blockDim.y == 16
691  vcl_size_t row_block_id = blockIdx.x;
692  vcl_size_t col_block_id = blockIdx.y;
693  vcl_size_t row_thread_id = threadIdx.x;
694  vcl_size_t col_thread_id = threadIdx.y;
695  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start; // start of the 16-column panel of A feeding this block's C rows (A^T: C rows <- A columns)
696  vcl_size_t aStep = block_size * A_row_inc; // next tile: advance 16 rows of A (the reduction dimension)
697  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows; // start of the 16-row panel of B feeding this block's C columns (B^T: C cols <- B rows)
698  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc; // next tile: advance 16 columns of B
699  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; // tiles along the reduction dimension (rows of A == columns of B)
700  NumericT Csub = 0; // accumulator for this thread's single C entry
701  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; // this thread's fixed position inside each staged tile of A
702  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; // ...and of B
703 
// row stride 17 (= block_size + 1) within the shared tiles -- the bank-conflict padding
704  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
705  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
706  for (vcl_size_t block = 0;
707  block < block_num;
708  ++block)
709  {
// stage the current 16x16 tiles of A and B into shared memory, writing 0 for
// elements outside the matrices (sizes need not be multiples of 16)
710  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
711  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
712  __syncthreads(); // tiles fully staged before any thread reads them
// fully unrolled 16-term multiply-accumulate over the staged tiles
713  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
714  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
715  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
716  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
717  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
718  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
719  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
720  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
721  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
722  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
723  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
724  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
725  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
726  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
727  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
728  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
729  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
730  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
731  __syncthreads(); // all reads finished before the next iteration overwrites the tiles
732  aBegin += aStep;
733  bBegin += bStep;
734  }
// guarded write-back (row-major C): C = alpha * A^T * B^T + beta * C, with C of
// size A_col_size x B_row_size. The beta == 0 branch skips reading C, so an
// uninitialized output buffer is never propagated into the result.
735  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
736  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
737 }
738 
739 
740 
741 
743 
744 
745 
746 
747 // matrix-matrix multiplication C = A * B
748 // matrix layouts: C...col_major, A...col_major, B...row_major
//
// NOTE(review): the `__global__ void ...(` declaration line (listing line 750,
// which carries the kernel name) is missing from this extraction; the parameter
// list below belongs to that signature -- restore it from the original header.
// One thread computes one entry of C; a 16x16 thread block is assumed
// (block_size is hard-coded to 16 below) -- confirm at the launch site.
749 template<typename NumericT>
751  NumericT alpha,
752  const NumericT * A,
753  unsigned int A_row_start,
754  unsigned int A_col_start,
755  unsigned int A_row_inc,
756  unsigned int A_col_inc,
757  unsigned int A_row_size,
758  unsigned int A_col_size,
759  unsigned int A_internal_rows,
760  unsigned int A_internal_cols,
761  const NumericT * B,
762  unsigned int B_row_start,
763  unsigned int B_col_start,
764  unsigned int B_row_inc,
765  unsigned int B_col_inc,
766  unsigned int B_row_size,
767  unsigned int B_col_size,
768  unsigned int B_internal_rows,
769  unsigned int B_internal_cols,
770  NumericT beta,
771  NumericT * C,
772  unsigned int C_row_start,
773  unsigned int C_col_start,
774  unsigned int C_row_inc,
775  unsigned int C_col_inc,
776  unsigned int C_row_size,
777  unsigned int C_col_size,
778  unsigned int C_internal_rows,
779  unsigned int C_internal_cols)
780 {
781 
// 272 = 16 x 17: each 16x16 tile is stored with one padding column (row stride
// block_size + 1 below) so strided accesses avoid shared-memory bank conflicts.
782  __shared__ NumericT bufA[272];
783  __shared__ NumericT bufB[272];
784 
785  vcl_size_t block_size = 16;//get_local_size(0); // fixed tile edge; must match blockDim.x == blockDim.y == 16
786  vcl_size_t row_block_id = blockIdx.x;
787  vcl_size_t col_block_id = blockIdx.y;
788  vcl_size_t row_thread_id = threadIdx.x;
789  vcl_size_t col_thread_id = threadIdx.y;
790  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows; // start of the 16-row panel of A feeding this block's C rows (A col-major)
791  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows; // next tile: advance 16 columns of A (the reduction dimension)
792  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols; // start of the 16-column panel of B feeding this block's C columns (B row-major)
793  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc; // next tile: advance 16 rows of B
794  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; // tiles along the reduction dimension (columns of A == rows of B)
795  NumericT Csub = 0; // accumulator for this thread's single C entry
796  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; // this thread's fixed position inside each staged tile of A
797  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; // ...and of B
798 
// row stride 17 (= block_size + 1) within the shared tiles -- the bank-conflict padding
799  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
800  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
801  for (vcl_size_t block = 0;
802  block < block_num;
803  ++block)
804  {
// stage the current 16x16 tiles of A and B into shared memory, writing 0 for
// elements outside the matrices (sizes need not be multiples of 16)
805  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
806  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
807  __syncthreads(); // tiles fully staged before any thread reads them
// fully unrolled 16-term multiply-accumulate over the staged tiles
808  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
809  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
810  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
811  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
812  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
813  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
814  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
815  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
816  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
817  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
818  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
819  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
820  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
821  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
822  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
823  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
824  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
825  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
826  __syncthreads(); // all reads finished before the next iteration overwrites the tiles
827  aBegin += aStep;
828  bBegin += bStep;
829  }
// guarded write-back (col-major C): C = alpha * A * B + beta * C, with C of
// size A_row_size x B_col_size. The beta == 0 branch skips reading C, so an
// uninitialized output buffer is never propagated into the result.
830  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
831  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
832 }
833 
834 // matrix-matrix multiplication C = A * B^T
835 // matrix layouts: C...col_major, A...col_major, B...row_major
//
// NOTE(review): the `__global__ void ...(` declaration line (listing line 837,
// which carries the kernel name) is missing from this extraction; the parameter
// list below belongs to that signature -- restore it from the original header.
// One thread computes one entry of C; a 16x16 thread block is assumed
// (block_size is hard-coded to 16 below) -- confirm at the launch site.
836 template<typename NumericT>
838  NumericT alpha,
839  const NumericT * A,
840  unsigned int A_row_start,
841  unsigned int A_col_start,
842  unsigned int A_row_inc,
843  unsigned int A_col_inc,
844  unsigned int A_row_size,
845  unsigned int A_col_size,
846  unsigned int A_internal_rows,
847  unsigned int A_internal_cols,
848  const NumericT * B,
849  unsigned int B_row_start,
850  unsigned int B_col_start,
851  unsigned int B_row_inc,
852  unsigned int B_col_inc,
853  unsigned int B_row_size,
854  unsigned int B_col_size,
855  unsigned int B_internal_rows,
856  unsigned int B_internal_cols,
857  NumericT beta,
858  NumericT * C,
859  unsigned int C_row_start,
860  unsigned int C_col_start,
861  unsigned int C_row_inc,
862  unsigned int C_col_inc,
863  unsigned int C_row_size,
864  unsigned int C_col_size,
865  unsigned int C_internal_rows,
866  unsigned int C_internal_cols)
867 {
868 
// 272 = 16 x 17: each 16x16 tile is stored with one padding column (row stride
// block_size + 1 below) so strided accesses avoid shared-memory bank conflicts.
869  __shared__ NumericT bufA[272];
870  __shared__ NumericT bufB[272];
871 
872  vcl_size_t block_size = 16;//get_local_size(0); // fixed tile edge; must match blockDim.x == blockDim.y == 16
873  vcl_size_t row_block_id = blockIdx.x;
874  vcl_size_t col_block_id = blockIdx.y;
875  vcl_size_t row_thread_id = threadIdx.x;
876  vcl_size_t col_thread_id = threadIdx.y;
877  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows; // start of the 16-row panel of A feeding this block's C rows (A col-major)
878  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows; // next tile: advance 16 columns of A (the reduction dimension)
879  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start; // start of the 16-row panel of B feeding this block's C columns (B^T: C cols <- B rows)
880  vcl_size_t bStep = block_size * B_col_inc; // next tile: advance 16 columns of B
881  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; // tiles along the reduction dimension (columns of A == columns of B)
882  NumericT Csub = 0; // accumulator for this thread's single C entry
883  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; // this thread's fixed position inside each staged tile of A
884  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; // ...and of B
885 
// row stride 17 (= block_size + 1) within the shared tiles -- the bank-conflict padding
886  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
887  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
888  for (vcl_size_t block = 0;
889  block < block_num;
890  ++block)
891  {
// stage the current 16x16 tiles of A and B into shared memory, writing 0 for
// elements outside the matrices (sizes need not be multiples of 16)
892  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
893  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
894  __syncthreads(); // tiles fully staged before any thread reads them
// fully unrolled 16-term multiply-accumulate over the staged tiles
895  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
896  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
897  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
898  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
899  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
900  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
901  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
902  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
903  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
904  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
905  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
906  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
907  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
908  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
909  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
910  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
911  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
912  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
913  __syncthreads(); // all reads finished before the next iteration overwrites the tiles
914  aBegin += aStep;
915  bBegin += bStep;
916  }
// guarded write-back (col-major C): C = alpha * A * B^T + beta * C, with C of
// size A_row_size x B_row_size. The beta == 0 branch skips reading C, so an
// uninitialized output buffer is never propagated into the result.
917  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
918  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
919 }
920 
921 // matrix-matrix multiplication C = A^T * B
922 // matrix layouts: C...col_major, A...col_major, B...row_major
//
// NOTE(review): the `__global__ void ...(` declaration line (listing line 924,
// which carries the kernel name) is missing from this extraction; the parameter
// list below belongs to that signature -- restore it from the original header.
// One thread computes one entry of C; a 16x16 thread block is assumed
// (block_size is hard-coded to 16 below) -- confirm at the launch site.
923 template<typename NumericT>
925  NumericT alpha,
926  const NumericT * A,
927  unsigned int A_row_start,
928  unsigned int A_col_start,
929  unsigned int A_row_inc,
930  unsigned int A_col_inc,
931  unsigned int A_row_size,
932  unsigned int A_col_size,
933  unsigned int A_internal_rows,
934  unsigned int A_internal_cols,
935  const NumericT * B,
936  unsigned int B_row_start,
937  unsigned int B_col_start,
938  unsigned int B_row_inc,
939  unsigned int B_col_inc,
940  unsigned int B_row_size,
941  unsigned int B_col_size,
942  unsigned int B_internal_rows,
943  unsigned int B_internal_cols,
944  NumericT beta,
945  NumericT * C,
946  unsigned int C_row_start,
947  unsigned int C_col_start,
948  unsigned int C_row_inc,
949  unsigned int C_col_inc,
950  unsigned int C_row_size,
951  unsigned int C_col_size,
952  unsigned int C_internal_rows,
953  unsigned int C_internal_cols)
954 {
955 
// 272 = 16 x 17: each 16x16 tile is stored with one padding column (row stride
// block_size + 1 below) so strided accesses avoid shared-memory bank conflicts.
956  __shared__ NumericT bufA[272];
957  __shared__ NumericT bufB[272];
958 
959  vcl_size_t block_size = 16;//get_local_size(0); // fixed tile edge; must match blockDim.x == blockDim.y == 16
960  vcl_size_t row_block_id = blockIdx.x;
961  vcl_size_t col_block_id = blockIdx.y;
962  vcl_size_t row_thread_id = threadIdx.x;
963  vcl_size_t col_thread_id = threadIdx.y;
964  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start; // start of the 16-column panel of A feeding this block's C rows (A^T: C rows <- A columns)
965  vcl_size_t aStep = block_size * A_row_inc; // next tile: advance 16 rows of A (the reduction dimension)
966  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols; // start of the 16-column panel of B feeding this block's C columns (B row-major)
967  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc; // next tile: advance 16 rows of B
968  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; // tiles along the reduction dimension (rows of A == rows of B)
969  NumericT Csub = 0; // accumulator for this thread's single C entry
970  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; // this thread's fixed position inside each staged tile of A
971  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; // ...and of B
972 
// row stride 17 (= block_size + 1) within the shared tiles -- the bank-conflict padding
973  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
974  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
975  for (vcl_size_t block = 0;
976  block < block_num;
977  ++block)
978  {
// stage the current 16x16 tiles of A and B into shared memory, writing 0 for
// elements outside the matrices (sizes need not be multiples of 16)
979  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
980  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
981  __syncthreads(); // tiles fully staged before any thread reads them
// fully unrolled 16-term multiply-accumulate over the staged tiles
982  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
983  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
984  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
985  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
986  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
987  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
988  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
989  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
990  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
991  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
992  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
993  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
994  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
995  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
996  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
997  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
998  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
999  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1000  __syncthreads(); // all reads finished before the next iteration overwrites the tiles
1001  aBegin += aStep;
1002  bBegin += bStep;
1003  }
// guarded write-back (col-major C): C = alpha * A^T * B + beta * C, with C of
// size A_col_size x B_col_size. The beta == 0 branch skips reading C, so an
// uninitialized output buffer is never propagated into the result.
1004  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1005  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
1006 }
1007 
1008 // matrix-matrix multiplication C = A^T * B^T
1009 // matrix layouts: C...col_major, A...col_major, B...row_major
//
// NOTE(review): the `__global__ void ...(` declaration line (listing line 1011,
// which carries the kernel name) is missing from this extraction; the parameter
// list below belongs to that signature -- restore it from the original header.
// One thread computes one entry of C; a 16x16 thread block is assumed
// (block_size is hard-coded to 16 below) -- confirm at the launch site.
1010 template<typename NumericT>
1012  NumericT alpha,
1013  const NumericT * A,
1014  unsigned int A_row_start,
1015  unsigned int A_col_start,
1016  unsigned int A_row_inc,
1017  unsigned int A_col_inc,
1018  unsigned int A_row_size,
1019  unsigned int A_col_size,
1020  unsigned int A_internal_rows,
1021  unsigned int A_internal_cols,
1022  const NumericT * B,
1023  unsigned int B_row_start,
1024  unsigned int B_col_start,
1025  unsigned int B_row_inc,
1026  unsigned int B_col_inc,
1027  unsigned int B_row_size,
1028  unsigned int B_col_size,
1029  unsigned int B_internal_rows,
1030  unsigned int B_internal_cols,
1031  NumericT beta,
1032  NumericT * C,
1033  unsigned int C_row_start,
1034  unsigned int C_col_start,
1035  unsigned int C_row_inc,
1036  unsigned int C_col_inc,
1037  unsigned int C_row_size,
1038  unsigned int C_col_size,
1039  unsigned int C_internal_rows,
1040  unsigned int C_internal_cols)
1041 {
1042 
// 272 = 16 x 17: each 16x16 tile is stored with one padding column (row stride
// block_size + 1 below) so strided accesses avoid shared-memory bank conflicts.
1043  __shared__ NumericT bufA[272];
1044  __shared__ NumericT bufB[272];
1045 
1046  vcl_size_t block_size = 16;//get_local_size(0); // fixed tile edge; must match blockDim.x == blockDim.y == 16
1047  vcl_size_t row_block_id = blockIdx.x;
1048  vcl_size_t col_block_id = blockIdx.y;
1049  vcl_size_t row_thread_id = threadIdx.x;
1050  vcl_size_t col_thread_id = threadIdx.y;
1051  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start; // start of the 16-column panel of A feeding this block's C rows (A^T: C rows <- A columns)
1052  vcl_size_t aStep = block_size * A_row_inc; // next tile: advance 16 rows of A (the reduction dimension)
1053  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start; // start of the 16-row panel of B feeding this block's C columns (B^T: C cols <- B rows)
1054  vcl_size_t bStep = block_size * B_col_inc; // next tile: advance 16 columns of B
1055  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; // tiles along the reduction dimension (rows of A == columns of B)
1056  NumericT Csub = 0; // accumulator for this thread's single C entry
1057  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; // this thread's fixed position inside each staged tile of A
1058  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; // ...and of B
1059 
// row stride 17 (= block_size + 1) within the shared tiles -- the bank-conflict padding
1060  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1061  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1062  for (vcl_size_t block = 0;
1063  block < block_num;
1064  ++block)
1065  {
// stage the current 16x16 tiles of A and B into shared memory, writing 0 for
// elements outside the matrices (sizes need not be multiples of 16)
1066  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
1067  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
1068  __syncthreads(); // tiles fully staged before any thread reads them
// fully unrolled 16-term multiply-accumulate over the staged tiles
1069  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1070  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
1071  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1072  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1073  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1074  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1075  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1076  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1077  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1078  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1079  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1080  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1081  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1082  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1083  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1084  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1085  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1086  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1087  __syncthreads(); // all reads finished before the next iteration overwrites the tiles
1088  aBegin += aStep;
1089  bBegin += bStep;
1090  }
// guarded write-back (col-major C): C = alpha * A^T * B^T + beta * C, with C of
// size A_col_size x B_row_size. The beta == 0 branch skips reading C, so an
// uninitialized output buffer is never propagated into the result.
1091  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
1092  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
1093 }
1094 
1095 
1096 
1098 
1099 
1100 
1101 
1102 // matrix-matrix multiplication C = A * B
1103 // matrix layouts: C...row_major, A...col_major, B...row_major
//
// NOTE(review): the `__global__ void ...(` declaration line (listing line 1105,
// which carries the kernel name) is missing from this extraction; the parameter
// list below belongs to that signature -- restore it from the original header.
// One thread computes one entry of C; a 16x16 thread block is assumed
// (block_size is hard-coded to 16 below) -- confirm at the launch site.
1104 template<typename NumericT>
1106  NumericT alpha,
1107  const NumericT * A,
1108  unsigned int A_row_start,
1109  unsigned int A_col_start,
1110  unsigned int A_row_inc,
1111  unsigned int A_col_inc,
1112  unsigned int A_row_size,
1113  unsigned int A_col_size,
1114  unsigned int A_internal_rows,
1115  unsigned int A_internal_cols,
1116  const NumericT * B,
1117  unsigned int B_row_start,
1118  unsigned int B_col_start,
1119  unsigned int B_row_inc,
1120  unsigned int B_col_inc,
1121  unsigned int B_row_size,
1122  unsigned int B_col_size,
1123  unsigned int B_internal_rows,
1124  unsigned int B_internal_cols,
1125  NumericT beta,
1126  NumericT * C,
1127  unsigned int C_row_start,
1128  unsigned int C_col_start,
1129  unsigned int C_row_inc,
1130  unsigned int C_col_inc,
1131  unsigned int C_row_size,
1132  unsigned int C_col_size,
1133  unsigned int C_internal_rows,
1134  unsigned int C_internal_cols)
1135 {
1136 
// 272 = 16 x 17: each 16x16 tile is stored with one padding column (row stride
// block_size + 1 below) so strided accesses avoid shared-memory bank conflicts.
1137  __shared__ NumericT bufA[272];
1138  __shared__ NumericT bufB[272];
1139 
1140  vcl_size_t block_size = 16;//get_local_size(0); // fixed tile edge; must match blockDim.x == blockDim.y == 16
1141  vcl_size_t row_block_id = blockIdx.x;
1142  vcl_size_t col_block_id = blockIdx.y;
1143  vcl_size_t row_thread_id = threadIdx.x;
1144  vcl_size_t col_thread_id = threadIdx.y;
1145  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows; // start of the 16-row panel of A feeding this block's C rows (A col-major)
1146  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows; // next tile: advance 16 columns of A (the reduction dimension)
1147  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols; // start of the 16-column panel of B feeding this block's C columns (B row-major)
1148  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc; // next tile: advance 16 rows of B
1149  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; // tiles along the reduction dimension (columns of A == rows of B)
1150  NumericT Csub = 0; // accumulator for this thread's single C entry
1151  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; // this thread's fixed position inside each staged tile of A
1152  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; // ...and of B
1153 
// row stride 17 (= block_size + 1) within the shared tiles -- the bank-conflict padding
1154  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1155  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1156  for (vcl_size_t block = 0;
1157  block < block_num;
1158  ++block)
1159  {
// stage the current 16x16 tiles of A and B into shared memory, writing 0 for
// elements outside the matrices (sizes need not be multiples of 16)
1160  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
1161  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
1162  __syncthreads(); // tiles fully staged before any thread reads them
// fully unrolled 16-term multiply-accumulate over the staged tiles
1163  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1164  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
1165  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1166  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1167  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1168  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1169  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1170  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1171  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1172  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1173  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1174  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1175  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1176  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1177  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1178  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1179  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1180  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1181  __syncthreads(); // all reads finished before the next iteration overwrites the tiles
1182  aBegin += aStep;
1183  bBegin += bStep;
1184  }
// guarded write-back (row-major C): C = alpha * A * B + beta * C, with C of
// size A_row_size x B_col_size. The beta == 0 branch skips reading C, so an
// uninitialized output buffer is never propagated into the result.
1185  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1186  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
1187 }
1188 
1189 // matrix-matrix multiplication C = A * B^T
1190 // matrix layouts: C...row_major, A...col_major, B...row_major
//
// NOTE(review): the `__global__ void ...(` declaration line (listing line 1192,
// which carries the kernel name) is missing from this extraction; the parameter
// list below belongs to that signature -- restore it from the original header.
// One thread computes one entry of C; a 16x16 thread block is assumed
// (block_size is hard-coded to 16 below) -- confirm at the launch site.
1191 template<typename NumericT>
1193  NumericT alpha,
1194  const NumericT * A,
1195  unsigned int A_row_start,
1196  unsigned int A_col_start,
1197  unsigned int A_row_inc,
1198  unsigned int A_col_inc,
1199  unsigned int A_row_size,
1200  unsigned int A_col_size,
1201  unsigned int A_internal_rows,
1202  unsigned int A_internal_cols,
1203  const NumericT * B,
1204  unsigned int B_row_start,
1205  unsigned int B_col_start,
1206  unsigned int B_row_inc,
1207  unsigned int B_col_inc,
1208  unsigned int B_row_size,
1209  unsigned int B_col_size,
1210  unsigned int B_internal_rows,
1211  unsigned int B_internal_cols,
1212  NumericT beta,
1213  NumericT * C,
1214  unsigned int C_row_start,
1215  unsigned int C_col_start,
1216  unsigned int C_row_inc,
1217  unsigned int C_col_inc,
1218  unsigned int C_row_size,
1219  unsigned int C_col_size,
1220  unsigned int C_internal_rows,
1221  unsigned int C_internal_cols)
1222 {
1223 
// 272 = 16 x 17: each 16x16 tile is stored with one padding column (row stride
// block_size + 1 below) so strided accesses avoid shared-memory bank conflicts.
1224  __shared__ NumericT bufA[272];
1225  __shared__ NumericT bufB[272];
1226 
1227  vcl_size_t block_size = 16;//get_local_size(0); // fixed tile edge; must match blockDim.x == blockDim.y == 16
1228  vcl_size_t row_block_id = blockIdx.x;
1229  vcl_size_t col_block_id = blockIdx.y;
1230  vcl_size_t row_thread_id = threadIdx.x;
1231  vcl_size_t col_thread_id = threadIdx.y;
1232  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows; // start of the 16-row panel of A feeding this block's C rows (A col-major)
1233  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows; // next tile: advance 16 columns of A (the reduction dimension)
1234  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start; // start of the 16-row panel of B feeding this block's C columns (B^T: C cols <- B rows)
1235  vcl_size_t bStep = block_size * B_col_inc; // next tile: advance 16 columns of B
1236  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; // tiles along the reduction dimension (columns of A == columns of B)
1237  NumericT Csub = 0; // accumulator for this thread's single C entry
1238  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; // this thread's fixed position inside each staged tile of A
1239  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; // ...and of B
1240 
// row stride 17 (= block_size + 1) within the shared tiles -- the bank-conflict padding
1241  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1242  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1243  for (vcl_size_t block = 0;
1244  block < block_num;
1245  ++block)
1246  {
// stage the current 16x16 tiles of A and B into shared memory, writing 0 for
// elements outside the matrices (sizes need not be multiples of 16)
1247  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
1248  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
1249  __syncthreads(); // tiles fully staged before any thread reads them
// fully unrolled 16-term multiply-accumulate over the staged tiles
1250  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1251  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
1252  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1253  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1254  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1255  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1256  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1257  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1258  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1259  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1260  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1261  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1262  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1263  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1264  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1265  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1266  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1267  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1268  __syncthreads(); // all reads finished before the next iteration overwrites the tiles
1269  aBegin += aStep;
1270  bBegin += bStep;
1271  }
// guarded write-back (row-major C): C = alpha * A * B^T + beta * C, with C of
// size A_row_size x B_row_size. The beta == 0 branch skips reading C, so an
// uninitialized output buffer is never propagated into the result.
1272  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
1273  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
1274 }
1275 
1276 // matrix-matrix multiplication C = A^T * B
1277 // matrix layouts: C...row_major, A...col_major, B...row_major
//
// NOTE(review): the `__global__ void ...(` declaration line (listing line 1279,
// which carries the kernel name) is missing from this extraction; the parameter
// list below belongs to that signature -- restore it from the original header.
// One thread computes one entry of C; a 16x16 thread block is assumed
// (block_size is hard-coded to 16 below) -- confirm at the launch site.
1278 template<typename NumericT>
1280  NumericT alpha,
1281  const NumericT * A,
1282  unsigned int A_row_start,
1283  unsigned int A_col_start,
1284  unsigned int A_row_inc,
1285  unsigned int A_col_inc,
1286  unsigned int A_row_size,
1287  unsigned int A_col_size,
1288  unsigned int A_internal_rows,
1289  unsigned int A_internal_cols,
1290  const NumericT * B,
1291  unsigned int B_row_start,
1292  unsigned int B_col_start,
1293  unsigned int B_row_inc,
1294  unsigned int B_col_inc,
1295  unsigned int B_row_size,
1296  unsigned int B_col_size,
1297  unsigned int B_internal_rows,
1298  unsigned int B_internal_cols,
1299  NumericT beta,
1300  NumericT * C,
1301  unsigned int C_row_start,
1302  unsigned int C_col_start,
1303  unsigned int C_row_inc,
1304  unsigned int C_col_inc,
1305  unsigned int C_row_size,
1306  unsigned int C_col_size,
1307  unsigned int C_internal_rows,
1308  unsigned int C_internal_cols)
1309 {
1310 
// 272 = 16 x 17: each 16x16 tile is stored with one padding column (row stride
// block_size + 1 below) so strided accesses avoid shared-memory bank conflicts.
1311  __shared__ NumericT bufA[272];
1312  __shared__ NumericT bufB[272];
1313 
1314  vcl_size_t block_size = 16;//get_local_size(0); // fixed tile edge; must match blockDim.x == blockDim.y == 16
1315  vcl_size_t row_block_id = blockIdx.x;
1316  vcl_size_t col_block_id = blockIdx.y;
1317  vcl_size_t row_thread_id = threadIdx.x;
1318  vcl_size_t col_thread_id = threadIdx.y;
1319  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start; // start of the 16-column panel of A feeding this block's C rows (A^T: C rows <- A columns)
1320  vcl_size_t aStep = block_size * A_row_inc; // next tile: advance 16 rows of A (the reduction dimension)
1321  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols; // start of the 16-column panel of B feeding this block's C columns (B row-major)
1322  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc; // next tile: advance 16 rows of B
1323  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; // tiles along the reduction dimension (rows of A == rows of B)
1324  NumericT Csub = 0; // accumulator for this thread's single C entry
1325  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; // this thread's fixed position inside each staged tile of A
1326  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; // ...and of B
1327 
// row stride 17 (= block_size + 1) within the shared tiles -- the bank-conflict padding
1328  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1329  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1330  for (vcl_size_t block = 0;
1331  block < block_num;
1332  ++block)
1333  {
// stage the current 16x16 tiles of A and B into shared memory, writing 0 for
// elements outside the matrices (sizes need not be multiples of 16)
1334  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
1335  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
1336  __syncthreads(); // tiles fully staged before any thread reads them
// fully unrolled 16-term multiply-accumulate over the staged tiles
1337  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1338  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
1339  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1340  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1341  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1342  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1343  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1344  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1345  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1346  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1347  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1348  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1349  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1350  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1351  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1352  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1353  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1354  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1355  __syncthreads(); // all reads finished before the next iteration overwrites the tiles
1356  aBegin += aStep;
1357  bBegin += bStep;
1358  }
// guarded write-back (row-major C): C = alpha * A^T * B + beta * C, with C of
// size A_col_size x B_col_size. The beta == 0 branch skips reading C, so an
// uninitialized output buffer is never propagated into the result.
1359  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1360  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
1361 }
1362 
1363 // matrix-matrix multiplication C = A^T * B^T
1364 // matrix layouts: C...row_major, A...col_major, B...row_major
// Tiled GEMM kernel: each 16x16 thread block accumulates one 16x16 tile of
// C = alpha * A^T * B^T + beta * C.  Every operand is described by a
// (row_start, col_start, row_inc, col_inc, row_size, col_size,
//  internal_rows, internal_cols) parameter set so that strided sub-matrix
// views of a padded allocation can be multiplied.
// Requires blockDim == (16, 16): the inner product below is hand-unrolled
// to exactly 16 terms and the shared tiles are sized for a 16-wide block.
1365 template<typename NumericT>
// NOTE(review): the `__global__ void ...(` signature line (doxygen line 1366)
// is missing from this extraction -- restore it from the upstream ViennaCL
// 1.7.0 sources before compiling.
1367  NumericT alpha,
1368  const NumericT * A,
1369  unsigned int A_row_start,
1370  unsigned int A_col_start,
1371  unsigned int A_row_inc,
1372  unsigned int A_col_inc,
1373  unsigned int A_row_size,
1374  unsigned int A_col_size,
1375  unsigned int A_internal_rows,
1376  unsigned int A_internal_cols,
1377  const NumericT * B,
1378  unsigned int B_row_start,
1379  unsigned int B_col_start,
1380  unsigned int B_row_inc,
1381  unsigned int B_col_inc,
1382  unsigned int B_row_size,
1383  unsigned int B_col_size,
1384  unsigned int B_internal_rows,
1385  unsigned int B_internal_cols,
1386  NumericT beta,
1387  NumericT * C,
1388  unsigned int C_row_start,
1389  unsigned int C_col_start,
1390  unsigned int C_row_inc,
1391  unsigned int C_col_inc,
1392  unsigned int C_row_size,
1393  unsigned int C_col_size,
1394  unsigned int C_internal_rows,
1395  unsigned int C_internal_cols)
1396 {
1397
// Shared-memory tiles: 272 = 16 * 17, a 16x16 tile stored with a padded row
// stride of 17 (the classic +1 padding that avoids shared-memory bank
// conflicts on strided accesses).
1398  __shared__ NumericT bufA[272];
1399  __shared__ NumericT bufB[272];
1400
// Tile edge is hard-coded to 16 and must match blockDim.x == blockDim.y.
1401  vcl_size_t block_size = 16;//get_local_size(0);
1402  vcl_size_t row_block_id = blockIdx.x;
1403  vcl_size_t col_block_id = blockIdx.y;
1404  vcl_size_t row_thread_id = threadIdx.x;
1405  vcl_size_t col_thread_id = threadIdx.y;
// First element of this block's A tile (A is col-major: column stride is
// A_internal_rows) and B tile (B is row-major: row stride is B_internal_cols).
// aStep/bStep advance one 16-wide tile along the shared reduction dimension.
1406  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
1407  vcl_size_t aStep = block_size * A_row_inc;
1408  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
1409  vcl_size_t bStep = block_size * B_col_inc;
// Number of 16-wide tiles covering the reduction dimension (rounded up).
1410  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
1411  NumericT Csub = 0;
// Per-thread offset of the single element this thread stages from A resp. B.
1412  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
1413  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
1414
// Row base offsets into the padded (stride-17) shared tiles.
1415  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1416  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1417  for (vcl_size_t block = 0;
1418  block < block_num;
1419  ++block)
1420  {
// Stage one tile of each operand; elements outside the valid sub-matrix
// (partial tiles at the edges) are loaded as 0 so they add nothing.
1421  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
1422  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// Barrier: both tiles must be fully staged before any thread reads them.
1423  __syncthreads();
1424  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1425  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
// Fully unrolled 16-term partial dot product of one bufA row with one
// bufB row (16 lines because block_size is fixed at 16).
1426  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1427  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1428  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1429  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1430  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1431  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1432  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1433  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1434  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1435  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1436  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1437  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1438  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1439  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1440  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1441  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Barrier before the next iteration overwrites the shared tiles.
1442  __syncthreads();
1443  aBegin += aStep;
1444  bBegin += bStep;
1445  }
// Guarded write-back into row-major C.  The beta == 0 branch stores
// alpha * Csub directly, deliberately avoiding a read of (possibly
// uninitialized) C.
1446  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
1447  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
1448 }
1449 
1450 
1451 
1452 
1453 
1455 
1456 
1457 
1458 
1459 
1460 
1461 // matrix-matrix multiplication C = A * B
1462 // matrix layouts: C...col_major, A...row_major, B...col_major
// Tiled GEMM kernel: one 16x16 thread block per 16x16 tile of
// C = alpha * A * B + beta * C.  Operands are (start, inc, size,
// internal padded dimension) descriptions of strided sub-matrix views.
// Requires blockDim == (16, 16); the reduction loop is unrolled to 16 terms.
1463 template<typename NumericT>
// NOTE(review): the `__global__ void ...(` signature line (doxygen line 1464)
// was lost in extraction -- restore it from the upstream ViennaCL sources.
1465  NumericT alpha,
1466  const NumericT * A,
1467  unsigned int A_row_start,
1468  unsigned int A_col_start,
1469  unsigned int A_row_inc,
1470  unsigned int A_col_inc,
1471  unsigned int A_row_size,
1472  unsigned int A_col_size,
1473  unsigned int A_internal_rows,
1474  unsigned int A_internal_cols,
1475  const NumericT * B,
1476  unsigned int B_row_start,
1477  unsigned int B_col_start,
1478  unsigned int B_row_inc,
1479  unsigned int B_col_inc,
1480  unsigned int B_row_size,
1481  unsigned int B_col_size,
1482  unsigned int B_internal_rows,
1483  unsigned int B_internal_cols,
1484  NumericT beta,
1485  NumericT * C,
1486  unsigned int C_row_start,
1487  unsigned int C_col_start,
1488  unsigned int C_row_inc,
1489  unsigned int C_col_inc,
1490  unsigned int C_row_size,
1491  unsigned int C_col_size,
1492  unsigned int C_internal_rows,
1493  unsigned int C_internal_cols)
1494 {
1495
// 16x16 shared tiles with padded row stride 17 (272 = 16 * 17); the +1
// padding avoids shared-memory bank conflicts.
1496  __shared__ NumericT bufA[272];
1497  __shared__ NumericT bufB[272];
1498
// Tile edge hard-coded to 16; must equal blockDim.x == blockDim.y.
1499  vcl_size_t block_size = 16;//get_local_size(0);
1500  vcl_size_t row_block_id = blockIdx.x;
1501  vcl_size_t col_block_id = blockIdx.y;
1502  vcl_size_t row_thread_id = threadIdx.x;
1503  vcl_size_t col_thread_id = threadIdx.y;
// A is row-major (row stride A_internal_cols); B is col-major (column
// stride B_internal_rows).  aStep/bStep move one tile along the reduction
// (A-column / B-row) dimension.
1504  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
1505  vcl_size_t aStep = block_size * A_col_inc;
1506  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
1507  vcl_size_t bStep = block_size * B_row_inc;
// Ceil-divided number of reduction tiles.
1508  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
1509  NumericT Csub = 0;
// Per-thread element offset within the current A / B tile.
1510  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
1511  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
1512
// Row base offsets into the padded (stride-17) shared tiles.
1513  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1514  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1515  for (vcl_size_t block = 0;
1516  block < block_num;
1517  ++block)
1518  {
// Stage tiles; out-of-range elements of partial edge tiles load as 0.
1519  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
1520  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// Barrier: tiles must be complete before any thread reads them.
1521  __syncthreads();
1522  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1523  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
// Unrolled 16-term partial dot product (block_size is fixed at 16).
1524  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1525  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1526  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1527  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1528  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1529  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1530  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1531  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1532  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1533  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1534  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1535  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1536  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1537  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1538  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1539  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Barrier before the tiles are overwritten in the next iteration.
1540  __syncthreads();
1541  aBegin += aStep;
1542  bBegin += bStep;
1543  }
// Guarded write-back into col-major C; the beta == 0 branch avoids reading
// (possibly uninitialized) C.
1544  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1545  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
1546 }
1547 
1548 // matrix-matrix multiplication C = A * B^T
1549 // matrix layouts: C...col_major, A...row_major, B...col_major
// Tiled GEMM kernel: one 16x16 thread block per 16x16 tile of
// C = alpha * A * B^T + beta * C, with strided sub-matrix views described by
// (start, inc, size, internal padded dimension) parameters.
// Requires blockDim == (16, 16); the reduction loop is unrolled to 16 terms.
1550 template<typename NumericT>
// NOTE(review): the `__global__ void ...(` signature line (doxygen line 1551)
// was lost in extraction -- restore it from the upstream ViennaCL sources.
1552  NumericT alpha,
1553  const NumericT * A,
1554  unsigned int A_row_start,
1555  unsigned int A_col_start,
1556  unsigned int A_row_inc,
1557  unsigned int A_col_inc,
1558  unsigned int A_row_size,
1559  unsigned int A_col_size,
1560  unsigned int A_internal_rows,
1561  unsigned int A_internal_cols,
1562  const NumericT * B,
1563  unsigned int B_row_start,
1564  unsigned int B_col_start,
1565  unsigned int B_row_inc,
1566  unsigned int B_col_inc,
1567  unsigned int B_row_size,
1568  unsigned int B_col_size,
1569  unsigned int B_internal_rows,
1570  unsigned int B_internal_cols,
1571  NumericT beta,
1572  NumericT * C,
1573  unsigned int C_row_start,
1574  unsigned int C_col_start,
1575  unsigned int C_row_inc,
1576  unsigned int C_col_inc,
1577  unsigned int C_row_size,
1578  unsigned int C_col_size,
1579  unsigned int C_internal_rows,
1580  unsigned int C_internal_cols)
1581 {
1582
// 16x16 shared tiles with padded row stride 17 (272 = 16 * 17) to avoid
// shared-memory bank conflicts.
1583  __shared__ NumericT bufA[272];
1584  __shared__ NumericT bufB[272];
1585
// Tile edge hard-coded to 16; must equal blockDim.x == blockDim.y.
1586  vcl_size_t block_size = 16;//get_local_size(0);
1587  vcl_size_t row_block_id = blockIdx.x;
1588  vcl_size_t col_block_id = blockIdx.y;
1589  vcl_size_t row_thread_id = threadIdx.x;
1590  vcl_size_t col_thread_id = threadIdx.y;
// A is row-major (row stride A_internal_cols); B is col-major (column
// stride B_internal_rows).  Because B enters transposed, bStep walks along
// B's columns (stride B_internal_rows * B_col_inc).
1591  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
1592  vcl_size_t aStep = block_size * A_col_inc;
1593  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
1594  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
// Ceil-divided number of reduction tiles.
1595  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
1596  NumericT Csub = 0;
// Per-thread element offset within the current A / B tile.
1597  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
1598  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
1599
// Row base offsets into the padded (stride-17) shared tiles.
1600  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1601  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1602  for (vcl_size_t block = 0;
1603  block < block_num;
1604  ++block)
1605  {
// Stage tiles; bufB is filled with swapped row/column thread roles so the
// B tile lands transposed in shared memory.  Out-of-range elements load 0.
1606  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
1607  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// Barrier: tiles must be complete before any thread reads them.
1608  __syncthreads();
1609  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1610  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
// Unrolled 16-term partial dot product (block_size is fixed at 16).
1611  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1612  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1613  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1614  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1615  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1616  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1617  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1618  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1619  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1620  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1621  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1622  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1623  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1624  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1625  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1626  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Barrier before the tiles are overwritten in the next iteration.
1627  __syncthreads();
1628  aBegin += aStep;
1629  bBegin += bStep;
1630  }
// Guarded write-back into col-major C; beta == 0 branch avoids reading C.
1631  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
1632  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
1633 }
1634 
1635 // matrix-matrix multiplication C = A^T * B
1636 // matrix layouts: C...col_major, A...row_major, B...col_major
// Tiled GEMM kernel: one 16x16 thread block per 16x16 tile of
// C = alpha * A^T * B + beta * C, with strided sub-matrix views described by
// (start, inc, size, internal padded dimension) parameters.
// Requires blockDim == (16, 16); the reduction loop is unrolled to 16 terms.
1637 template<typename NumericT>
// NOTE(review): the `__global__ void ...(` signature line (doxygen line 1638)
// was lost in extraction -- restore it from the upstream ViennaCL sources.
1639  NumericT alpha,
1640  const NumericT * A,
1641  unsigned int A_row_start,
1642  unsigned int A_col_start,
1643  unsigned int A_row_inc,
1644  unsigned int A_col_inc,
1645  unsigned int A_row_size,
1646  unsigned int A_col_size,
1647  unsigned int A_internal_rows,
1648  unsigned int A_internal_cols,
1649  const NumericT * B,
1650  unsigned int B_row_start,
1651  unsigned int B_col_start,
1652  unsigned int B_row_inc,
1653  unsigned int B_col_inc,
1654  unsigned int B_row_size,
1655  unsigned int B_col_size,
1656  unsigned int B_internal_rows,
1657  unsigned int B_internal_cols,
1658  NumericT beta,
1659  NumericT * C,
1660  unsigned int C_row_start,
1661  unsigned int C_col_start,
1662  unsigned int C_row_inc,
1663  unsigned int C_col_inc,
1664  unsigned int C_row_size,
1665  unsigned int C_col_size,
1666  unsigned int C_internal_rows,
1667  unsigned int C_internal_cols)
1668 {
1669
// 16x16 shared tiles with padded row stride 17 (272 = 16 * 17) to avoid
// shared-memory bank conflicts.
1670  __shared__ NumericT bufA[272];
1671  __shared__ NumericT bufB[272];
1672
// Tile edge hard-coded to 16; must equal blockDim.x == blockDim.y.
1673  vcl_size_t block_size = 16;//get_local_size(0);
1674  vcl_size_t row_block_id = blockIdx.x;
1675  vcl_size_t col_block_id = blockIdx.y;
1676  vcl_size_t row_thread_id = threadIdx.x;
1677  vcl_size_t col_thread_id = threadIdx.y;
// A is row-major (row stride A_internal_cols); because A enters transposed,
// aStep walks down A's rows (stride A_row_inc * A_internal_cols).  B is
// col-major (column stride B_internal_rows).
1678  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
1679  vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
1680  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
1681  vcl_size_t bStep = block_size * B_row_inc;
// Ceil-divided number of reduction tiles.
1682  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
1683  NumericT Csub = 0;
// Per-thread element offset within the current A / B tile.
1684  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
1685  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
1686
// Row base offsets into the padded (stride-17) shared tiles.
1687  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1688  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1689  for (vcl_size_t block = 0;
1690  block < block_num;
1691  ++block)
1692  {
// Stage tiles; bufA is filled with swapped row/column thread roles so the
// A tile lands transposed in shared memory.  Out-of-range elements load 0.
1693  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
1694  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// Barrier: tiles must be complete before any thread reads them.
1695  __syncthreads();
1696  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1697  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
// Unrolled 16-term partial dot product (block_size is fixed at 16).
1698  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1699  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1700  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1701  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1702  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1703  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1704  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1705  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1706  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1707  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1708  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1709  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1710  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1711  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1712  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1713  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Barrier before the tiles are overwritten in the next iteration.
1714  __syncthreads();
1715  aBegin += aStep;
1716  bBegin += bStep;
1717  }
// Guarded write-back into col-major C; beta == 0 branch avoids reading C.
1718  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1719  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
1720 }
1721 
1722 // matrix-matrix multiplication C = A^T * B^T
1723 // matrix layouts: C...col_major, A...row_major, B...col_major
// Tiled GEMM kernel: one 16x16 thread block per 16x16 tile of
// C = alpha * A^T * B^T + beta * C, with strided sub-matrix views described
// by (start, inc, size, internal padded dimension) parameters.
// Requires blockDim == (16, 16); the reduction loop is unrolled to 16 terms.
1724 template<typename NumericT>
// NOTE(review): the `__global__ void ...(` signature line (doxygen line 1725)
// was lost in extraction -- restore it from the upstream ViennaCL sources.
1726  NumericT alpha,
1727  const NumericT * A,
1728  unsigned int A_row_start,
1729  unsigned int A_col_start,
1730  unsigned int A_row_inc,
1731  unsigned int A_col_inc,
1732  unsigned int A_row_size,
1733  unsigned int A_col_size,
1734  unsigned int A_internal_rows,
1735  unsigned int A_internal_cols,
1736  const NumericT * B,
1737  unsigned int B_row_start,
1738  unsigned int B_col_start,
1739  unsigned int B_row_inc,
1740  unsigned int B_col_inc,
1741  unsigned int B_row_size,
1742  unsigned int B_col_size,
1743  unsigned int B_internal_rows,
1744  unsigned int B_internal_cols,
1745  NumericT beta,
1746  NumericT * C,
1747  unsigned int C_row_start,
1748  unsigned int C_col_start,
1749  unsigned int C_row_inc,
1750  unsigned int C_col_inc,
1751  unsigned int C_row_size,
1752  unsigned int C_col_size,
1753  unsigned int C_internal_rows,
1754  unsigned int C_internal_cols)
1755 {
1756
// 16x16 shared tiles with padded row stride 17 (272 = 16 * 17) to avoid
// shared-memory bank conflicts.
1757  __shared__ NumericT bufA[272];
1758  __shared__ NumericT bufB[272];
1759
// Tile edge hard-coded to 16; must equal blockDim.x == blockDim.y.
1760  vcl_size_t block_size = 16;//get_local_size(0);
1761  vcl_size_t row_block_id = blockIdx.x;
1762  vcl_size_t col_block_id = blockIdx.y;
1763  vcl_size_t row_thread_id = threadIdx.x;
1764  vcl_size_t col_thread_id = threadIdx.y;
// Both operands enter transposed: aStep walks down A's rows (A is row-major,
// row stride A_internal_cols) and bStep walks along B's columns (B is
// col-major, column stride B_internal_rows).
1765  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
1766  vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
1767  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
1768  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
// Ceil-divided number of reduction tiles.
1769  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
1770  NumericT Csub = 0;
// Per-thread element offset within the current A / B tile.
1771  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
1772  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
1773
// Row base offsets into the padded (stride-17) shared tiles.
1774  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1775  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1776  for (vcl_size_t block = 0;
1777  block < block_num;
1778  ++block)
1779  {
// Stage tiles; both bufA and bufB are filled with swapped row/column thread
// roles (transposed staging).  Out-of-range elements load 0.
1780  bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
1781  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// Barrier: tiles must be complete before any thread reads them.
1782  __syncthreads();
1783  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1784  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
// Unrolled 16-term partial dot product (block_size is fixed at 16).
1785  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1786  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1787  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1788  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1789  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1790  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1791  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1792  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1793  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1794  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1795  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1796  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1797  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1798  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1799  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1800  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Barrier before the tiles are overwritten in the next iteration.
1801  __syncthreads();
1802  aBegin += aStep;
1803  bBegin += bStep;
1804  }
// Guarded write-back into col-major C; beta == 0 branch avoids reading C.
1805  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
1806  C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
1807 }
1808 
1809 
1810 
1811 
1813 
1814 
1815 
1816 
1817 // matrix-matrix multiplication C = A * B
1818 // matrix layouts: C...row_major, A...row_major, B...col_major
// Tiled GEMM kernel: one 16x16 thread block per 16x16 tile of
// C = alpha * A * B + beta * C, with strided sub-matrix views described by
// (start, inc, size, internal padded dimension) parameters.  Differs from
// the col-major-C variant above only in the final store into row-major C.
// Requires blockDim == (16, 16); the reduction loop is unrolled to 16 terms.
1819 template<typename NumericT>
// NOTE(review): the `__global__ void ...(` signature line (doxygen line 1820)
// was lost in extraction -- restore it from the upstream ViennaCL sources.
1821  NumericT alpha,
1822  const NumericT * A,
1823  unsigned int A_row_start,
1824  unsigned int A_col_start,
1825  unsigned int A_row_inc,
1826  unsigned int A_col_inc,
1827  unsigned int A_row_size,
1828  unsigned int A_col_size,
1829  unsigned int A_internal_rows,
1830  unsigned int A_internal_cols,
1831  const NumericT * B,
1832  unsigned int B_row_start,
1833  unsigned int B_col_start,
1834  unsigned int B_row_inc,
1835  unsigned int B_col_inc,
1836  unsigned int B_row_size,
1837  unsigned int B_col_size,
1838  unsigned int B_internal_rows,
1839  unsigned int B_internal_cols,
1840  NumericT beta,
1841  NumericT * C,
1842  unsigned int C_row_start,
1843  unsigned int C_col_start,
1844  unsigned int C_row_inc,
1845  unsigned int C_col_inc,
1846  unsigned int C_row_size,
1847  unsigned int C_col_size,
1848  unsigned int C_internal_rows,
1849  unsigned int C_internal_cols)
1850 {
1851
// 16x16 shared tiles with padded row stride 17 (272 = 16 * 17) to avoid
// shared-memory bank conflicts.
1852  __shared__ NumericT bufA[272];
1853  __shared__ NumericT bufB[272];
1854
// Tile edge hard-coded to 16; must equal blockDim.x == blockDim.y.
1855  vcl_size_t block_size = 16;//get_local_size(0);
1856  vcl_size_t row_block_id = blockIdx.x;
1857  vcl_size_t col_block_id = blockIdx.y;
1858  vcl_size_t row_thread_id = threadIdx.x;
1859  vcl_size_t col_thread_id = threadIdx.y;
// A is row-major (row stride A_internal_cols); B is col-major (column
// stride B_internal_rows).  aStep/bStep advance one tile along the
// reduction dimension.
1860  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
1861  vcl_size_t aStep = block_size * A_col_inc;
1862  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
1863  vcl_size_t bStep = block_size * B_row_inc;
// Ceil-divided number of reduction tiles.
1864  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
1865  NumericT Csub = 0;
// Per-thread element offset within the current A / B tile.
1866  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
1867  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
1868
// Row base offsets into the padded (stride-17) shared tiles.
1869  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1870  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1871  for (vcl_size_t block = 0;
1872  block < block_num;
1873  ++block)
1874  {
// Stage tiles; out-of-range elements of partial edge tiles load as 0.
1875  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
1876  bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
// Barrier: tiles must be complete before any thread reads them.
1877  __syncthreads();
1878  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1879  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
// Unrolled 16-term partial dot product (block_size is fixed at 16).
1880  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1881  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1882  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1883  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1884  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1885  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1886  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1887  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1888  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1889  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1890  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1891  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1892  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1893  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1894  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1895  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Barrier before the tiles are overwritten in the next iteration.
1896  __syncthreads();
1897  aBegin += aStep;
1898  bBegin += bStep;
1899  }
// Guarded write-back into row-major C; beta == 0 branch avoids reading C.
1900  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
1901  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
1902 }
1903 
1904 // matrix-matrix multiplication C = A * B^T
1905 // matrix layouts: C...row_major, A...row_major, B...col_major
// Tiled GEMM kernel: one 16x16 thread block per 16x16 tile of
// C = alpha * A * B^T + beta * C, with strided sub-matrix views described by
// (start, inc, size, internal padded dimension) parameters.
// Requires blockDim == (16, 16); the reduction loop is unrolled to 16 terms.
1906 template<typename NumericT>
// NOTE(review): the `__global__ void ...(` signature line (doxygen line 1907)
// was lost in extraction -- restore it from the upstream ViennaCL sources.
1908  NumericT alpha,
1909  const NumericT * A,
1910  unsigned int A_row_start,
1911  unsigned int A_col_start,
1912  unsigned int A_row_inc,
1913  unsigned int A_col_inc,
1914  unsigned int A_row_size,
1915  unsigned int A_col_size,
1916  unsigned int A_internal_rows,
1917  unsigned int A_internal_cols,
1918  const NumericT * B,
1919  unsigned int B_row_start,
1920  unsigned int B_col_start,
1921  unsigned int B_row_inc,
1922  unsigned int B_col_inc,
1923  unsigned int B_row_size,
1924  unsigned int B_col_size,
1925  unsigned int B_internal_rows,
1926  unsigned int B_internal_cols,
1927  NumericT beta,
1928  NumericT * C,
1929  unsigned int C_row_start,
1930  unsigned int C_col_start,
1931  unsigned int C_row_inc,
1932  unsigned int C_col_inc,
1933  unsigned int C_row_size,
1934  unsigned int C_col_size,
1935  unsigned int C_internal_rows,
1936  unsigned int C_internal_cols)
1937 {
1938
// 16x16 shared tiles with padded row stride 17 (272 = 16 * 17) to avoid
// shared-memory bank conflicts.
1939  __shared__ NumericT bufA[272];
1940  __shared__ NumericT bufB[272];
1941
// Tile edge hard-coded to 16; must equal blockDim.x == blockDim.y.
1942  vcl_size_t block_size = 16;//get_local_size(0);
1943  vcl_size_t row_block_id = blockIdx.x;
1944  vcl_size_t col_block_id = blockIdx.y;
1945  vcl_size_t row_thread_id = threadIdx.x;
1946  vcl_size_t col_thread_id = threadIdx.y;
// A is row-major (row stride A_internal_cols); B is col-major and enters
// transposed, so bStep walks along B's columns (stride
// B_internal_rows * B_col_inc).
1947  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
1948  vcl_size_t aStep = block_size * A_col_inc;
1949  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
1950  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
// Ceil-divided number of reduction tiles.
1951  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
1952  NumericT Csub = 0;
// Per-thread element offset within the current A / B tile.
1953  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
1954  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
1955
// Row base offsets into the padded (stride-17) shared tiles.
1956  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
1957  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
1958  for (vcl_size_t block = 0;
1959  block < block_num;
1960  ++block)
1961  {
// Stage tiles; bufB is filled with swapped row/column thread roles so the
// B tile lands transposed in shared memory.  Out-of-range elements load 0.
1962  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
1963  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
// Barrier: tiles must be complete before any thread reads them.
1964  __syncthreads();
1965  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
1966  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
// Unrolled 16-term partial dot product (block_size is fixed at 16).
1967  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1968  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1969  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1970  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1971  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1972  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1973  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1974  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1975  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1976  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1977  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1978  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1979  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1980  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1981  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
1982  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
// Barrier before the tiles are overwritten in the next iteration.
1983  __syncthreads();
1984  aBegin += aStep;
1985  bBegin += bStep;
1986  }
// Guarded write-back into row-major C; beta == 0 branch avoids reading C.
1987  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
1988  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
1989 }
1990 
1991 // matrix-matrix multiplication C = A^T * B
1992 // matrix layouts: C...row_major, A...row_major, B...col_major
template<typename NumericT>
__global__ void matrix_matrix_row_row_col_prod_TA_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // C <- alpha * A^T * B + beta * C, one 16x16 tile of C per thread block.
  // Expects blockDim == (16, 16); each thread produces a single entry of C.
  // Shared tiles use a row stride of 17 (16 * 17 = 272) so that column-wise
  // accesses do not hit the same shared-memory bank.
  __shared__ NumericT tileA[272];
  __shared__ NumericT tileB[272];

  const vcl_size_t tile_dim   = 16;            // logical tile edge
  const vcl_size_t pad_stride = tile_dim + 1;  // padded row stride inside the tiles
  const vcl_size_t tx = threadIdx.x;
  const vcl_size_t ty = threadIdx.y;

  // Start of the current panel of A (row-major storage, traversed transposed):
  vcl_size_t a_pos = (blockIdx.x * tile_dim * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
  const vcl_size_t a_step = tile_dim * A_row_inc * A_internal_cols;
  // Start of the current panel of B (column-major storage):
  vcl_size_t b_pos = (blockIdx.y * tile_dim * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
  const vcl_size_t b_step = tile_dim * B_row_inc;

  // Per-thread displacement into the current panels:
  const vcl_size_t a_thread = tx * A_col_inc + ty * A_row_inc * A_internal_cols;
  const vcl_size_t b_thread = tx * B_row_inc + ty * B_col_inc * B_internal_rows;

  // Reduction runs over the rows of A (= rows of B):
  const vcl_size_t num_panels = (A_row_size + tile_dim - 1) / tile_dim;

  NumericT dot = 0;
  for (vcl_size_t panel = 0; panel < num_panels; ++panel)
  {
    // Stage one panel of each operand; out-of-range entries become 0:
    tileA[tx * pad_stride + ty] = ((panel * tile_dim + ty < A_row_size) && (blockIdx.x * tile_dim + tx < A_col_size)) ? A[a_pos + a_thread] : 0;
    tileB[ty * pad_stride + tx] = ((panel * tile_dim + tx < B_row_size) && (blockIdx.y * tile_dim + ty < B_col_size)) ? B[b_pos + b_thread] : 0;
    __syncthreads();

    const NumericT * a_line = tileA + tx * pad_stride;
    const NumericT * b_line = tileB + ty * pad_stride;
#pragma unroll
    for (unsigned int k = 0; k < 16; ++k)
      dot += a_line[k] * b_line[k];
    __syncthreads();   // tiles are overwritten in the next iteration

    a_pos += a_step;
    b_pos += b_step;
  }

  const vcl_size_t c_row = blockIdx.x * blockDim.x + threadIdx.x;   // indexes columns of A
  const vcl_size_t c_col = blockIdx.y * blockDim.y + threadIdx.y;   // indexes columns of B
  if (c_row < A_col_size && c_col < B_col_size)
  {
    const vcl_size_t c_idx = (c_row * C_row_inc + C_row_start) * C_internal_cols + c_col * C_col_inc + C_col_start;
    // For beta == 0 the previous content of C is never read (it may be uninitialized).
    C[c_idx] = (beta == 0) ? alpha * dot : alpha * dot + beta * C[c_idx];
  }
}
2077 
2078 // matrix-matrix multiplication C = A^T * B^T
2079 // matrix layouts: C...row_major, A...row_major, B...col_major
template<typename NumericT>
__global__ void matrix_matrix_row_row_col_prod_TT_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // C <- alpha * A^T * B^T + beta * C, one 16x16 tile of C per thread block.
  // Expects blockDim == (16, 16); each thread produces a single entry of C.
  // Shared tiles use a row stride of 17 (16 * 17 = 272) so that column-wise
  // accesses do not hit the same shared-memory bank.
  __shared__ NumericT tileA[272];
  __shared__ NumericT tileB[272];

  const vcl_size_t tile_dim   = 16;            // logical tile edge
  const vcl_size_t pad_stride = tile_dim + 1;  // padded row stride inside the tiles
  const vcl_size_t tx = threadIdx.x;
  const vcl_size_t ty = threadIdx.y;

  // Start of the current panel of A (row-major storage, traversed transposed):
  vcl_size_t a_pos = (blockIdx.x * tile_dim * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
  const vcl_size_t a_step = tile_dim * A_row_inc * A_internal_cols;
  // Start of the current panel of B (column-major storage, traversed transposed):
  vcl_size_t b_pos = (blockIdx.y * tile_dim * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
  const vcl_size_t b_step = tile_dim * B_internal_rows * B_col_inc;

  // Per-thread displacement into the current panels:
  const vcl_size_t a_thread = tx * A_col_inc + ty * A_row_inc * A_internal_cols;
  const vcl_size_t b_thread = tx * B_row_inc + ty * B_col_inc * B_internal_rows;

  // Reduction runs over the rows of A (= columns of B):
  const vcl_size_t num_panels = (A_row_size + tile_dim - 1) / tile_dim;

  NumericT dot = 0;
  for (vcl_size_t panel = 0; panel < num_panels; ++panel)
  {
    // Stage one panel of each operand; out-of-range entries become 0:
    tileA[tx * pad_stride + ty] = ((panel * tile_dim + ty < A_row_size) && (blockIdx.x * tile_dim + tx < A_col_size)) ? A[a_pos + a_thread] : 0;
    tileB[tx * pad_stride + ty] = ((panel * tile_dim + ty < B_col_size) && (blockIdx.y * tile_dim + tx < B_row_size)) ? B[b_pos + b_thread] : 0;
    __syncthreads();

    const NumericT * a_line = tileA + tx * pad_stride;
    const NumericT * b_line = tileB + ty * pad_stride;
#pragma unroll
    for (unsigned int k = 0; k < 16; ++k)
      dot += a_line[k] * b_line[k];
    __syncthreads();   // tiles are overwritten in the next iteration

    a_pos += a_step;
    b_pos += b_step;
  }

  const vcl_size_t c_row = blockIdx.x * blockDim.x + threadIdx.x;   // indexes columns of A
  const vcl_size_t c_col = blockIdx.y * blockDim.y + threadIdx.y;   // indexes rows of B
  if (c_row < A_col_size && c_col < B_row_size)
  {
    const vcl_size_t c_idx = (c_row * C_row_inc + C_row_start) * C_internal_cols + c_col * C_col_inc + C_col_start;
    // For beta == 0 the previous content of C is never read (it may be uninitialized).
    C[c_idx] = (beta == 0) ? alpha * dot : alpha * dot + beta * C[c_idx];
  }
}
2164 
2165 
2166 
2167 
2168 
2170 
2171 
2172 
2173 
2174 
2175 
2176 // matrix-matrix multiplication C = A * B
2177 // matrix layouts: C...col_major, A...row_major, B...row_major
template<typename NumericT>
__global__ void matrix_matrix_col_row_row_prod_AA_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // C <- alpha * A * B + beta * C, one 16x16 tile of C per thread block.
  // Expects blockDim == (16, 16); each thread produces a single entry of C.
  // Shared tiles use a row stride of 17 (16 * 17 = 272) so that column-wise
  // accesses do not hit the same shared-memory bank.
  __shared__ NumericT tileA[272];
  __shared__ NumericT tileB[272];

  const vcl_size_t tile_dim   = 16;            // logical tile edge
  const vcl_size_t pad_stride = tile_dim + 1;  // padded row stride inside the tiles
  const vcl_size_t tx = threadIdx.x;
  const vcl_size_t ty = threadIdx.y;

  // Start of the current panel of A (row-major storage):
  vcl_size_t a_pos = (blockIdx.x * tile_dim * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
  const vcl_size_t a_step = tile_dim * A_col_inc;
  // Start of the current panel of B (row-major storage):
  vcl_size_t b_pos = (blockIdx.y * tile_dim * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
  const vcl_size_t b_step = tile_dim * B_internal_cols * B_row_inc;

  // Per-thread displacement into the current panels:
  const vcl_size_t a_thread = tx * A_col_inc + ty * A_row_inc * A_internal_cols;
  const vcl_size_t b_thread = tx * B_col_inc + ty * B_row_inc * B_internal_cols;

  // Reduction runs over the columns of A (= rows of B):
  const vcl_size_t num_panels = (A_col_size + tile_dim - 1) / tile_dim;

  NumericT dot = 0;
  for (vcl_size_t panel = 0; panel < num_panels; ++panel)
  {
    // Stage one panel of each operand (A is stored transposed in shared
    // memory: written at [ty][tx], read back row-wise at [tx][..]);
    // out-of-range entries become 0:
    tileA[ty * pad_stride + tx] = ((panel * tile_dim + tx < A_col_size) && (blockIdx.x * tile_dim + ty < A_row_size)) ? A[a_pos + a_thread] : 0;
    tileB[tx * pad_stride + ty] = ((panel * tile_dim + ty < B_row_size) && (blockIdx.y * tile_dim + tx < B_col_size)) ? B[b_pos + b_thread] : 0;
    __syncthreads();

    const NumericT * a_line = tileA + tx * pad_stride;
    const NumericT * b_line = tileB + ty * pad_stride;
#pragma unroll
    for (unsigned int k = 0; k < 16; ++k)
      dot += a_line[k] * b_line[k];
    __syncthreads();   // tiles are overwritten in the next iteration

    a_pos += a_step;
    b_pos += b_step;
  }

  const vcl_size_t c_row = blockIdx.x * blockDim.x + threadIdx.x;   // indexes rows of A
  const vcl_size_t c_col = blockIdx.y * blockDim.y + threadIdx.y;   // indexes columns of B
  if (c_row < A_row_size && c_col < B_col_size)
  {
    // C is column-major:
    const vcl_size_t c_idx = c_row * C_row_inc + C_row_start + (c_col * C_col_inc + C_col_start) * C_internal_rows;
    // For beta == 0 the previous content of C is never read (it may be uninitialized).
    C[c_idx] = (beta == 0) ? alpha * dot : alpha * dot + beta * C[c_idx];
  }
}
2262 
2263 // matrix-matrix multiplication C = A * B^T
2264 // matrix layouts: C...col_major, A...row_major, B...row_major
template<typename NumericT>
__global__ void matrix_matrix_col_row_row_prod_AT_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // C <- alpha * A * B^T + beta * C, one 16x16 tile of C per thread block.
  // Expects blockDim == (16, 16); each thread produces a single entry of C.
  // Shared tiles use a row stride of 17 (16 * 17 = 272) so that column-wise
  // accesses do not hit the same shared-memory bank.
  __shared__ NumericT tileA[272];
  __shared__ NumericT tileB[272];

  const vcl_size_t tile_dim   = 16;            // logical tile edge
  const vcl_size_t pad_stride = tile_dim + 1;  // padded row stride inside the tiles
  const vcl_size_t tx = threadIdx.x;
  const vcl_size_t ty = threadIdx.y;

  // Start of the current panel of A (row-major storage):
  vcl_size_t a_pos = (blockIdx.x * tile_dim * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
  const vcl_size_t a_step = tile_dim * A_col_inc;
  // Start of the current panel of B (row-major storage, traversed transposed):
  vcl_size_t b_pos = (blockIdx.y * tile_dim * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
  const vcl_size_t b_step = tile_dim * B_col_inc;

  // Per-thread displacement into the current panels:
  const vcl_size_t a_thread = tx * A_col_inc + ty * A_row_inc * A_internal_cols;
  const vcl_size_t b_thread = tx * B_col_inc + ty * B_row_inc * B_internal_cols;

  // Reduction runs over the columns of A (= columns of B):
  const vcl_size_t num_panels = (A_col_size + tile_dim - 1) / tile_dim;

  NumericT dot = 0;
  for (vcl_size_t panel = 0; panel < num_panels; ++panel)
  {
    // Stage one panel of each operand (both written at [ty][tx] and read back
    // row-wise, i.e. stored transposed); out-of-range entries become 0:
    tileA[ty * pad_stride + tx] = ((panel * tile_dim + tx < A_col_size) && (blockIdx.x * tile_dim + ty < A_row_size)) ? A[a_pos + a_thread] : 0;
    tileB[ty * pad_stride + tx] = ((panel * tile_dim + tx < B_col_size) && (blockIdx.y * tile_dim + ty < B_row_size)) ? B[b_pos + b_thread] : 0;
    __syncthreads();

    const NumericT * a_line = tileA + tx * pad_stride;
    const NumericT * b_line = tileB + ty * pad_stride;
#pragma unroll
    for (unsigned int k = 0; k < 16; ++k)
      dot += a_line[k] * b_line[k];
    __syncthreads();   // tiles are overwritten in the next iteration

    a_pos += a_step;
    b_pos += b_step;
  }

  const vcl_size_t c_row = blockIdx.x * blockDim.x + threadIdx.x;   // indexes rows of A
  const vcl_size_t c_col = blockIdx.y * blockDim.y + threadIdx.y;   // indexes rows of B
  if (c_row < A_row_size && c_col < B_row_size)
  {
    // C is column-major:
    const vcl_size_t c_idx = c_row * C_row_inc + C_row_start + (c_col * C_col_inc + C_col_start) * C_internal_rows;
    // For beta == 0 the previous content of C is never read (it may be uninitialized).
    C[c_idx] = (beta == 0) ? alpha * dot : alpha * dot + beta * C[c_idx];
  }
}
2349 
2350 // matrix-matrix multiplication C = A^T * B
2351 // matrix layouts: C...col_major, A...row_major, B...row_major
template<typename NumericT>
__global__ void matrix_matrix_col_row_row_prod_TA_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // C <- alpha * A^T * B + beta * C, one 16x16 tile of C per thread block.
  // Expects blockDim == (16, 16); each thread produces a single entry of C.
  // Shared tiles use a row stride of 17 (16 * 17 = 272) so that column-wise
  // accesses do not hit the same shared-memory bank.
  __shared__ NumericT tileA[272];
  __shared__ NumericT tileB[272];

  const vcl_size_t tile_dim   = 16;            // logical tile edge
  const vcl_size_t pad_stride = tile_dim + 1;  // padded row stride inside the tiles
  const vcl_size_t tx = threadIdx.x;
  const vcl_size_t ty = threadIdx.y;

  // Start of the current panel of A (row-major storage, traversed transposed):
  vcl_size_t a_pos = (blockIdx.x * tile_dim * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
  const vcl_size_t a_step = tile_dim * A_row_inc * A_internal_cols;
  // Start of the current panel of B (row-major storage):
  vcl_size_t b_pos = (blockIdx.y * tile_dim * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
  const vcl_size_t b_step = tile_dim * B_internal_cols * B_row_inc;

  // Per-thread displacement into the current panels:
  const vcl_size_t a_thread = tx * A_col_inc + ty * A_row_inc * A_internal_cols;
  const vcl_size_t b_thread = tx * B_col_inc + ty * B_row_inc * B_internal_cols;

  // Reduction runs over the rows of A (= rows of B):
  const vcl_size_t num_panels = (A_row_size + tile_dim - 1) / tile_dim;

  NumericT dot = 0;
  for (vcl_size_t panel = 0; panel < num_panels; ++panel)
  {
    // Stage one panel of each operand; out-of-range entries become 0:
    tileA[tx * pad_stride + ty] = ((panel * tile_dim + ty < A_row_size) && (blockIdx.x * tile_dim + tx < A_col_size)) ? A[a_pos + a_thread] : 0;
    tileB[tx * pad_stride + ty] = ((panel * tile_dim + ty < B_row_size) && (blockIdx.y * tile_dim + tx < B_col_size)) ? B[b_pos + b_thread] : 0;
    __syncthreads();

    const NumericT * a_line = tileA + tx * pad_stride;
    const NumericT * b_line = tileB + ty * pad_stride;
#pragma unroll
    for (unsigned int k = 0; k < 16; ++k)
      dot += a_line[k] * b_line[k];
    __syncthreads();   // tiles are overwritten in the next iteration

    a_pos += a_step;
    b_pos += b_step;
  }

  const vcl_size_t c_row = blockIdx.x * blockDim.x + threadIdx.x;   // indexes columns of A
  const vcl_size_t c_col = blockIdx.y * blockDim.y + threadIdx.y;   // indexes columns of B
  if (c_row < A_col_size && c_col < B_col_size)
  {
    // C is column-major:
    const vcl_size_t c_idx = c_row * C_row_inc + C_row_start + (c_col * C_col_inc + C_col_start) * C_internal_rows;
    // For beta == 0 the previous content of C is never read (it may be uninitialized).
    C[c_idx] = (beta == 0) ? alpha * dot : alpha * dot + beta * C[c_idx];
  }
}
2436 
2437 // matrix-matrix multiplication C = A^T * B^T
2438 // matrix layouts: C...col_major, A...row_major, B...row_major
template<typename NumericT>
__global__ void matrix_matrix_col_row_row_prod_TT_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // C <- alpha * A^T * B^T + beta * C, one 16x16 tile of C per thread block.
  // Expects blockDim == (16, 16); each thread produces a single entry of C.
  // Shared tiles use a row stride of 17 (16 * 17 = 272) so that column-wise
  // accesses do not hit the same shared-memory bank.
  __shared__ NumericT tileA[272];
  __shared__ NumericT tileB[272];

  const vcl_size_t tile_dim   = 16;            // logical tile edge
  const vcl_size_t pad_stride = tile_dim + 1;  // padded row stride inside the tiles
  const vcl_size_t tx = threadIdx.x;
  const vcl_size_t ty = threadIdx.y;

  // Start of the current panel of A (row-major storage, traversed transposed):
  vcl_size_t a_pos = (blockIdx.x * tile_dim * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
  const vcl_size_t a_step = tile_dim * A_row_inc * A_internal_cols;
  // Start of the current panel of B (row-major storage, traversed transposed):
  vcl_size_t b_pos = (blockIdx.y * tile_dim * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
  const vcl_size_t b_step = tile_dim * B_col_inc;

  // Per-thread displacement into the current panels:
  const vcl_size_t a_thread = tx * A_col_inc + ty * A_row_inc * A_internal_cols;
  const vcl_size_t b_thread = tx * B_col_inc + ty * B_row_inc * B_internal_cols;

  // Reduction runs over the rows of A (= columns of B):
  const vcl_size_t num_panels = (A_row_size + tile_dim - 1) / tile_dim;

  NumericT dot = 0;
  for (vcl_size_t panel = 0; panel < num_panels; ++panel)
  {
    // Stage one panel of each operand (B is stored transposed in shared
    // memory: written at [ty][tx], read back row-wise at [ty][..]);
    // out-of-range entries become 0:
    tileA[tx * pad_stride + ty] = ((panel * tile_dim + ty < A_row_size) && (blockIdx.x * tile_dim + tx < A_col_size)) ? A[a_pos + a_thread] : 0;
    tileB[ty * pad_stride + tx] = ((panel * tile_dim + tx < B_col_size) && (blockIdx.y * tile_dim + ty < B_row_size)) ? B[b_pos + b_thread] : 0;
    __syncthreads();

    const NumericT * a_line = tileA + tx * pad_stride;
    const NumericT * b_line = tileB + ty * pad_stride;
#pragma unroll
    for (unsigned int k = 0; k < 16; ++k)
      dot += a_line[k] * b_line[k];
    __syncthreads();   // tiles are overwritten in the next iteration

    a_pos += a_step;
    b_pos += b_step;
  }

  const vcl_size_t c_row = blockIdx.x * blockDim.x + threadIdx.x;   // indexes columns of A
  const vcl_size_t c_col = blockIdx.y * blockDim.y + threadIdx.y;   // indexes rows of B
  if (c_row < A_col_size && c_col < B_row_size)
  {
    // C is column-major:
    const vcl_size_t c_idx = c_row * C_row_inc + C_row_start + (c_col * C_col_inc + C_col_start) * C_internal_rows;
    // For beta == 0 the previous content of C is never read (it may be uninitialized).
    C[c_idx] = (beta == 0) ? alpha * dot : alpha * dot + beta * C[c_idx];
  }
}
2523 
2524 
2525 
2526 
2527 
2529 
2530 
2531 
2532 
2533 // matrix-matrix multiplication C = A * B
2534 // matrix layouts: C...row_major, A...row_major, B...row_major
2535 template<typename NumericT>
2537  NumericT alpha,
2538  const NumericT * A,
2539  unsigned int A_row_start,
2540  unsigned int A_col_start,
2541  unsigned int A_row_inc,
2542  unsigned int A_col_inc,
2543  unsigned int A_row_size,
2544  unsigned int A_col_size,
2545  unsigned int A_internal_rows,
2546  unsigned int A_internal_cols,
2547  const NumericT * B,
2548  unsigned int B_row_start,
2549  unsigned int B_col_start,
2550  unsigned int B_row_inc,
2551  unsigned int B_col_inc,
2552  unsigned int B_row_size,
2553  unsigned int B_col_size,
2554  unsigned int B_internal_rows,
2555  unsigned int B_internal_cols,
2556  NumericT beta,
2557  NumericT * C,
2558  unsigned int C_row_start,
2559  unsigned int C_col_start,
2560  unsigned int C_row_inc,
2561  unsigned int C_col_inc,
2562  unsigned int C_row_size,
2563  unsigned int C_col_size,
2564  unsigned int C_internal_rows,
2565  unsigned int C_internal_cols)
2566 {
2567 
2568  __shared__ NumericT bufA[272];
2569  __shared__ NumericT bufB[272];
2570 
2571  vcl_size_t block_size = 16;//get_local_size(0);
2572  vcl_size_t row_block_id = blockIdx.x;
2573  vcl_size_t col_block_id = blockIdx.y;
2574  vcl_size_t row_thread_id = threadIdx.x;
2575  vcl_size_t col_thread_id = threadIdx.y;
2576  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
2577  vcl_size_t aStep = block_size * A_col_inc;
2578  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
2579  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
2580  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
2581  NumericT Csub = 0;
2582  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
2583  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
2584 
2585  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
2586  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
2587  for (vcl_size_t block = 0;
2588  block < block_num;
2589  ++block)
2590  {
2591  bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
2592  bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
2593  __syncthreads();
2594  NumericT * bufAptr = bufA + row_thread_id_times_block_size;
2595  NumericT * bufBptr = bufB + col_thread_id_times_block_size;
2596  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2597  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2598  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2599  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2600  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2601  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2602  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2603  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2604  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2605  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2606  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2607  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2608  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2609  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2610  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2611  Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
2612  __syncthreads();
2613  aBegin += aStep;
2614  bBegin += bStep;
2615  }
2616  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
2617  C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
2618 }
2619 
// matrix-matrix multiplication C = A * B^T
// matrix layouts: C...row_major, A...row_major, B...row_major
//
// Tiled shared-memory GEMM kernel. Each thread block must be 16x16; one block
// computes one 16x16 tile of C. Tiles of A and B are staged in shared memory,
// multiplied with a fully unrolled inner product, and accumulated per thread.
// *_start/*_inc/*_size describe the submatrix view; *_internal_* are the padded
// storage dimensions. C is overwritten with alpha*A*B^T (+ beta*C when beta != 0).
template<typename NumericT>
__global__ void matrix_matrix_row_row_row_prod_AT_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{

  // 16 x 17 tiles (272 entries): rows are stored with a stride of
  // block_size + 1, so the one-element padding avoids shared-memory
  // bank conflicts when threads walk a tile row.
  __shared__ NumericT bufA[272];
  __shared__ NumericT bufB[272];

  vcl_size_t block_size = 16;//get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  // A is traversed along its columns (aStep advances by block_size columns);
  // B is also traversed along its columns, but its *rows* are selected by the
  // C-column block index, which realizes the B^T access pattern.
  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
  vcl_size_t aStep = block_size * A_col_inc;
  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
  vcl_size_t bStep = block_size * B_col_inc;
  // Number of 16-wide tiles along the reduction dimension (columns of A).
  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
  NumericT Csub = 0;
  // Per-thread offset of the element this thread loads within each tile.
  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0;
       block < block_num;
       ++block)
  {
    // Stage one tile of A and one tile of B; out-of-range entries are zero-padded
    // so the unrolled inner product below needs no bounds checks.
    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();
    // Fully unrolled dot product over the 16-element tile rows.
    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    // Barrier before the next iteration overwrites the tiles.
    __syncthreads();
    aBegin += aStep;
    bBegin += bStep;
  }
  // Write-back with bounds check: C = A * B^T has A_row_size rows and
  // B_row_size columns. beta == 0 skips the read of C (avoids NaN propagation
  // from uninitialized C).
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
}
2706 
// matrix-matrix multiplication C = A^T * B
// matrix layouts: C...row_major, A...row_major, B...row_major
//
// Tiled shared-memory GEMM kernel. Each thread block must be 16x16; one block
// computes one 16x16 tile of C. Tiles of A and B are staged in shared memory,
// multiplied with a fully unrolled inner product, and accumulated per thread.
// *_start/*_inc/*_size describe the submatrix view; *_internal_* are the padded
// storage dimensions. C is overwritten with alpha*A^T*B (+ beta*C when beta != 0).
template<typename NumericT>
__global__ void matrix_matrix_row_row_row_prod_TA_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{

  // 16 x 17 tiles (272 entries): rows are stored with a stride of
  // block_size + 1, so the one-element padding avoids shared-memory
  // bank conflicts when threads walk a tile row.
  __shared__ NumericT bufA[272];
  __shared__ NumericT bufB[272];

  vcl_size_t block_size = 16;//get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  // A's *columns* are selected by the C-row block index and aStep advances
  // down A's rows — this realizes the A^T access pattern. B's columns are
  // selected by the C-column block index; bStep also advances down B's rows.
  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
  vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
  // Number of 16-wide tiles along the reduction dimension (rows of A).
  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
  NumericT Csub = 0;
  // Per-thread offset of the element this thread loads within each tile.
  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0;
       block < block_num;
       ++block)
  {
    // Stage the tiles transposed in shared memory (index [x*17 + y]) so that
    // the unrolled loop below reads both buffers contiguously. Out-of-range
    // entries are zero-padded, removing bounds checks from the inner product.
    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();
    // Fully unrolled dot product over the 16-element tile rows.
    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    // Barrier before the next iteration overwrites the tiles.
    __syncthreads();
    aBegin += aStep;
    bBegin += bStep;
  }
  // Write-back with bounds check: C = A^T * B has A_col_size rows and
  // B_col_size columns. beta == 0 skips the read of C (avoids NaN propagation
  // from uninitialized C).
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
}
2793 
// matrix-matrix multiplication C = A^T * B^T
// matrix layouts: C...row_major, A...row_major, B...row_major
//
// Tiled shared-memory GEMM kernel. Each thread block must be 16x16; one block
// computes one 16x16 tile of C. Tiles of A and B are staged in shared memory,
// multiplied with a fully unrolled inner product, and accumulated per thread.
// *_start/*_inc/*_size describe the submatrix view; *_internal_* are the padded
// storage dimensions. C is overwritten with alpha*A^T*B^T (+ beta*C when beta != 0).
template<typename NumericT>
__global__ void matrix_matrix_row_row_row_prod_TT_kernel(
          NumericT alpha,
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          NumericT beta,
          NumericT * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{

  // 16 x 17 tiles (272 entries): rows are stored with a stride of
  // block_size + 1, so the one-element padding avoids shared-memory
  // bank conflicts when threads walk a tile row.
  __shared__ NumericT bufA[272];
  __shared__ NumericT bufB[272];

  vcl_size_t block_size = 16;//get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  // A's *columns* are selected by the C-row block index and aStep advances
  // down A's rows (A^T access). B's *rows* are selected by the C-column block
  // index and bStep advances along B's columns (B^T access).
  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
  vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
  vcl_size_t bStep = block_size * B_col_inc;
  // Number of 16-wide tiles along the reduction dimension (rows of A).
  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
  NumericT Csub = 0;
  // Per-thread offset of the element this thread loads within each tile.
  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0;
       block < block_num;
       ++block)
  {
    // Stage the A tile transposed ([x*17 + y]) and the B tile directly
    // ([y*17 + x]); together with the read pointers below this yields
    // contiguous shared-memory reads in the unrolled loop. Out-of-range
    // entries are zero-padded, removing bounds checks from the inner product.
    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();
    // Fully unrolled dot product over the 16-element tile rows.
    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    // Barrier before the next iteration overwrites the tiles.
    __syncthreads();
    aBegin += aStep;
    bBegin += bStep;
  }
  // Write-back with bounds check: C = A^T * B^T has A_col_size rows and
  // B_row_size columns. beta == 0 skips the read of C (avoids NaN propagation
  // from uninitialized C).
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
}
2880 
2881 
2882 } // namespace cuda
2883 } //namespace linalg
2884 } //namespace viennacl
2885 
2886 
2887 #endif
__global__ void matrix_matrix_col_col_row_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_col_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_col_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_row_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_row_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_row_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_col_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_col_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_col_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_col_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_row_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
This file provides the forward declarations for the main types used within ViennaCL.
__global__ void matrix_matrix_col_row_col_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_row_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_row_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_col_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_col_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
float NumericT
Definition: bisect.cpp:40
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
Definition: cpu_ram.hpp:34
__global__ void matrix_matrix_col_col_row_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_row_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_col_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_col_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
std::size_t vcl_size_t
Definition: forwards.h:75
__global__ void matrix_matrix_row_row_row_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_row_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_col_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_row_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_col_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_row_row_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_col_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_row_prod_TT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_col_col_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_col_row_col_prod_AA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_row_prod_TA_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)
__global__ void matrix_matrix_row_col_row_prod_AT_kernel(NumericT alpha, const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *B, unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc, unsigned int B_row_size, unsigned int B_col_size, unsigned int B_internal_rows, unsigned int B_internal_cols, NumericT beta, NumericT *C, unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_row_size, unsigned int C_col_size, unsigned int C_internal_rows, unsigned int C_internal_cols)