00001 #ifndef VIENNACL_LINALG_KERNELS_MATRIX_PROD_COL_COL_COL_SOURCE_HPP_
00002 #define VIENNACL_LINALG_KERNELS_MATRIX_PROD_COL_COL_COL_SOURCE_HPP_
00003
00004 namespace viennacl
00005 {
00006 namespace linalg
00007 {
00008 namespace kernels
00009 {
00010 const char * const matrix_prod_col_col_col_align1_prod_TT =
00011 "// file automatically generated - do not edit!\n"
00012 "// matrix-matrix multiplication C = A^T * B^T\n"
00013 "// matrix layouts: C...col_major, A...col_major, B...col_major\n"
00014 "__kernel void prod_TT(\n"
00015 " __global const float * A,\n"
00016 " unsigned int A_row_start,\n"
00017 " unsigned int A_col_start,\n"
00018 " unsigned int A_row_size,\n"
00019 " unsigned int A_col_size,\n"
00020 " unsigned int A_internal_rows,\n"
00021 " unsigned int A_internal_cols,\n"
00022 " __global const float * B, \n"
00023 " unsigned int B_row_start,\n"
00024 " unsigned int B_col_start,\n"
00025 " unsigned int B_row_size,\n"
00026 " unsigned int B_col_size,\n"
00027 " unsigned int B_internal_rows,\n"
00028 " unsigned int B_internal_cols,\n"
00029 " __global float * C,\n"
00030 " unsigned int C_row_start,\n"
00031 " unsigned int C_col_start,\n"
00032 " unsigned int C_row_size,\n"
00033 " unsigned int C_col_size,\n"
00034 " unsigned int C_internal_rows,\n"
00035 " unsigned int C_internal_cols,\n"
00036 " __local float * bufA,\n"
00037 " __local float * bufB) \n"
00038 "{ \n"
00039 " size_t block_size = get_local_size(0);\n"
00040 " size_t row_block_id = get_group_id(0);\n"
00041 " size_t col_block_id = get_group_id(1);\n"
00042 " size_t row_thread_id = get_local_id(0);\n"
00043 " size_t col_thread_id = get_local_id(1);\n"
00044 " size_t row_block_id_ = get_local_id(1);\n"
00045 " size_t aBegin = (row_block_id * block_size + A_col_start) * A_internal_rows + A_row_start;\n"
00046 " size_t aStep = block_size;\n"
00047 " size_t bBegin = (col_block_id * block_size + B_row_start) + B_col_start * B_internal_rows;\n"
00048 " size_t bStep = block_size * B_internal_rows;\n"
00049 " size_t block_num = A_row_size / block_size;\n"
00050 " if (block_num * block_size != A_row_size)\n"
00051 " ++block_num;\n"
00052 " float Csub = 0;\n"
00053 " size_t aOffset = row_thread_id * A_internal_rows + col_thread_id;\n"
00054 " size_t bOffset = row_thread_id * B_internal_rows + col_thread_id;\n"
00055 " size_t row_thread_id_times_block_size = row_thread_id * block_size;\n"
00056 " for (size_t block = 0;\n"
00057 " block < block_num;\n"
00058 " ++block)\n"
00059 " {\n"
00060 " bufA[row_thread_id_times_block_size + col_thread_id] = (block * block_size + col_thread_id < A_row_size && get_global_id(0) < A_col_size) ? A[aBegin + aOffset] : 0;\n"
00061 " bufB[col_thread_id * block_size + row_thread_id] = ( (block * block_size + row_thread_id < B_col_size) && get_global_id(1) < B_row_size ) ? B[bBegin + bOffset] : 0;\n"
00062 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00063 "__local float * bufAptr = bufA + row_thread_id_times_block_size;\n"
00064 "__local float * bufBptr = bufB + col_thread_id * block_size;\n"
00065 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00066 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00067 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00068 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00069 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00070 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00071 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00072 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00073 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00074 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00075 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00076 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00077 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00078 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00079 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00080 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00081 " aBegin += aStep;\n"
00082 " bBegin += bStep;\n"
00083 " }\n"
00084 " if (get_global_id(0) < A_col_size && get_global_id(1) < B_row_size)\n"
00085 " C[get_global_id(0) + C_row_start + (get_global_id(1) + C_col_start) * C_internal_rows] = Csub;\n"
00086 "}\n"
00087 ;
00088
00089 const char * const matrix_prod_col_col_col_align1_prod_TA =
00090 "// file automatically generated - do not edit!\n"
00091 "// matrix-matrix multiplication C = A^T * B\n"
00092 "// matrix layouts: C...col_major, A...col_major, B...col_major\n"
00093 "__kernel void prod_TA(\n"
00094 " __global const float * A,\n"
00095 " unsigned int A_row_start,\n"
00096 " unsigned int A_col_start,\n"
00097 " unsigned int A_row_size,\n"
00098 " unsigned int A_col_size,\n"
00099 " unsigned int A_internal_rows,\n"
00100 " unsigned int A_internal_cols,\n"
00101 " __global const float * B, \n"
00102 " unsigned int B_row_start,\n"
00103 " unsigned int B_col_start,\n"
00104 " unsigned int B_row_size,\n"
00105 " unsigned int B_col_size,\n"
00106 " unsigned int B_internal_rows,\n"
00107 " unsigned int B_internal_cols,\n"
00108 " __global float * C,\n"
00109 " unsigned int C_row_start,\n"
00110 " unsigned int C_col_start,\n"
00111 " unsigned int C_row_size,\n"
00112 " unsigned int C_col_size,\n"
00113 " unsigned int C_internal_rows,\n"
00114 " unsigned int C_internal_cols,\n"
00115 " __local float * bufA,\n"
00116 " __local float * bufB) \n"
00117 "{ \n"
00118 " size_t block_size = get_local_size(0);\n"
00119 " size_t row_block_id = get_group_id(0);\n"
00120 " size_t col_block_id = get_group_id(1);\n"
00121 " size_t row_thread_id = get_local_id(0);\n"
00122 " size_t col_thread_id = get_local_id(1);\n"
00123 " size_t row_block_id_ = get_local_id(1);\n"
00124 " size_t aBegin = (row_block_id * block_size + A_col_start) * A_internal_rows + A_row_start;\n"
00125 " size_t aStep = block_size;\n"
00126 " size_t bBegin = (col_block_id * block_size + B_col_start) * B_internal_rows + B_row_start;\n"
00127 " size_t bStep = block_size;\n"
00128 " size_t block_num = A_row_size / block_size;\n"
00129 " if (block_num * block_size != A_row_size)\n"
00130 " ++block_num;\n"
00131 " float Csub = 0;\n"
00132 " size_t aOffset = row_thread_id * A_internal_rows + col_thread_id;\n"
00133 " size_t bOffset = row_thread_id + col_thread_id * B_internal_rows;\n"
00134 " size_t row_thread_id_times_block_size = row_thread_id * block_size;\n"
00135 " for (size_t block = 0;\n"
00136 " block < block_num;\n"
00137 " ++block)\n"
00138 " {\n"
00139 " bufA[row_thread_id_times_block_size + col_thread_id] = (block * block_size + col_thread_id < A_row_size && get_global_id(0) < A_col_size) ? A[aBegin + aOffset] : 0;\n"
00140 " bufB[col_thread_id * block_size + row_thread_id] = ( (block * block_size + row_thread_id < B_row_size) && get_global_id(1) < B_col_size ) ? B[bBegin + bOffset] : 0;\n"
00141 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00142 "__local float * bufAptr = bufA + row_thread_id_times_block_size;\n"
00143 "__local float * bufBptr = bufB + col_thread_id * block_size;\n"
00144 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00145 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00146 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00147 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00148 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00149 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00150 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00151 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00152 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00153 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00154 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00155 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00156 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00157 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00158 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00159 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00160 " aBegin += aStep;\n"
00161 " bBegin += bStep;\n"
00162 " }\n"
00163 " if (get_global_id(0) < A_col_size && get_global_id(1) < B_col_size)\n"
00164 " C[get_global_id(0) + C_row_start + (get_global_id(1) + C_col_start) * C_internal_rows] = Csub;\n"
00165 "}\n"
00166 ;
00167
00168 const char * const matrix_prod_col_col_col_align1_prod_AA =
00169 "// file automatically generated - do not edit!\n"
00170 "// matrix-matrix multiplication C = A * B\n"
00171 "// matrix layouts: C...col_major, A...col_major, B...col_major\n"
00172 "__kernel void prod_AA(\n"
00173 " __global const float * A,\n"
00174 " unsigned int A_row_start,\n"
00175 " unsigned int A_col_start,\n"
00176 " unsigned int A_row_size,\n"
00177 " unsigned int A_col_size,\n"
00178 " unsigned int A_internal_rows,\n"
00179 " unsigned int A_internal_cols,\n"
00180 " __global const float * B, \n"
00181 " unsigned int B_row_start,\n"
00182 " unsigned int B_col_start,\n"
00183 " unsigned int B_row_size,\n"
00184 " unsigned int B_col_size,\n"
00185 " unsigned int B_internal_rows,\n"
00186 " unsigned int B_internal_cols,\n"
00187 " __global float * C,\n"
00188 " unsigned int C_row_start,\n"
00189 " unsigned int C_col_start,\n"
00190 " unsigned int C_row_size,\n"
00191 " unsigned int C_col_size,\n"
00192 " unsigned int C_internal_rows,\n"
00193 " unsigned int C_internal_cols,\n"
00194 " __local float * bufA,\n"
00195 " __local float * bufB) \n"
00196 "{ \n"
00197 " size_t block_size = get_local_size(0);\n"
00198 " size_t row_block_id = get_group_id(0);\n"
00199 " size_t col_block_id = get_group_id(1);\n"
00200 " size_t row_thread_id = get_local_id(0);\n"
00201 " size_t col_thread_id = get_local_id(1);\n"
00202 " size_t row_block_id_ = get_local_id(1);\n"
00203 " size_t aBegin = (row_block_id * block_size + A_row_start) + A_col_start * A_internal_rows;\n"
00204 " size_t aStep = block_size * A_internal_rows;\n"
00205 " size_t bBegin = (col_block_id * block_size + B_col_start) * B_internal_rows + B_row_start;\n"
00206 " size_t bStep = block_size;\n"
00207 " size_t block_num = A_col_size / block_size;\n"
00208 " if (block_num * block_size != A_col_size)\n"
00209 " ++block_num;\n"
00210 " float Csub = 0;\n"
00211 " size_t aOffset = row_thread_id + col_thread_id * A_internal_rows;\n"
00212 " size_t bOffset = row_thread_id + col_thread_id * B_internal_rows;\n"
00213 " size_t row_thread_id_times_block_size = row_thread_id * block_size;\n"
00214 " for (size_t block = 0;\n"
00215 " block < block_num;\n"
00216 " ++block)\n"
00217 " {\n"
00218 " bufA[row_thread_id_times_block_size + col_thread_id] = (block * block_size + col_thread_id < A_col_size && get_global_id(0) < A_row_size) ? A[aBegin + aOffset] : 0;\n"
00219 " bufB[col_thread_id * block_size + row_thread_id] = ( (block * block_size + row_thread_id < B_row_size) && get_global_id(1) < B_col_size ) ? B[bBegin + bOffset] : 0;\n"
00220 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00221 "__local float * bufAptr = bufA + row_thread_id_times_block_size;\n"
00222 "__local float * bufBptr = bufB + col_thread_id * block_size;\n"
00223 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00224 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00225 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00226 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00227 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00228 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00229 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00230 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00231 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00232 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00233 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00234 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00235 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00236 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00237 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00238 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00239 " aBegin += aStep;\n"
00240 " bBegin += bStep;\n"
00241 " }\n"
00242 " if (get_global_id(0) < A_row_size && get_global_id(1) < B_col_size)\n"
00243 " C[get_global_id(0) + C_row_start + (get_global_id(1) + C_col_start) * C_internal_rows] = Csub;\n"
00244 "}\n"
00245 ;
00246
00247 const char * const matrix_prod_col_col_col_align1_prod_AT =
00248 "// file automatically generated - do not edit!\n"
00249 "// matrix-matrix multiplication C = A * B^T\n"
00250 "// matrix layouts: C...col_major, A...col_major, B...col_major\n"
00251 "__kernel void prod_AT(\n"
00252 " __global const float * A,\n"
00253 " unsigned int A_row_start,\n"
00254 " unsigned int A_col_start,\n"
00255 " unsigned int A_row_size,\n"
00256 " unsigned int A_col_size,\n"
00257 " unsigned int A_internal_rows,\n"
00258 " unsigned int A_internal_cols,\n"
00259 " __global const float * B, \n"
00260 " unsigned int B_row_start,\n"
00261 " unsigned int B_col_start,\n"
00262 " unsigned int B_row_size,\n"
00263 " unsigned int B_col_size,\n"
00264 " unsigned int B_internal_rows,\n"
00265 " unsigned int B_internal_cols,\n"
00266 " __global float * C,\n"
00267 " unsigned int C_row_start,\n"
00268 " unsigned int C_col_start,\n"
00269 " unsigned int C_row_size,\n"
00270 " unsigned int C_col_size,\n"
00271 " unsigned int C_internal_rows,\n"
00272 " unsigned int C_internal_cols,\n"
00273 " __local float * bufA,\n"
00274 " __local float * bufB) \n"
00275 "{ \n"
00276 " size_t block_size = get_local_size(0);\n"
00277 " size_t row_block_id = get_group_id(0);\n"
00278 " size_t col_block_id = get_group_id(1);\n"
00279 " size_t row_thread_id = get_local_id(0);\n"
00280 " size_t col_thread_id = get_local_id(1);\n"
00281 " size_t row_block_id_ = get_local_id(1);\n"
00282 " size_t aBegin = (row_block_id * block_size + A_row_start) + A_col_start * A_internal_rows;\n"
00283 " size_t aStep = block_size * A_internal_rows;\n"
00284 " size_t bBegin = (col_block_id * block_size + B_row_start) + B_col_start * B_internal_rows;\n"
00285 " size_t bStep = block_size * B_internal_rows;\n"
00286 " size_t block_num = A_col_size / block_size;\n"
00287 " if (block_num * block_size != A_col_size)\n"
00288 " ++block_num;\n"
00289 " float Csub = 0;\n"
00290 " size_t aOffset = row_thread_id + col_thread_id * A_internal_rows;\n"
00291 " size_t bOffset = row_thread_id * B_internal_rows + col_thread_id;\n"
00292 " size_t row_thread_id_times_block_size = row_thread_id * block_size;\n"
00293 " for (size_t block = 0;\n"
00294 " block < block_num;\n"
00295 " ++block)\n"
00296 " {\n"
00297 " bufA[row_thread_id_times_block_size + col_thread_id] = (block * block_size + col_thread_id < A_col_size && get_global_id(0) < A_row_size) ? A[aBegin + aOffset] : 0;\n"
00298 " bufB[col_thread_id * block_size + row_thread_id] = ( (block * block_size + row_thread_id < B_col_size) && get_global_id(1) < B_row_size ) ? B[bBegin + bOffset] : 0;\n"
00299 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00300 "__local float * bufAptr = bufA + row_thread_id_times_block_size;\n"
00301 "__local float * bufBptr = bufB + col_thread_id * block_size;\n"
00302 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00303 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00304 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00305 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00306 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00307 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00308 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00309 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00310 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00311 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00312 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00313 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00314 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00315 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00316 " Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;\n"
00317 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00318 " aBegin += aStep;\n"
00319 " bBegin += bStep;\n"
00320 " }\n"
00321 " if (get_global_id(0) < A_row_size && get_global_id(1) < B_row_size)\n"
00322 " C[get_global_id(0) + C_row_start + (get_global_id(1) + C_col_start) * C_internal_rows] = Csub;\n"
00323 "}\n"
00324 ;
00325
00326 }
00327 }
00328 }
00329 #endif