#ifndef VIENNACL_LINALG_KERNELS_COMPRESSED_MATRIX_SOURCE_HPP_
#define VIENNACL_LINALG_KERNELS_COMPRESSED_MATRIX_SOURCE_HPP_

namespace viennacl
{
namespace linalg
{
namespace kernels
{
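// Sparse matrix-vector product for CSR data padded to a multiple of 4 entries per row:
// column indices and nonzeros are loaded as uint4/float4 and combined via dot().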
const char * const compressed_matrix_align4_vec_mul =
"__kernel void vec_mul(\n"
"  __global const unsigned int * row_indices,\n"
"  __global const uint4 * column_indices,\n"
"  __global const float4 * elements,\n"
"  __global const float * vector,\n"
"  __global float * result,\n"
"  unsigned int size)\n"
"{\n"
"  float dot_prod;\n"
"  unsigned int start, next_stop;\n"
"  uint4 col_idx;\n"
"  float4 tmp_vec;\n"
"  float4 tmp_entries;\n"
"  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
"  {\n"
"    dot_prod = 0.0f;\n"
"    start = row_indices[row] / 4;\n"
"    next_stop = row_indices[row+1] / 4;\n"
"    for (unsigned int i = start; i < next_stop; ++i)\n"
"    {\n"
"      col_idx = column_indices[i];\n"
"      tmp_entries = elements[i];\n"
"      tmp_vec.x = vector[col_idx.x];\n"
"      tmp_vec.y = vector[col_idx.y];\n"
"      tmp_vec.z = vector[col_idx.z];\n"
"      tmp_vec.w = vector[col_idx.w];\n"
"      dot_prod += dot(tmp_entries, tmp_vec);\n"
"    }\n"
"    result[row] = dot_prod;\n"
"  }\n"
"}\n"
;

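// One damped Jacobi sweep: new_result = weight * (rhs - sum of off-diagonal contributions) / diag
// + (1 - weight) * old_result. Note that 'diag' is only updated when a diagonal entry is found,
// so every row is assumed to store its diagonal.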
const char * const compressed_matrix_align1_jacobi =
"__kernel void jacobi(\n"
"  __global const unsigned int * row_indices,\n"
"  __global const unsigned int * column_indices,\n"
"  __global const float * elements,\n"
"  float weight,\n"
"  __global const float * old_result,\n"
"  __global float * new_result,\n"
"  __global const float * rhs,\n"
"  unsigned int size)\n"
"{\n"
"  float sum, diag = 1;\n"
"  int col;\n"
"  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
"  {\n"
"    sum = 0;\n"
"    for (unsigned int j = row_indices[i]; j < row_indices[i+1]; j++)\n"
"    {\n"
"      col = column_indices[j];\n"
"      if (i == col)\n"
"        diag = elements[j];\n"
"      else\n"
"        sum += elements[j] * old_result[col];\n"
"    }\n"
"    new_result[i] = weight * (rhs[i] - sum) / diag + (1 - weight) * old_result[i];\n"
"  }\n"
"}\n"
;

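// Plain CSR sparse matrix-vector product: each work-item accumulates one row at a time.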
const char * const compressed_matrix_align1_vec_mul =
"__kernel void vec_mul(\n"
"  __global const unsigned int * row_indices,\n"
"  __global const unsigned int * column_indices,\n"
"  __global const float * elements,\n"
"  __global const float * vector,\n"
"  __global float * result,\n"
"  unsigned int size)\n"
"{\n"
"  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
"  {\n"
"    float dot_prod = 0.0f;\n"
"    unsigned int row_end = row_indices[row+1];\n"
"    for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
"      dot_prod += elements[i] * vector[column_indices[i]];\n"
"    result[row] = dot_prod;\n"
"  }\n"
"}\n"
;

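// Row scaling setup, variant 2: writes the reciprocal of each row's Euclidean (l2) norm to diag_M_inv.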
const char * const compressed_matrix_align1_row_scaling_2 =
"__kernel void row_scaling_2(\n"
"  __global const unsigned int * row_indices,\n"
"  __global const unsigned int * column_indices,\n"
"  __global const float * elements,\n"
"  __global float * diag_M_inv,\n"
"  unsigned int size)\n"
"{\n"
"  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
"  {\n"
"    float dot_prod = 0.0f;\n"
"    float temp = 0.0f;\n"
"    unsigned int row_end = row_indices[row+1];\n"
"    for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
"    {\n"
"      temp = elements[i];\n"
"      dot_prod += temp * temp;\n"
"    }\n"
"    diag_M_inv[row] = 1.0f / sqrt(dot_prod);\n"
"  }\n"
"}\n"
;

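// Jacobi preconditioner setup: writes the reciprocal of each diagonal entry to diag_M_inv
// (rows without a stored diagonal fall back to 1.0f).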
const char * const compressed_matrix_align1_jacobi_precond =
"__kernel void jacobi_precond(\n"
"  __global const unsigned int * row_indices,\n"
"  __global const unsigned int * column_indices,\n"
"  __global const float * elements,\n"
"  __global float * diag_M_inv,\n"
"  unsigned int size)\n"
"{\n"
"  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
"  {\n"
"    float diag = 1.0f;\n"
"    unsigned int row_end = row_indices[row+1];\n"
"    for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
"    {\n"
"      if (row == column_indices[i])\n"
"      {\n"
"        diag = elements[i];\n"
"        break;\n"
"      }\n"
"    }\n"
"    diag_M_inv[row] = 1.0f / diag;\n"
"  }\n"
"}\n"
;

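// Row scaling setup, variant 1: writes the reciprocal of each row's l1 norm to diag_M_inv.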
const char * const compressed_matrix_align1_row_scaling_1 =
"__kernel void row_scaling_1(\n"
"  __global const unsigned int * row_indices,\n"
"  __global const unsigned int * column_indices,\n"
"  __global const float * elements,\n"
"  __global float * diag_M_inv,\n"
"  unsigned int size)\n"
"{\n"
"  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
"  {\n"
"    float dot_prod = 0.0f;\n"
"    unsigned int row_end = row_indices[row+1];\n"
"    for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
"      dot_prod += fabs(elements[i]);\n"
"    diag_M_inv[row] = 1.0f / dot_prod;\n"
"  }\n"
"}\n"
;

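// Forward substitution Ly = z for an ILU factor stored in CSR format. Rows are processed in blocks
// of get_global_size(0); a flag array in local memory records which rows of the current block are
// already resolved, so dependent rows wait until their predecessors have been substituted.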
const char * const compressed_matrix_align1_lu_forward =
"\n"
"// compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format\n"
"__kernel void lu_forward(\n"
"  __global const unsigned int * row_indices,\n"
"  __global const unsigned int * column_indices,\n"
"  __global const float * elements,\n"
"  __local int * buffer,\n"
"  __local float * vec_entries,   //a memory block from vector\n"
"  __global float * vector,\n"
"  unsigned int size)\n"
"{\n"
"  int waiting_for;   //block index that must be finished before the current thread can start\n"
"  unsigned int waiting_for_index;\n"
"  int block_offset;\n"
"  unsigned int col;\n"
"  unsigned int row;\n"
"  unsigned int row_index_end;\n"
"\n"
"  //forward substitution: one thread per row in blocks of get_global_size(0)\n"
"  for (unsigned int block_num = 0; block_num <= size / get_global_size(0); ++block_num)\n"
"  {\n"
"    block_offset = block_num * get_global_size(0);\n"
"    row = block_offset + get_global_id(0);\n"
"    buffer[get_global_id(0)] = 0;   //set flag to 'undone'\n"
"    waiting_for = -1;\n"
"    if (row < size)\n"
"    {\n"
"      vec_entries[get_global_id(0)] = vector[row];\n"
"      waiting_for_index = row_indices[row];\n"
"      row_index_end = row_indices[row+1];\n"
"    }\n"
"\n"
"    if (get_global_id(0) == 0)\n"
"      buffer[get_global_size(0)] = 1;\n"
"    //try to eliminate all lines in the block.\n"
"    //in worst case scenarios, in each step only one line can be substituted, thus loop\n"
"    for (unsigned int k = 0; k < get_global_size(0); ++k)\n"
"    {\n"
"      barrier(CLK_LOCAL_MEM_FENCE);\n"
"      if (row < size)   //valid index?\n"
"      {\n"
"        if (waiting_for >= 0)\n"
"        {\n"
"          if (buffer[waiting_for] == 1)\n"
"            waiting_for = -1;\n"
"        }\n"
"\n"
"        if (waiting_for == -1)   //substitution not yet done, check whether possible\n"
"        {\n"
"          //check whether reduction is possible:\n"
"          for (unsigned int j = waiting_for_index; j < row_index_end; ++j)\n"
"          {\n"
"            col = column_indices[j];\n"
"            if (col < block_offset)   //index valid, but not from current block\n"
"              vec_entries[get_global_id(0)] -= elements[j] * vector[col];\n"
"            else if (col < row)   //index is from current block\n"
"            {\n"
"              if (buffer[col - block_offset] == 0)   //entry is not yet calculated\n"
"              {\n"
"                waiting_for = col - block_offset;\n"
"                waiting_for_index = j;\n"
"                break;\n"
"              }\n"
"              else   //updated entry is available in shared memory:\n"
"                vec_entries[get_global_id(0)] -= elements[j] * vec_entries[col - block_offset];\n"
"            }\n"
"          }\n"
"\n"
"          if (waiting_for == -1)   //this row is done\n"
"          {\n"
"            buffer[get_global_id(0)] = 1;\n"
"            waiting_for = -2;   //magic number: thread is finished\n"
"          }\n"
"        }\n"
"      } //row < size\n"
"      else\n"
"        buffer[get_global_id(0)] = 1;   //work done (because there is no work to be done at all...)\n"
"      ///////// check whether all threads are done. If yes, exit loop /////////////\n"
"\n"
"      if (buffer[get_global_id(0)] == 0)\n"
"        buffer[get_global_size(0)] = 0;\n"
"      barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"      if (buffer[get_global_size(0)] > 0)   //all threads break this loop simultaneously\n"
"        break;\n"
"      if (get_global_id(0) == 0)\n"
"        buffer[get_global_size(0)] = 1;\n"
"    } //for k\n"
"\n"
"    //write to vector:\n"
"    if (row < size)\n"
"      vector[row] = vec_entries[get_global_id(0)];\n"
"\n"
"    barrier(CLK_GLOBAL_MEM_FENCE);\n"
"  } //for block_num\n"
"}\n"
;

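// Fused second BiCGStab update: computes omega = <tmp1, s> / <tmp1, tmp1> (tmp1 presumably holding
// the product A*s), updates result, residual and p, and stores the new <residual, r0*> and the
// squared residual norm. Reductions use local memory and the loops stride by get_local_size(0).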
const char * const compressed_matrix_align1_bicgstab_kernel2 =
"void helper_bicgstab_kernel2_parallel_reduction( __local float * tmp_buffer )\n"
"{\n"
"  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
"  {\n"
"    barrier(CLK_LOCAL_MEM_FENCE);\n"
"    if (get_local_id(0) < stride)\n"
"      tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
"  }\n"
"}\n"
"//////// inner products:\n"
"float bicgstab_kernel2_inner_prod(\n"
"  __global const float * vec1,\n"
"  __global const float * vec2,\n"
"  unsigned int size,\n"
"  __local float * tmp_buffer)\n"
"{\n"
"  float tmp = 0;\n"
"  unsigned int i_end = ((size - 1) / get_local_size(0) + 1) * get_local_size(0);\n"
"  for (unsigned int i = get_local_id(0); i < i_end; i += get_local_size(0))\n"
"  {\n"
"    if (i < size)\n"
"      tmp += vec1[i] * vec2[i];\n"
"  }\n"
"  tmp_buffer[get_local_id(0)] = tmp;\n"
"\n"
"  helper_bicgstab_kernel2_parallel_reduction(tmp_buffer);\n"
"  barrier(CLK_LOCAL_MEM_FENCE);\n"
"  return tmp_buffer[0];\n"
"}\n"
"__kernel void bicgstab_kernel2(\n"
"  __global const float * tmp0,\n"
"  __global const float * tmp1,\n"
"  __global const float * r0star,\n"
"  __global const float * s,\n"
"  __global float * p,\n"
"  __global float * result,\n"
"  __global float * residual,\n"
"  __global const float * alpha,\n"
"  __global float * ip_rr0star,\n"
"  __global float * error_estimate,\n"
"  __local float * tmp_buffer,\n"
"  unsigned int size)\n"
"{\n"
"  float omega_local = bicgstab_kernel2_inner_prod(tmp1, s, size, tmp_buffer) / bicgstab_kernel2_inner_prod(tmp1, tmp1, size, tmp_buffer);\n"
"  float alpha_local = alpha[0];\n"
"\n"
"  //result += alpha * p + omega * s;\n"
"  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
"    result[i] += alpha_local * p[i] + omega_local * s[i];\n"
"  //residual = s - omega * tmp1;\n"
"  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
"    residual[i] = s[i] - omega_local * tmp1[i];\n"
"  //new_ip_rr0star = viennacl::linalg::inner_prod(residual, r0star);\n"
"  float new_ip_rr0star = bicgstab_kernel2_inner_prod(residual, r0star, size, tmp_buffer);\n"
"  float beta = (new_ip_rr0star / ip_rr0star[0]) * (alpha_local / omega_local);\n"
"\n"
"  //p = residual + beta * (p - omega*tmp0);\n"
"  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
"    p[i] = residual[i] + beta * (p[i] - omega_local * tmp0[i]);\n"
"  //compute norm of residual:\n"
"  float new_error_estimate = bicgstab_kernel2_inner_prod(residual, residual, size, tmp_buffer);\n"
"  barrier(CLK_GLOBAL_MEM_FENCE);\n"
"  //update values:\n"
"  if (get_global_id(0) == 0)\n"
"  {\n"
"    error_estimate[0] = new_error_estimate;\n"
"    ip_rr0star[0] = new_ip_rr0star;\n"
"  }\n"
"}\n"
;

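// Backward substitution Ux = y for an ILU factor stored in CSR format; mirrors lu_forward, but the
// blocks are traversed from last to first and each finished row is divided by its diagonal entry.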
const char * const compressed_matrix_align1_lu_backward =
"// compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format\n"
"__kernel void lu_backward(\n"
"  __global const unsigned int * row_indices,\n"
"  __global const unsigned int * column_indices,\n"
"  __global const float * elements,\n"
"  __local int * buffer,\n"
"  __local float * vec_entries,   //a memory block from vector\n"
"  __global float * vector,\n"
"  unsigned int size)\n"
"{\n"
"  int waiting_for;   //block index that must be finished before the current thread can start\n"
"  unsigned int waiting_for_index;\n"
"  unsigned int block_offset;\n"
"  unsigned int col;\n"
"  unsigned int row;\n"
"  unsigned int row_index_end;\n"
"  float diagonal_entry = 42;\n"
"\n"
"  //backward substitution: one thread per row in blocks of get_global_size(0)\n"
"  for (int block_num = size / get_global_size(0); block_num > -1; --block_num)\n"
"  {\n"
"    block_offset = block_num * get_global_size(0);\n"
"    row = block_offset + get_global_id(0);\n"
"    buffer[get_global_id(0)] = 0;   //set flag to 'undone'\n"
"    waiting_for = -1;\n"
"\n"
"    if (row < size)\n"
"    {\n"
"      vec_entries[get_global_id(0)] = vector[row];\n"
"      waiting_for_index = row_indices[row];\n"
"      row_index_end = row_indices[row+1];\n"
"      diagonal_entry = column_indices[waiting_for_index];\n"
"    }\n"
"\n"
"    if (get_global_id(0) == 0)\n"
"      buffer[get_global_size(0)] = 1;\n"
"    //try to eliminate all lines in the block.\n"
"    //in worst case scenarios, in each step only one line can be substituted, thus loop\n"
"    for (unsigned int k = 0; k < get_global_size(0); ++k)\n"
"    {\n"
"      barrier(CLK_LOCAL_MEM_FENCE);\n"
"      if (row < size)   //valid index?\n"
"      {\n"
"        if (waiting_for >= 0)\n"
"        {\n"
"          if (buffer[waiting_for] == 1)\n"
"            waiting_for = -1;\n"
"        }\n"
"\n"
"        if (waiting_for == -1)   //substitution not yet done, check whether possible\n"
"        {\n"
"          //check whether reduction is possible:\n"
"          for (unsigned int j = waiting_for_index; j < row_index_end; ++j)\n"
"          {\n"
"            col = column_indices[j];\n"
"            barrier(CLK_LOCAL_MEM_FENCE);\n"
"            if (col >= block_offset + get_global_size(0))   //index valid, but not from current block\n"
"              vec_entries[get_global_id(0)] -= elements[j] * vector[col];\n"
"            else if (col > row)   //index is from current block\n"
"            {\n"
"              if (buffer[col - block_offset] == 0)   //entry is not yet calculated\n"
"              {\n"
"                waiting_for = col - block_offset;\n"
"                waiting_for_index = j;\n"
"                break;\n"
"              }\n"
"              else   //updated entry is available in shared memory:\n"
"                vec_entries[get_global_id(0)] -= elements[j] * vec_entries[col - block_offset];\n"
"            }\n"
"            else if (col == row)\n"
"              diagonal_entry = elements[j];\n"
"          }\n"
"\n"
"          if (waiting_for == -1)   //this row is done\n"
"          {\n"
"            if (row == 0)\n"
"              vec_entries[get_global_id(0)] /= elements[0];\n"
"            else\n"
"              vec_entries[get_global_id(0)] /= diagonal_entry;\n"
"            buffer[get_global_id(0)] = 1;\n"
"            waiting_for = -2;   //magic number: thread is finished\n"
"          }\n"
"        }\n"
"      } //row < size\n"
"      else\n"
"        buffer[get_global_id(0)] = 1;   //work done (because there is no work to be done at all...)\n"
"\n"
"      ///////// check whether all threads are done. If yes, exit loop /////////////\n"
"      if (buffer[get_global_id(0)] == 0)\n"
"        buffer[get_global_size(0)] = 0;\n"
"      barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
"      if (buffer[get_global_size(0)] > 0)   //all threads break the loop simultaneously\n"
"        break;\n"
"      if (get_global_id(0) == 0)\n"
"        buffer[get_global_size(0)] = 1;\n"
"    } //for k\n"
"    if (row < size)\n"
"      vector[row] = vec_entries[get_global_id(0)];\n"
"      //vector[row] = diagonal_entry;\n"
"\n"
"    //if (row == 0)\n"
"      //vector[0] = diagonal_entry;\n"
"      //vector[0] = elements[0];\n"
"    barrier(CLK_GLOBAL_MEM_FENCE);\n"
"  } //for block_num\n"
"}\n"
;

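// First BiCGStab update: computes alpha = <r, r0*> / <tmp0, r0*> (tmp0 presumably holding the
// product A*p) and s = residual - alpha * tmp0.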
const char * const compressed_matrix_align1_bicgstab_kernel1 =
"void helper_bicgstab_kernel1_parallel_reduction( __local float * tmp_buffer )\n"
"{\n"
"  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
"  {\n"
"    barrier(CLK_LOCAL_MEM_FENCE);\n"
"    if (get_local_id(0) < stride)\n"
"      tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
"  }\n"
"}\n"
"//////// inner products:\n"
"float bicgstab_kernel1_inner_prod(\n"
"  __global const float * vec1,\n"
"  __global const float * vec2,\n"
"  unsigned int size,\n"
"  __local float * tmp_buffer)\n"
"{\n"
"  float tmp = 0;\n"
"  unsigned int i_end = ((size - 1) / get_local_size(0) + 1) * get_local_size(0);\n"
"  for (unsigned int i = get_local_id(0); i < i_end; i += get_local_size(0))\n"
"  {\n"
"    if (i < size)\n"
"      tmp += vec1[i] * vec2[i];\n"
"  }\n"
"  tmp_buffer[get_local_id(0)] = tmp;\n"
"\n"
"  helper_bicgstab_kernel1_parallel_reduction(tmp_buffer);\n"
"  barrier(CLK_LOCAL_MEM_FENCE);\n"
"  return tmp_buffer[0];\n"
"}\n"
"__kernel void bicgstab_kernel1(\n"
"  __global const float * tmp0,\n"
"  __global const float * r0star,\n"
"  __global const float * residual,\n"
"  __global float * s,\n"
"  __global float * alpha,\n"
"  __global const float * ip_rr0star,\n"
"  __local float * tmp_buffer,\n"
"  unsigned int size)\n"
"{\n"
"  float alpha_local = ip_rr0star[0] / bicgstab_kernel1_inner_prod(tmp0, r0star, size, tmp_buffer);\n"
"\n"
"  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
"    s[i] = residual[i] - alpha_local * tmp0[i];\n"
"\n"
"  if (get_global_id(0) == 0)\n"
"    alpha[0] = alpha_local;\n"
"}\n"
;

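// Sparse matrix-vector product for CSR data padded to a multiple of 8 entries per row:
// column indices and nonzeros are loaded as uint8/float8 and combined via dot() on the .lo/.hi halves.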
const char * const compressed_matrix_align8_vec_mul =
"__kernel void vec_mul(\n"
"  __global const unsigned int * row_indices,\n"
"  __global const uint8 * column_indices,\n"
"  __global const float8 * elements,\n"
"  __global const float * vector,\n"
"  __global float * result,\n"
"  unsigned int size)\n"
"{\n"
"  float dot_prod;\n"
"  unsigned int start, next_stop;\n"
"  uint8 col_idx;\n"
"  float8 tmp_vec;\n"
"  float8 tmp_entries;\n"
"  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
"  {\n"
"    dot_prod = 0.0f;\n"
"    start = row_indices[row] / 8;\n"
"    next_stop = row_indices[row+1] / 8;\n"
"    for (unsigned int i = start; i < next_stop; ++i)\n"
"    {\n"
"      col_idx = column_indices[i];\n"
"      tmp_entries = elements[i];\n"
"      tmp_vec.s0 = vector[col_idx.s0];\n"
"      tmp_vec.s1 = vector[col_idx.s1];\n"
"      tmp_vec.s2 = vector[col_idx.s2];\n"
"      tmp_vec.s3 = vector[col_idx.s3];\n"
"      tmp_vec.s4 = vector[col_idx.s4];\n"
"      tmp_vec.s5 = vector[col_idx.s5];\n"
"      tmp_vec.s6 = vector[col_idx.s6];\n"
"      tmp_vec.s7 = vector[col_idx.s7];\n"
"      dot_prod += dot(tmp_entries.lo, tmp_vec.lo);\n"
"      dot_prod += dot(tmp_entries.hi, tmp_vec.hi);\n"
"    }\n"
"    result[row] = dot_prod;\n"
"  }\n"
"}\n"
;

}
}
}
#endif