Go to the documentation of this file.00001 #ifndef VIENNACL_LINALG_KERNELS_COORDINATE_MATRIX_SOURCE_HPP_
00002 #define VIENNACL_LINALG_KERNELS_COORDINATE_MATRIX_SOURCE_HPP_
00003
00004 namespace viennacl
00005 {
00006 namespace linalg
00007 {
00008 namespace kernels
00009 {
00010 const char * const coordinate_matrix_align1_vec_mul =
00011 "//segmented parallel reduction. At present restricted to up to 256 threads\n"
00012 "void segmented_parallel_reduction(unsigned int row, \n"
00013 " float val, \n"
00014 " __local unsigned int * shared_rows, \n"
00015 " __local float * inter_results) \n"
00016 "{ \n"
00017 " //barrier(CLK_LOCAL_MEM_FENCE); \n"
00018 " shared_rows[get_local_id(0)] = row; \n"
00019 " inter_results[get_local_id(0)] = val; \n"
00020 " float left = 0;\n"
00021 " \n"
00022 " barrier(CLK_LOCAL_MEM_FENCE); \n"
00023 " if( get_local_id(0) >= 1 && row == shared_rows[get_local_id(0) - 1] ) { left = inter_results[get_local_id(0) - 1]; } \n"
00024 " barrier(CLK_LOCAL_MEM_FENCE); \n"
00025 " inter_results[get_local_id(0)] += left; left = 0;\n"
00026 " barrier(CLK_LOCAL_MEM_FENCE); \n"
00027 " if( get_local_id(0) >= 2 && row == shared_rows[get_local_id(0) - 2] ) { left = inter_results[get_local_id(0) - 2]; } \n"
00028 " barrier(CLK_LOCAL_MEM_FENCE); \n"
00029 " inter_results[get_local_id(0)] += left; left = 0;\n"
00030 " barrier(CLK_LOCAL_MEM_FENCE); \n"
00031 " if( get_local_id(0) >= 4 && row == shared_rows[get_local_id(0) - 4] ) { left = inter_results[get_local_id(0) - 4]; } \n"
00032 " barrier(CLK_LOCAL_MEM_FENCE); \n"
00033 " inter_results[get_local_id(0)] += left; left = 0;\n"
00034 " barrier(CLK_LOCAL_MEM_FENCE); \n"
00035 " if( get_local_id(0) >= 8 && row == shared_rows[get_local_id(0) - 8] ) { left = inter_results[get_local_id(0) - 8]; } \n"
00036 " barrier(CLK_LOCAL_MEM_FENCE); \n"
00037 " inter_results[get_local_id(0)] += left; left = 0;\n"
00038 " barrier(CLK_LOCAL_MEM_FENCE); \n"
00039 " if( get_local_id(0) >= 16 && row == shared_rows[get_local_id(0) - 16] ) { left = inter_results[get_local_id(0) - 16]; } \n"
00040 " barrier(CLK_LOCAL_MEM_FENCE); \n"
00041 " inter_results[get_local_id(0)] += left; left = 0;\n"
00042 " barrier(CLK_LOCAL_MEM_FENCE); \n"
00043 " if( get_local_id(0) >= 32 && row == shared_rows[get_local_id(0) - 32] ) { left = inter_results[get_local_id(0) - 32]; } \n"
00044 " barrier(CLK_LOCAL_MEM_FENCE); \n"
00045 " inter_results[get_local_id(0)] += left; left = 0;\n"
00046 " barrier(CLK_LOCAL_MEM_FENCE); \n"
00047 " if( get_local_id(0) >= 64 && row == shared_rows[get_local_id(0) - 64] ) { left = inter_results[get_local_id(0) - 64]; } \n"
00048 " barrier(CLK_LOCAL_MEM_FENCE); \n"
00049 " inter_results[get_local_id(0)] += left; left = 0;\n"
00050 " barrier(CLK_LOCAL_MEM_FENCE); \n"
00051 " if( get_local_id(0) >= 128 && row == shared_rows[get_local_id(0) - 128] ) { left = inter_results[get_local_id(0) - 128]; } \n"
00052 " barrier(CLK_LOCAL_MEM_FENCE); \n"
00053 " inter_results[get_local_id(0)] += left; left = 0;\n"
00054 " barrier(CLK_LOCAL_MEM_FENCE); \n"
00055 " //if( get_local_id(0) >= 256 && row == shared_rows[get_local_id(0) - 256] ) { left = inter_results[get_local_id(0) - 256]; } \n"
00056 " //barrier(CLK_LOCAL_MEM_FENCE); \n"
00057 " //inter_results[get_local_id(0)] += left; left = 0;\n"
00058 " //barrier(CLK_LOCAL_MEM_FENCE); \n"
00059 "}\n"
00060 "__kernel void vec_mul( \n"
00061 " __global const uint2 * coords, //(row_index, column_index) \n"
00062 " __global const float * elements, \n"
00063 " __global const uint * group_boundaries,\n"
00064 " __global const float * vector, \n"
00065 " __global float * result, \n"
00066 " __local unsigned int * shared_rows, \n"
00067 " __local float * inter_results) \n"
00068 "{ \n"
00069 " uint2 tmp; \n"
00070 " float val;\n"
00071 " uint last_index = get_local_size(0) - 1;\n"
00072 " uint group_start = group_boundaries[get_group_id(0)];\n"
00073 " uint group_end = group_boundaries[get_group_id(0) + 1];\n"
00074 " uint k_end = 1 + (group_end - group_start - 1) / get_local_size(0); // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)\n"
00075 " uint local_index = 0;\n"
00076 " for (uint k = 0; k < k_end; ++k)\n"
00077 " { \n"
00078 " barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); \n"
00079 " \n"
00080 " local_index = group_start + k * get_local_size(0) + get_local_id(0); \n"
00081 " \n"
00082 " if (local_index < group_end)\n"
00083 " {\n"
00084 " tmp = coords[local_index]; \n"
00085 " val = elements[local_index] * vector[tmp.y]; \n"
00086 " }\n"
00087 " else\n"
00088 " {\n"
00089 " tmp.x = 0;\n"
00090 " tmp.y = 0;\n"
00091 " val = 0;\n"
00092 " }\n"
00093 " barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); \n"
00094 " //check for carry from previous loop run: \n"
00095 " if (get_local_id(0) == 0 && k > 0)\n"
00096 " { \n"
00097 " if (tmp.x == shared_rows[last_index]) \n"
00098 " val += inter_results[last_index]; \n"
00099 " else \n"
00100 " result[shared_rows[last_index]] += inter_results[last_index]; \n"
00101 " } \n"
00102 " barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); \n"
00103 " segmented_parallel_reduction(tmp.x, val, shared_rows, inter_results); //all threads have to enter this function\n"
00104 " barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); \n"
00105 " if (get_local_id(0) != last_index &&\n"
00106 " shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1] &&\n"
00107 " inter_results[get_local_id(0)] != 0) \n"
00108 " { \n"
00109 " result[tmp.x] += inter_results[get_local_id(0)]; \n"
00110 " }\n"
00111 " \n"
00112 " barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); \n"
00113 " } //for k\n"
00114 " \n"
00115 " if (get_local_id(0) == last_index && inter_results[last_index] != 0) \n"
00116 " result[tmp.x] += inter_results[last_index]; \n"
00117 "}\n"
00118 ;
00119
00120 }
00121 }
00122 }
00123 #endif