1 #ifndef VIENNACL_DEVICE_SPECIFIC_TEMPLATES_ROW_WISE_REDUCTION_HPP
2 #define VIENNACL_DEVICE_SPECIFIC_TEMPLATES_ROW_WISE_REDUCTION_HPP
44 namespace device_specific
50 unsigned int _local_size_0,
unsigned int _local_size_1,
64 return TEMPLATE_INVALID_FETCHING_POLICY_TYPE;
65 return TEMPLATE_VALID;
68 unsigned int n_lmem_elements()
const
80 std::string generate_impl(std::string
const & kernel_prefix,
statements_container const & statements, std::vector<mapping_type>
const & mappings,
unsigned int simd_width,
bool is_trans, std::vector<mapped_row_wise_reduction*>
const & exprs)
const
86 std::string lsize1str =
to_string(lsize1);
91 generate_prototype(stream, kernel_prefix,
"unsigned int M, unsigned int N,", mappings, statements);
92 stream <<
"{" << std::endl;
101 for (std::vector<mapped_row_wise_reduction*>::const_iterator it = exprs.begin(); it != exprs.end(); ++it)
102 stream << (*it)->process(
"__local #scalartype #name_buf[" +
to_string(lsize0*lsize1) +
"];") << std::endl;
104 stream <<
"unsigned int lid0 = get_local_id(0);" << std::endl;
105 stream <<
"unsigned int lid1 = get_local_id(1);" << std::endl;
107 stream <<
"for(unsigned int r = get_global_id(0); r < upper_bound_0; r += get_global_size(0)){" << std::endl;
110 for (std::vector<mapped_row_wise_reduction*>::const_iterator it = exprs.begin(); it != exprs.end(); ++it)
111 stream << (*it)->process(
"#scalartype #name_acc = " +
neutral_element((*it)->root_op()) +
";") << std::endl;
113 stream <<
"if (r < M)" << std::endl;
114 stream <<
"{" << std::endl;
120 loop_body(std::vector<mapped_row_wise_reduction*>
const & _exprs,
bool _is_trans) : exprs(_exprs), is_trans(_is_trans){ }
123 std::set<std::string> already_fetched;
124 for (std::vector<mapped_row_wise_reduction*>::const_iterator it = exprs.begin(); it != exprs.end(); ++it)
127 (*it)->process_recursive(kernel_stream,
LHS_NODE_TYPE,
"matrix_trans",
utils::append_width(
"#scalartype",loop_simd_width) +
" #namereg = " +
vload(loop_simd_width,
"c*#stride1",
"#pointer + r*#ld")+
";", already_fetched);
129 (*it)->process_recursive(kernel_stream,
LHS_NODE_TYPE,
"matrix",
"#scalartype #namereg = #pointer[r*#stride1 + c*#ld];", already_fetched);
130 (*it)->process_recursive(kernel_stream,
RHS_NODE_TYPE,
"vector",
utils::append_width(
"#scalartype",loop_simd_width) +
" #namereg = " +
vload(loop_simd_width,
"c*#stride",
"#pointer")+
";", already_fetched);
135 std::vector<std::string> str(loop_simd_width);
136 if (loop_simd_width==1)
139 for (
unsigned int a = 0; a < loop_simd_width; ++a)
143 for (
unsigned int k = 0; k < exprs.size(); ++k)
145 for (
unsigned int a = 0; a < loop_simd_width; ++a)
147 std::map<std::string, std::string> accessors;
149 accessors[
"matrix_trans"] = str[a];
151 accessors[
"matrix"] = str[a];
152 accessors[
"vector"] = str[a];
153 accessors[
"scalar"] =
"#namereg";
154 std::string value = exprs[k]->evaluate_recursive(
LHS_NODE_TYPE, accessors);
156 value+=
"*" + exprs[k]->evaluate_recursive(
RHS_NODE_TYPE, accessors);
166 std::vector<mapped_row_wise_reduction*> exprs;
170 element_wise_loop_1D(stream, loop_body(exprs, is_trans),
p_.fetch_policy, simd_width,
"c",
"N",
"get_local_id(1)",
"get_local_size(1)");
172 stream <<
"}" << std::endl;
174 for (
unsigned int k = 0; k < exprs.size(); ++k)
175 stream << exprs[k]->
process(
"#name_buf[lid0*" + lsize1str +
"+ lid1] = #name_acc;") << std::endl;
177 stream <<
"#pragma unroll" << std::endl;
178 stream <<
"for(unsigned int stride = " <<
p_.
local_size_1/2 <<
"; stride >0; stride /=2)" << std::endl;
179 stream <<
"{" << std::endl;
182 stream <<
"barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
183 stream <<
"if (lid1 < stride)" << std::endl;
184 stream <<
"{" << std::endl;
187 for (
unsigned int k = 0; k < exprs.size(); k++)
189 compute_index_reduction(stream, exprs[k]->
process(
"#name_buf[lid0*" + lsize1str +
" + lid1]"), exprs[k]->process(
"#name_buf[lid0*" + lsize1str +
" + lid1 + stride]")
190 , exprs[k]->process(
"#name_buf_value[lid0*" + lsize1str +
" + lid1]"), exprs[k]->process(
"#name_buf_value[lid0*" + lsize1str +
" + lid1 + stride]"),
191 exprs[k]->root_op());
193 compute_reduction(stream,exprs[k]->
process(
"#name_buf[lid0*" + lsize1str +
" + lid1]"), exprs[k]->
process(
"#name_buf[lid0*" + lsize1str +
" + lid1 + stride]"), exprs[k]->root_op());
196 stream <<
"}" << std::endl;
199 stream <<
"}" << std::endl;
202 stream <<
"if (lid1 == 0 && r < M)";
203 stream <<
"{" << std::endl;
205 std::map<std::string, std::string> accessors;
206 accessors[
"row_wise_reduction"] =
"#name_buf[lid0*" + lsize1str +
"]";
207 accessors[
"vector"] =
"#pointer[r*#stride]";
210 stream <<
"}" << std::endl;
214 stream <<
"}" << std::endl;
217 stream <<
"}" << std::endl;
222 std::vector<std::string> generate_impl(std::string
const & kernel_prefix,
statements_container const & statements, std::vector<mapping_type>
const & mappings)
const
224 std::vector<mapped_row_wise_reduction*> exprs;
225 bool is_trans =
false;
227 statements_container::data_type::const_iterator sit;
228 std::vector<mapping_type>::const_iterator mit;
229 for (mit = mappings.begin(), sit = statements.
data().begin(); mit != mappings.end(); ++mit, ++sit)
231 std::vector<vcl_size_t> idx;
233 parse(*sit, idx, is_trans, A);
235 for (
unsigned int j = 0; j < idx.size(); ++j)
240 std::vector<std::string> res;
243 res.push_back(generate_impl(kernel_prefix, statements, mappings,
p_.
simd_width, is_trans, exprs));
244 res.push_back(generate_impl(kernel_prefix, statements, mappings, 1, is_trans, exprs));
247 res.push_back(generate_impl(kernel_prefix, statements, mappings, 1, is_trans, exprs));
256 std::vector<vcl_size_t> idx;
259 parse(statements.
data().front(), idx, is_trans, A);
266 kernel = &programs[1].program().get_kernel(kernel_prefix);
268 kernel = &programs[0].program().get_kernel(kernel_prefix);
271 kernel = &programs[0].program().get_kernel(kernel_prefix);
278 unsigned int current_arg = 0;
unsigned int local_size_0
void set_arguments(statements_container const &statements, viennacl::ocl::kernel &kernel, unsigned int ¤t_arg)
Represents an OpenCL kernel within ViennaCL.
size_type local_work_size(int index=0) const
Returns the local work size at the respective dimension.
Some helper routines for reading/writing/printing scheduler expressions.
void traverse(scheduler::statement const &statement, vcl_size_t root_idx, Fun const &fun, bool inspect)
Recursively execute a functor on a statement.
static bool is_node_trans(scheduler::statement::container_type const &array, vcl_size_t root_idx, leaf_t leaf_type)
parameters_type const & parameters() const
A class representing a compute device (e.g. a GPU)
A class representing the 'data' for the LHS or RHS operand of the respective node.
container_type const & array() const
std::list< scheduler::statement > const & data() const
static std::string append_simd_suffix(std::string const &str, unsigned int i)
scheduler::statement_node const & lhs_most(scheduler::statement::container_type const &array, vcl_size_t root)
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
std::string to_string(viennacl::scheduler::op_element op_elem)
Helper routine for converting the operation enums to string.
unsigned int local_size_1
Map ViennaCL objects to generator wrappers.
static bool has_strided_access(statements_container const &statements)
std::string evaluate(leaf_t leaf, std::map< std::string, std::string > const &accessors, scheduler::statement const &statement, vcl_size_t root_idx, mapping_type const &mapping)
bool is_reduction(scheduler::statement_node const &node)
void enqueue(std::string const &kernel_prefix, std::vector< lazy_program_compiler > &programs, statements_container const &statements)
Provides the datastructures for dealing with a single statement such as 'x = y + z;'.
static void generate_prototype(utils::kernel_generation_stream &stream, std::string const &name, std::string const &first_arguments, std::vector< mapping_type > const &mappings, statements_container const &statements, std::map< std::string, unsigned int > const &widths)
void compute_reduction(utils::kernel_generation_stream &os, std::string acc, std::string cur, scheduler::op_element const &op)
Code for parsing the expression trees.
void enqueue(KernelType &k, viennacl::ocl::command_queue const &queue)
Enqueues a kernel in the provided queue.
unsigned int num_groups_0
bool row_major(T const &)
size_type global_work_size(int index=0) const
Returns the global work size at the respective dimension.
row_wise_reduction_parameters(unsigned int _simd_width, unsigned int _local_size_0, unsigned int _local_size_1, unsigned int _num_groups_0, fetching_policy_type _fetch_policy)
bool is_index_reduction(scheduler::op_element const &op)
void compute_index_reduction(utils::kernel_generation_stream &os, std::string acc, std::string cur, std::string const &acc_value, std::string const &cur_value, scheduler::op_element const &op)
static void element_wise_loop_1D(utils::kernel_generation_stream &stream, loop_body_base const &loop_body, fetching_policy_type fetch, unsigned int simd_width, std::string const &i, std::string const &bound, std::string const &domain_id, std::string const &domain_size)
static std::string vload(unsigned int simd_width, std::string const &offset, std::string const &ptr)
The main class for representing a statement such as x = inner_prod(y,z); at runtime.
void arg(unsigned int pos, cl_char val)
Sets a char argument at the provided position.
fetching_policy_type fetch_policy
ValueT const & at(std::map< KeyT, ValueT > const &map, KeyT const &key)
Emulation of C++11's .at() member for std::map<>, const-version.
row_wise_reduction_template(row_wise_reduction_template::parameters_type const ¶meters, char A_trans, binding_policy_t binding_policy=BIND_ALL_UNIQUE)
A tag for row-major storage of a dense matrix.
std::pair< vcl_size_t, leaf_t > mapping_key
parameters_type(unsigned int _simd_width, unsigned int _local_size_1, unsigned int _local_size_2, unsigned int _num_kernels)
void process(utils::kernel_generation_stream &stream, leaf_t leaf, std::string const &type_key, std::string const &to_process, scheduler::statement const &statement, vcl_size_t root_idx, mapping_type const &mapping, std::set< std::string > &already_processed)
std::string neutral_element(scheduler::op_element const &op)
std::string append_width(std::string const &str, unsigned int width)