1 #ifndef VIENNACL_DEVICE_SPECIFIC_TEMPLATES_REDUCTION_TEMPLATE_HPP
2 #define VIENNACL_DEVICE_SPECIFIC_TEMPLATES_REDUCTION_TEMPLATE_HPP
42 namespace device_specific
48 unsigned int _group_size,
unsigned int _num_groups,
59 unsigned int n_lmem_elements()
const
67 return TEMPLATE_INVALID_FETCHING_POLICY_TYPE;
68 return TEMPLATE_VALID;
72 std::string
const & buf_str, std::string
const & buf_value_str)
const
74 stream <<
"#pragma unroll" << std::endl;
75 stream <<
"for(unsigned int stride = " << size/2 <<
"; stride >0; stride /=2)" << std::endl;
76 stream <<
"{" << std::endl;
78 stream <<
"barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
79 stream <<
"if (lid < stride)" << std::endl;
80 stream <<
"{" << std::endl;
83 for (
unsigned int k = 0; k < exprs.size(); k++)
86 , exprs[k]->process(buf_value_str+
"[lid]"), exprs[k]->process(buf_value_str+
"[lid+stride]"),
91 stream <<
"}" << std::endl;
93 stream <<
"}" << std::endl;
96 std::string generate_impl(std::string
const & kernel_prefix,
statements_container const & statements, std::vector<mapping_type>
const & mappings,
unsigned int simd_width)
const
100 std::vector<mapped_scalar_reduction*> exprs;
101 for (std::vector<mapping_type>::const_iterator it = mappings.begin(); it != mappings.end(); ++it)
102 for (mapping_type::const_iterator iit = it->begin(); iit != it->end(); ++iit)
107 std::string arguments = generate_value_kernel_argument(
"unsigned int",
"N");
108 for (
unsigned int k = 0; k < N; ++k)
110 std::string numeric_type = utils::numeric_type_to_string(
lhs_most(exprs[k]->statement().array(),
111 exprs[k]->statement().root()).lhs.numeric_type);
114 arguments += generate_pointer_kernel_argument(
"__global",
"unsigned int", exprs[k]->
process(
"#name_temp"));
115 arguments += generate_pointer_kernel_argument(
"__global", numeric_type, exprs[k]->
process(
"#name_temp_value"));
118 arguments += generate_pointer_kernel_argument(
"__global", numeric_type, exprs[k]->
process(
"#name_temp"));
125 stream <<
" __attribute__((reqd_work_group_size(" <<
p_.
local_size_0 <<
",1,1)))" << std::endl;
127 stream <<
"{" << std::endl;
131 stream <<
"unsigned int lid = get_local_id(0);" << std::endl;
134 for (
unsigned int k = 0; k < N; ++k)
139 stream << exprs[k]->process(
"#scalartype #name_acc_value = " +
neutral_element(exprs[k]->root_op()) +
";") << std::endl;
141 stream << exprs[k]->process(
"unsigned int #name_acc = 0;") << std::endl;
146 stream << exprs[k]->process(
"#scalartype #name_acc = " +
neutral_element(exprs[k]->root_op()) +
";") << std::endl;
153 loop_body(std::vector<mapped_scalar_reduction*>
const & exprs_) : exprs(exprs_){ }
157 std::string i = (loop_simd_width==1)?
"i*#stride":
"i";
158 std::string process_str;
161 std::set<std::string> already_fetched;
162 process_str =
utils::append_width(
"#scalartype",loop_simd_width) +
" #namereg = " +
vload(loop_simd_width,i,
"#pointer")+
";";
163 for (std::vector<mapped_scalar_reduction*>::const_iterator it = exprs.begin(); it != exprs.end(); ++it)
165 (*it)->process_recursive(kernel_stream,
PARENT_NODE_TYPE,
"vector", process_str, already_fetched);
166 (*it)->process_recursive(kernel_stream,
PARENT_NODE_TYPE,
"matrix_row",
"#scalartype #namereg = #pointer[$OFFSET{#row*#stride1, i*#stride2}];", already_fetched);
167 (*it)->process_recursive(kernel_stream,
PARENT_NODE_TYPE,
"matrix_column",
"#scalartype #namereg = #pointer[$OFFSET{i*#stride1,#column*#stride2}];", already_fetched);
168 (*it)->process_recursive(kernel_stream,
PARENT_NODE_TYPE,
"matrix_diag",
"#scalartype #namereg = #pointer[#diag_offset<0?$OFFSET{(i - #diag_offset)*#stride1, i*#stride2}:$OFFSET{i*#stride1, (i + #diag_offset)*#stride2}];", already_fetched);
173 std::vector<std::string> str(loop_simd_width);
174 if (loop_simd_width==1)
177 for (
unsigned int a = 0; a < loop_simd_width; ++a)
180 for (
unsigned int k = 0; k < exprs.size(); ++k)
182 for (
unsigned int a = 0; a < loop_simd_width; ++a)
184 std::map<std::string, std::string> accessors;
185 accessors[
"vector"] = str[a];
186 accessors[
"matrix_row"] = str[a];
187 accessors[
"matrix_column"] = str[a];
188 accessors[
"matrix_diag"] = str[a];
189 accessors[
"scalar"] =
"#namereg";
190 std::string value = exprs[k]->evaluate_recursive(
LHS_NODE_TYPE, accessors);
192 value+=
"*" + exprs[k]->evaluate_recursive(
RHS_NODE_TYPE, accessors);
203 std::vector<mapped_scalar_reduction*> exprs;
206 element_wise_loop_1D(stream, loop_body(exprs),
p_.fetching_policy, simd_width,
"i",
"N",
"get_global_id(0)",
"get_global_size(0)");
209 for (
unsigned int k = 0; k < N; ++k)
212 stream << exprs[k]->process(
"#name_buf_value[lid] = #name_acc_value;") << std::endl;
213 stream << exprs[k]->process(
"#name_buf[lid] = #name_acc;") << std::endl;
217 reduce_1d_local_memory(stream,
p_.
local_size_0, exprs,
"#name_buf",
"#name_buf_value");
220 stream <<
"if (lid==0)" << std::endl;
221 stream <<
"{" << std::endl;
223 for (
unsigned int k = 0; k < N; ++k)
226 stream << exprs[k]->process(
"#name_temp_value[get_group_id(0)] = #name_buf_value[0];") << std::endl;
227 stream << exprs[k]->process(
"#name_temp[get_group_id(0)] = #name_buf[0];") << std::endl;
230 stream <<
"}" << std::endl;
233 stream <<
"}" << std::endl;
238 stream <<
" __attribute__((reqd_work_group_size(" <<
p_.
local_size_0 <<
",1,1)))" << std::endl;
240 stream <<
"{" << std::endl;
243 stream <<
"unsigned int lid = get_local_id(0);" << std::endl;
245 for (
unsigned int k = 0; k < N; ++k)
250 stream << exprs[k]->process(
"unsigned int #name_acc = 0;") << std::endl;
252 stream << exprs[k]->process(
"#scalartype #name_acc_value = " +
neutral_element(exprs[k]->root_op()) +
";");
257 stream << exprs[k]->process(
"#scalartype #name_acc = " +
neutral_element(exprs[k]->root_op()) +
";");
261 stream <<
"for(unsigned int i = lid; i < " <<
p_.num_groups <<
"; i += get_local_size(0))" << std::endl;
262 stream <<
"{" << std::endl;
264 for (
unsigned int k = 0; k < N; ++k)
267 exprs[k]->
process(
"#name_acc_value"),exprs[k]->
process(
"#name_temp_value[i]"),exprs[k]->root_op());
272 stream <<
"}" << std::endl;
274 for (
unsigned int k = 0; k < N; ++k)
277 stream << exprs[k]->process(
"#name_buf_value[lid] = #name_acc_value;") << std::endl;
278 stream << exprs[k]->process(
"#name_buf[lid] = #name_acc;") << std::endl;
283 reduce_1d_local_memory(stream,
p_.
local_size_0, exprs,
"#name_buf",
"#name_buf_value");
285 stream <<
"if (lid==0)" << std::endl;
286 stream <<
"{" << std::endl;
288 std::map<std::string, std::string> accessors;
289 accessors[
"scalar_reduction"] =
"#name_buf[0]";
290 accessors[
"scalar"] =
"*#pointer";
291 accessors[
"vector"] =
"#pointer[#start]";
294 stream <<
"}" << std::endl;
297 stream <<
"}" << std::endl;
302 std::vector<std::string> generate_impl(std::string
const & kernel_prefix,
statements_container const & statements, std::vector<mapping_type>
const & mappings)
const
304 std::vector<std::string> result;
305 result.push_back(generate_impl(kernel_prefix +
"_strided", statements, mappings, 1));
306 result.push_back(generate_impl(kernel_prefix, statements, mappings,
p_.
simd_width));
314 std::vector<scheduler::statement_node const *> reductions;
316 for (statements_container::data_type::const_iterator it = statements.
data().begin(); it != statements.
data().end(); ++it)
318 std::vector<vcl_size_t> reductions_idx;
321 for (std::vector<vcl_size_t>::iterator itt = reductions_idx.begin(); itt != reductions_idx.end(); ++itt)
322 reductions.push_back(&it->array()[*itt]);
331 kernels[0] = &programs[0].program().get_kernel(kernel_prefix+
"_strided_0");
332 kernels[1] = &programs[0].program().get_kernel(kernel_prefix+
"_strided_1");
336 kernels[0] = &programs[1].program().get_kernel(kernel_prefix+
"_0");
337 kernels[1] = &programs[1].program().get_kernel(kernel_prefix+
"_1");
346 for (
unsigned int k = 0; k < 2; k++)
348 unsigned int n_arg = 0;
349 kernels[k]->
arg(n_arg++, size);
352 for (std::vector<scheduler::statement_node const *>::const_iterator it = reductions.begin(); it != reductions.end(); ++it)
356 if (tmpidx_.size() <= j)
358 kernels[k]->
arg(n_arg++, tmpidx_[j]);
362 if (tmp_.size() <= i)
364 kernels[k]->
arg(n_arg++, tmp_[i]);
370 for (
unsigned int k = 0; k < 2; k++)
376 std::vector< viennacl::ocl::handle<cl_mem> > tmp_;
377 std::vector< viennacl::ocl::handle<cl_mem> > tmpidx_;
fetching_policy_type fetching_policy
unsigned int local_size_0
void set_arguments(statements_container const &statements, viennacl::ocl::kernel &kernel, unsigned int ¤t_arg)
Represents an OpenCL kernel within ViennaCL.
size_type local_work_size(int index=0) const
Returns the local work size at the respective dimension.
void traverse(scheduler::statement const &statement, vcl_size_t root_idx, Fun const &fun, bool inspect)
Recursively execute a functor on a statement.
void enqueue(std::string const &kernel_prefix, std::vector< lazy_program_compiler > &programs, statements_container const &statements)
parameters_type const & parameters() const
A class representing a compute device (e.g. a GPU)
container_type const & array() const
std::list< scheduler::statement > const & data() const
static std::string append_simd_suffix(std::string const &str, unsigned int i)
scheduler::statement_node const & lhs_most(scheduler::statement::container_type const &array, vcl_size_t root)
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
static bool has_strided_access(statements_container const &statements)
reduction_template(reduction_template::parameters_type const ¶meters, binding_policy_t binding_policy=BIND_ALL_UNIQUE)
std::string evaluate(leaf_t leaf, std::map< std::string, std::string > const &accessors, scheduler::statement const &statement, vcl_size_t root_idx, mapping_type const &mapping)
bool is_reduction(scheduler::statement_node const &node)
Provides the datastructures for dealing with a single statement such as 'x = y + z;'.
static vcl_size_t vector_size(scheduler::statement_node const &node, bool up_to_internal_size)
static void generate_prototype(utils::kernel_generation_stream &stream, std::string const &name, std::string const &first_arguments, std::vector< mapping_type > const &mappings, statements_container const &statements, std::map< std::string, unsigned int > const &widths)
void compute_reduction(utils::kernel_generation_stream &os, std::string acc, std::string cur, scheduler::op_element const &op)
Code for parsing the expression trees.
void enqueue(KernelType &k, viennacl::ocl::command_queue const &queue)
Enqueues a kernel in the provided queue.
viennacl::ocl::context const & context() const
size_type global_work_size(int index=0) const
Returns the global work size at the respective dimension.
bool is_index_reduction(scheduler::op_element const &op)
void compute_index_reduction(utils::kernel_generation_stream &os, std::string acc, std::string cur, std::string const &acc_value, std::string const &cur_value, scheduler::op_element const &op)
static void element_wise_loop_1D(utils::kernel_generation_stream &stream, loop_body_base const &loop_body, fetching_policy_type fetch, unsigned int simd_width, std::string const &i, std::string const &bound, std::string const &domain_id, std::string const &domain_size)
static std::string vload(unsigned int simd_width, std::string const &offset, std::string const &ptr)
The main class for representing a statement such as x = inner_prod(y,z); at runtime.
void arg(unsigned int pos, cl_char val)
Sets a char argument at the provided position.
Implementations for the OpenCL backend functionality.
unsigned int size_of(scheduler::statement_node_numeric_type type)
reduction_parameters(unsigned int _simd_width, unsigned int _group_size, unsigned int _num_groups, fetching_policy_type _fetching_policy)
parameters_type(unsigned int _simd_width, unsigned int _local_size_1, unsigned int _local_size_2, unsigned int _num_kernels)
void process(utils::kernel_generation_stream &stream, leaf_t leaf, std::string const &type_key, std::string const &to_process, scheduler::statement const &statement, vcl_size_t root_idx, mapping_type const &mapping, std::set< std::string > &already_processed)
std::string neutral_element(scheduler::op_element const &op)
std::string append_width(std::string const &str, unsigned int width)
viennacl::ocl::handle< cl_mem > create_memory(cl_mem_flags flags, unsigned int size, void *ptr=NULL) const
Creates a memory buffer within the context.