1 #ifndef VIENNACL_DEVICE_SPECIFIC_TEMPLATES_MATRIX_PRODUCT_HPP
2 #define VIENNACL_DEVICE_SPECIFIC_TEMPLATES_MATRIX_PRODUCT_HPP
44 namespace device_specific
51 ,
unsigned int ms,
unsigned int ks,
unsigned int ns
53 ,
unsigned int local_fetch_0_param,
unsigned int local_fetch_1_param):
template_base::
parameters_type(simd_width, local_size_0, local_size_1, 1),
56 mL(ms*local_size_0),
nL(ns*local_size_1){}
78 unsigned int n_lmem_elements()
const
82 N +=
p_.kL * (
p_.mL+1);
84 N +=
p_.nL * (
p_.kL+1);
91 return TEMPLATE_GLOBAL_MEMORY_REQUIRES_ZERO_LOCAL_FETCH;
94 return TEMPLATE_MS_NS_MUST_BE_SIMD_WIDTH_MULTIPLE;
97 return TEMPLATE_KS_MUST_BE_SMALLER_THAN_KL;
99 if (!(A_trans_==
'N' && B_trans_==
'T') &&
p_.
simd_width>1)
100 return TEMPLATE_SIMD_WIDTH_MUST_BE_ONE;
105 return TEMPLATE_LOCAL_FETCH_PRODUCT_MUST_MATCH_LOCAL_SIZE_PRODUCT;
110 unsigned int bound1 = (A_trans_==
'N')?
p_.kL:
p_.mL;
111 unsigned int bound0 = (A_trans_==
'N')?
p_.mL:
p_.kL;
113 if (
p_.local_fetch_1>0 && (bound1 %
p_.local_fetch_1)> 0)
114 return A_trans_==
'N'?TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE:TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE;
117 return A_trans_==
'N'?TEMPLATE_LOCAL_FETCH_0_MUST_BE_NL_MULTIPLE:TEMPLATE_LOCAL_FETCH_0_MUST_BE_KL_MULTIPLE;
122 unsigned int bound1 = (B_trans_==
'T')?
p_.kL:
p_.nL;
123 unsigned int bound0 = (B_trans_==
'T')?
p_.nL:
p_.kL;
125 if (
p_.local_fetch_1>0 && (bound1 %
p_.local_fetch_1)> 0)
126 return B_trans_==
'T'?TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE:TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE;
129 return B_trans_==
'T'?TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE:TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE;
133 return TEMPLATE_VALID;
141 using namespace tree_parsing;
142 using namespace scheduler;
150 vcl_size_t node_add_idx = array[root_idx].rhs.node_index;
152 vcl_size_t node_1_idx = array[node_add_idx].lhs.node_index;
153 alpha_idx = node_1_idx;
156 vcl_size_t mat_prod_idx = array[node_1_idx].lhs.node_index;
160 A_idx = mat_prod_idx;
165 A_idx = array[mat_prod_idx].lhs.node_index;
172 B_idx = mat_prod_idx;
178 B_idx = array[mat_prod_idx].rhs.node_index;
182 vcl_size_t node_2_idx = array[node_add_idx].rhs.node_index;
183 beta_idx = node_2_idx;
191 stream <<
"if (" << inbounds <<
")" << std::endl;
193 stream << do_if <<
";" << std::endl;
195 stream <<
"else" << std::endl;
197 stream << do_else <<
";" << std::endl;
201 stream << do_if <<
";" << std::endl;
205 std::string generate_impl(
const std::string &kernel_prefix,
const statements_container &statements,
const std::vector<mapping_type> &mappings,
bool fallback)
const
210 parameters_type pfallback(1,
p_.
local_size_0,
p_.
kL,
p_.
local_size_1,
p_.
mS, 1,
p_.
nS,
p_.
A_fetching_policy,
p_.
B_fetching_policy,
p_.
local_fetch_0,
p_.
local_fetch_1);
213 #define VIENNACL_MUL_STRIDE1 string(fallback?"*#stride1":"")
214 #define VIENNACL_HANDLE_BOUNDS(in_bounds, to_load) (!fallback?string(to_load):string( string(in_bounds) + "?" + string(to_load) + ":0"))
215 #define VIENNACL_VSTORE(value, offset, ptr) vstore(p.simd_width, value, offset, ptr)
226 bool A_trans =
false, B_trans =
false;
227 vcl_size_t C_idx=0, alpha_idx=0, A_idx=0, B_idx=0, beta_idx=0;
229 parse(st, C_idx, C_leaf, alpha_idx, alpha_leaf, A_idx, A_leaf, A_trans, B_idx, B_leaf, B_trans, beta_idx, beta_leaf);
241 stream <<
" __attribute__((reqd_work_group_size(" << p.
local_size_0 <<
"," << p.
local_size_1 <<
",1)))" << std::endl;
242 std::map<std::string, unsigned int> widths;
245 generate_prototype(stream, kernel_prefix,
"unsigned int M, unsigned int N, unsigned int K, ", mappings, statements, widths);
246 stream <<
"{" << std::endl;
276 stream <<
"uint gidx = get_group_id(0);" << std::endl;
277 stream <<
"uint gidy = get_group_id(1);" << std::endl;
278 stream <<
"uint idx = get_local_id(0);" << std::endl;
279 stream <<
"uint idy = get_local_id(1);" << std::endl;
284 stream <<
"uint idt = " << p.
local_size_0 <<
"*idy + idx;" << std::endl;
285 stream <<
"uint idxT = idt % " << p.
local_fetch_0 <<
";" << std::endl;
286 stream <<
"uint idyT = idt / " << p.
local_fetch_0 <<
";" << std::endl;
293 stream <<
"bool in_bounds_m[" << p.
mS <<
"];" << std::endl;
294 stream <<
"for(unsigned int m = 0; m < " << p.
mS <<
"; m++)" << std::endl;
299 stream <<
"in_bounds_m[m] = gidx*" << p.
mL <<
" + idx*" << p.
mS <<
" + m < M;" << std::endl;
302 stream <<
"in_bounds_m[m] = gidx*" << p.
mL <<
" + idx + m*" << p.
local_size_0 <<
" < M;" << std::endl;
311 stream <<
"bool in_bounds_m_local[" << p.
mL/fetch_size <<
"];" << std::endl;
312 stream <<
"for(unsigned int m = 0; m < " << p.
mL/fetch_size <<
"; m++)" << std::endl;
314 stream <<
"in_bounds_m_local[m] = gidx*" << p.
mL <<
" + " << (A_trans_==
'N'?
"idxT":
"idyT") <<
" + m*" << fetch_size <<
" < M;" << std::endl;
319 stream <<
"bool in_bounds_n[" << p.
nS <<
"];" << std::endl;
320 stream <<
"for(unsigned int n = 0; n < " << p.
nS <<
"; n++)" << std::endl;
325 stream <<
"in_bounds_n[n] = gidy*" << p.
nL <<
" + idy*" << p.
nS <<
" + n < N;" << std::endl;
328 stream <<
"in_bounds_n[n] = gidy*" << p.
nL <<
" + idy + n*" << p.
local_size_1 <<
" < N;" << std::endl;
337 stream <<
"bool in_bounds_n_local[" << p.
nL/fetch_size <<
"];" << std::endl;
338 stream <<
"for(unsigned int n = 0; n < " << p.
nL/fetch_size <<
"; n++)" << std::endl;
340 stream <<
"in_bounds_n_local[n] = gidy*" << p.
nL <<
" + " << (B_trans_==
'T'?
"idxT":
"idyT") <<
" + n*" << fetch_size <<
" < N;" << std::endl;
398 stream <<
"for(unsigned int block_k=0; block_k < K; block_k+=" << p.
kL <<
"){" << std::endl;
406 stream << A->
process(
"__local #scalartype* plA = lA + idxT*" +
to_string(p.
mL + 1) +
" + idyT;") << std::endl;
415 stream << B->
process(
"__local #scalartype* plB = lB + idxT*" +
to_string(p.
nL+1) +
"+ idyT;") <<std::endl;
420 stream <<
"barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
459 stream <<
"barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
460 stream <<
"uint offA = " << p.
simd_width <<
"*idx;" << std::endl;
461 stream <<
"uint offB = " << p.
simd_width <<
"*idy;" << std::endl;
465 stream <<
"for(unsigned int k = 0; k < " << p.
kL <<
" && (block_k + k < K); k+=" << p.
kS <<
"){" << std::endl;
467 stream <<
"for(unsigned int k = 0; k < " << p.
kL <<
"; k+=" << p.
kS <<
"){" << std::endl;
471 stream <<
"#pragma unroll " << p.
kS << std::endl;
472 stream <<
"for(unsigned int kk = 0; kk < " << p.
kS <<
"; kk++)" << std::endl;
473 stream <<
"#pragma unroll " << p.
mS/p.
simd_width << std::endl;
474 stream <<
"for(unsigned int mm = 0; mm < " << p.
mS/p.
simd_width <<
"; mm++)" << std::endl;
475 stream <<
"{" << std::endl;
480 for (
unsigned int ss = 0; ss < p.
simd_width; ++ss)
505 stream <<
"}" << std::endl;
507 stream <<
"#pragma unroll " << p.
kS << std::endl;
508 stream <<
"for(unsigned int kk = 0; kk < " << p.
kS <<
"; kk++)" << std::endl;
509 stream <<
"#pragma unroll " << p.
nS/p.
simd_width << std::endl;
510 stream <<
"for(unsigned int nn = 0; nn < " << p.
nS/p.
simd_width <<
"; nn++)" << std::endl;
511 stream <<
"{" << std::endl;
516 for (
unsigned int ss = 0; ss < p.
simd_width; ++ss)
541 stream <<
"}" << std::endl;
548 stream <<
"offA += " << p.
kS*(p.
mL+1) <<
";" << std::endl;
563 stream <<
"offB += " << p.
kS*(p.
nL+1) <<
";" << std::endl;
575 stream <<
"#pragma unroll " << p.
kS << std::endl;
576 stream <<
"for(unsigned int kk = 0; kk <" << p.
kS <<
"; ++kk)" << std::endl;
577 stream <<
"{" << std::endl;
579 for (
unsigned int nn=0; nn < p.
nS; ++nn)
580 for (
unsigned int mm=0; mm < p.
mS; ++mm)
582 string res_str, lhs_str, rhs_str;
592 stream << res_str <<
"=" <<
"fma(" << lhs_str <<
"," << rhs_str <<
"," << res_str <<
");" << std::endl;
595 stream <<
"}" << std::endl;
601 stream <<
"}" << std::endl;
622 stream <<
"}" << std::endl;
631 stream << C->
process(
"#pointer += idx*" +
to_string(ministartstride0) +
"*#ld;") << std::endl;
632 stream << C->
process(
"#pointer += gidy*" +
to_string(p.
nL) +
"*#stride2;") << std::endl;
633 stream << C->
process(
"#pointer += idy*" +
to_string(ministartstride1) +
"*#stride2;") << std::endl;
635 for (
unsigned int n=0; n < p.
nS; ++n)
637 for (
unsigned int m=0; m < p.
mS; ++m)
643 stream <<
"if (in_bounds_m[" +
to_string(m) +
"] && in_bounds_n[" +
to_string(n) +
"])" << std::endl;
646 stream << C->
process(
"#pointer[" + Cj +
"*#ld] = rC[" +
to_string(m) +
"][" +
to_string(n) +
"]*" + alpha->
name() +
"+ #pointer[" + Cj +
"*#ld]*" + beta->
name() +
";") << std::endl;
651 stream << C->
process(
"#pointer += #stride2;") << std::endl;
662 stream << C->
process(
"#pointer += gidx*" +
to_string(p.
mL) +
"*#stride1;") << std::endl;
663 stream << C->
process(
"#pointer += idx*" +
to_string(ministartstride0) +
"*#stride1;") << std::endl;
665 stream << C->
process(
"#pointer += idy*" +
to_string(ministartstride1) +
"*#ld;") << std::endl;
667 for (
unsigned int m=0; m < p.
mS; ++m)
669 for (
unsigned int n=0; n < p.
nS; ++n)
675 stream <<
"if (in_bounds_m[" +
to_string(m) +
"] && in_bounds_n[" +
to_string(n) +
"])" << std::endl;
678 stream << C->
process(
"#pointer[" + Cj +
"*#ld] = rC[" +
to_string(m) +
"][" +
to_string(n) +
"]*" + alpha->
name() +
" + #pointer[" + Cj +
"*#ld]*" + beta->
name() +
";") << std::endl;
684 stream << C->
process(
"#pointer += #stride1;") << std::endl;
691 stream <<
"}" << std::endl;
695 #undef VIENNACL_MUL_STRIDE1
696 #undef VIENNACL_HANDLE_BOUNDS
697 #undef VIENNACL_VSTORE
700 std::vector<std::string> generate_impl(std::string
const & kernel_prefix,
statements_container const & statements, std::vector<mapping_type>
const & mappings)
const
702 std::vector<std::string> res;
703 res.push_back(generate_impl(kernel_prefix, statements, mappings,
false));
704 res.push_back(generate_impl(kernel_prefix, statements, mappings,
true));
708 template<
class NumericT>
712 std::vector<lazy_program_compiler> & programs, std::string
const & kernel_prefix,
vcl_size_t id)
737 unsigned int current_arg = 0;
738 kernel.
arg(current_arg++, cl_uint(C.
size1()));
739 kernel.
arg(current_arg++, cl_uint(C.
size2()));
741 kernel.
arg(current_arg++, cl_uint(A_trans_==
'T'?A.
size2():A.
size1()));
743 kernel.
arg(current_arg++, cl_uint(A_trans_==
'N'?A.
size2():A.
size1()));
749 template<
class NumericT>
754 slice s0(s0_0, 1, s0_1 - s0_0);
755 slice s1(s1_0, 1, s1_1 - s1_0);
761 template<
class NumericT>
764 NumericT beta_value, std::vector<lazy_program_compiler> & programs, std::string
const & kernel_prefix)
766 using namespace device_specific::utils;
767 vcl_size_t ldstrideA = call_on_matrix(A, leading_stride());
768 vcl_size_t ldstrideB = call_on_matrix(B, leading_stride());
769 vcl_size_t ldstrideC = call_on_matrix(C, leading_stride());
770 vcl_size_t ldstartA = call_on_matrix(A, leading_start());
771 vcl_size_t ldstartB = call_on_matrix(B, leading_start());
772 bool swap_A = ((A_trans_==
'T') ^ utils::call_on_matrix(A, row_major_fun()));
773 bool swap_B = ((B_trans_==
'T') ^ utils::call_on_matrix(B, row_major_fun()));
775 vcl_size_t M = call_on_matrix(C, size1_fun());
776 vcl_size_t N = call_on_matrix(C, size2_fun());
778 if (utils::call_on_matrix(A, row_major_fun()))
779 K = A_trans_==
'T'?call_on_matrix(A, size2_fun()):call_on_matrix(A, size1_fun());
781 K = A_trans_==
'N'?call_on_matrix(A, size2_fun()):call_on_matrix(A, size1_fun());
783 if (M <
p_.
mL || N <
p_.
nL || K < p_.kL || ldstrideA> 1 || ldstrideB > 1 || ldstrideC > 1 ||
786 enqueue_block(statement, A, B, C, beta, create_slice(ptr_matrix, A, 0, M, 0, K, swap_A),
787 create_slice(ptr_matrix, B, 0, K, 0, N, swap_B),
788 create_slice(ptr_matrix, C, 0, M, 0, N,
false), beta_value, programs, kernel_prefix, 1);
802 enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, 0, lM, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, 0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, 0, lN,
false), beta_value, programs, kernel_prefix, 0);
803 enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, 0, lM, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, 0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, 0, lN,
false), (
NumericT)1, programs, kernel_prefix, 1);
805 enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, 0, lM, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, lN, N,
false), beta_value, programs, kernel_prefix, 1);
806 enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, 0, lM, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, lN, N,
false), (
NumericT)1, programs, kernel_prefix, 1);
808 enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, lM, M, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, 0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, 0, lN,
false), beta_value, programs, kernel_prefix, 1);
809 enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, lM, M, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, 0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, 0, lN,
false), (
NumericT)1, programs, kernel_prefix, 1);
811 enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, lM, M, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, lN, N,
false), beta_value, programs, kernel_prefix, 1);
812 enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, lM, M, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, lN, N,
false), (
NumericT)1, programs, kernel_prefix, 1);
820 using namespace device_specific::utils;
821 using namespace tree_parsing;
824 bool A_trans, B_trans;
825 vcl_size_t C_idx=0, A_idx=0, B_idx=0, alpha_idx=0, beta_idx = 0;
827 parse(st, C_idx, C_leaf, alpha_idx, alpha_leaf, A_idx, A_leaf, A_trans, B_idx, B_leaf, B_trans, beta_idx, beta_leaf);
virtual void enqueue(std::string const &kernel_prefix, std::vector< lazy_program_compiler > &programs, statements_container const &statements)
unsigned int local_size_0
#define VIENNACL_MUL_STRIDE1
unsigned int local_fetch_0
Exception for the case the generator is unable to deal with the operation.
void set_arguments(statements_container const &statements, viennacl::ocl::kernel &kernel, unsigned int ¤t_arg)
Class for representing strided submatrices of a bigger matrix A.
Represents an OpenCL kernel within ViennaCL.
size_type local_work_size(int index=0) const
Returns the local work size at the respective dimension.
static void assign_element(lhs_rhs_element &elem, char const &t)
parameters_type const & parameters() const
A class representing a compute device (e.g. a GPU)
This file provides the forward declarations for the main types used within ViennaCL.
A class representing the 'data' for the LHS or RHS operand of the respective node.
container_type const & array() const
fetching_policy_type A_fetching_policy
viennacl::scalar< float > s1
std::list< scheduler::statement > const & data() const
Forward declaration of dense matrix classes.
viennacl::matrix_base< double > * matrix_double
void swap(vector_base< T > &vec1, vector_base< T > &vec2)
Swaps the contents of two vectors, data is copied.
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
std::vector< value_type > container_type
#define VIENNACL_VSTORE(value, offset, ptr)
std::string to_string(viennacl::scheduler::op_element op_elem)
Helper routine for converting the operation enums to string.
unsigned int local_size_1
Map ViennaCL objects to generator wrappers.
statement_node_numeric_type numeric_type
viennacl::matrix_base< float > * matrix_float
std::string process(std::string const &in) const
size_type size2() const
Returns the number of columns.
Provides the datastructures for dealing with a single statement such as 'x = y + z;'.
#define VIENNACL_HANDLE_BOUNDS(in_bounds, to_load)
static void generate_prototype(utils::kernel_generation_stream &stream, std::string const &name, std::string const &first_arguments, std::vector< mapping_type > const &mappings, statements_container const &statements, std::map< std::string, unsigned int > const &widths)
size_type size1() const
Returns the number of rows.
viennacl::enable_if< viennacl::is_scalar< ScalarT1 >::value &&viennacl::is_scalar< ScalarT2 >::value >::type swap(ScalarT1 &s1, ScalarT2 &s2)
Swaps the contents of two scalars, data is copied.
std::map< mapping_key, tools::shared_ptr< mapped_object > > mapping_type
matrix_product_parameters(unsigned int simd_width, unsigned int local_size_0, unsigned int KL, unsigned int local_size_1, unsigned int ms, unsigned int ks, unsigned int ns, fetching_policy_type A_fetching_policy_param, fetching_policy_type B_fetching_policy_param, unsigned int local_fetch_0_param, unsigned int local_fetch_1_param)
Proxy classes for matrices.
Code for parsing the expression trees.
void enqueue(KernelType &k, viennacl::ocl::command_queue const &queue)
Enqueues a kernel in the provided queue.
scheduler::lhs_rhs_element & lhs_rhs_element(scheduler::statement const &st, vcl_size_t idx, leaf_t leaf)
size_type global_work_size(int index=0) const
Returns the global work size at the respective dimension.
The main class for representing a statement such as x = inner_prod(y,z); at runtime.
void arg(unsigned int pos, cl_char val)
Sets a char argument at the provided position.
matrix_product_template(matrix_product_template::parameters_type const ¶meters, char A_trans, char B_trans)
ValueT const & at(std::map< KeyT, ValueT > const &map, KeyT const &key)
Emulation of C++11's .at() member for std::map<>, const-version.
A slice class that refers to an interval [start, stop), where 'start' is included, and 'stop' is excluded.
std::string const & name() const
fetching_policy_type B_fetching_policy
std::pair< vcl_size_t, leaf_t > mapping_key
unsigned int local_fetch_1
parameters_type(unsigned int _simd_width, unsigned int _local_size_1, unsigned int _local_size_2, unsigned int _num_kernels)
void process(utils::kernel_generation_stream &stream, leaf_t leaf, std::string const &type_key, std::string const &to_process, scheduler::statement const &statement, vcl_size_t root_idx, mapping_type const &mapping, std::set< std::string > &already_processed)
std::string append_width(std::string const &str, unsigned int width)