#ifndef VIENNACL_MATRIX_OPERATIONS_HPP_
#define VIENNACL_MATRIX_OPERATIONS_HPP_

/** @file matrix_operations.hpp
    @brief Implementations of dense matrix related operations, including matrix-vector and matrix-matrix products.
*/

// <algorithm> provides std::min, <cassert> provides assert() (both used below)
#include <algorithm>
#include <cassert>

#include "viennacl/forwards.h"
#include "viennacl/ocl/device.hpp"
#include "viennacl/ocl/handle.hpp"
#include "viennacl/ocl/kernel.hpp"
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/meta/predicate.hpp"
#include "viennacl/traits/size.hpp"
#include "viennacl/traits/start.hpp"
#include "viennacl/traits/handle.hpp"
#include "viennacl/tools/matrix_kernel_class_deducer.hpp"
#include "viennacl/tools/matrix_prod_kernel_class_deducer.hpp"
#include "viennacl/linalg/kernels/vector_kernels.h"
#include "viennacl/linalg/kernels/matrix_row_kernels.h"
#include "viennacl/linalg/kernels/matrix_col_kernels.h"

#include "viennacl/linalg/kernels/matrix_prod_col_col_col_kernels.h"
#include "viennacl/linalg/kernels/matrix_prod_col_col_row_kernels.h"
#include "viennacl/linalg/kernels/matrix_prod_col_row_col_kernels.h"
#include "viennacl/linalg/kernels/matrix_prod_col_row_row_kernels.h"

#include "viennacl/linalg/kernels/matrix_prod_row_col_col_kernels.h"
#include "viennacl/linalg/kernels/matrix_prod_row_col_row_kernels.h"
#include "viennacl/linalg/kernels/matrix_prod_row_row_col_kernels.h"
#include "viennacl/linalg/kernels/matrix_prod_row_row_row_kernels.h"

namespace viennacl
{
  namespace linalg
  {

    /** @brief Adds two dense matrices and writes the result to a third matrix: result = mat1 + mat2
    *
    * Operands must have identical size and memory layout.
    */
    template<class TYPE, typename F, unsigned int ALIGNMENT>
    void add(const viennacl::matrix<TYPE, F, ALIGNMENT> & mat1,
             const viennacl::matrix<TYPE, F, ALIGNMENT> & mat2,
             viennacl::matrix<TYPE, F, ALIGNMENT> & result)
    {
      assert(result.size1() == mat1.size1());
      assert(result.size2() == mat1.size2());
      assert(result.size1() == mat2.size1());
      assert(result.size2() == mat2.size2());
      assert( (mat1.internal_size() == mat2.internal_size())
              && "Operands must have same dimension and memory layout in this version of ViennaCL!");

      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<TYPE, F, ALIGNMENT> >::ResultType    KernelClass;

      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "add");
      cl_uint size = static_cast<cl_uint>(std::min(mat1.internal_size(), mat2.internal_size()));

      viennacl::ocl::enqueue(k(mat1, mat2, result, size));
    }

    /** @brief Adds a dense matrix to another one in place: result += mat2
    *
    * Generic version for matrices and matrix ranges, dispatching through the traits layer.
    */
    template <typename M1, typename M2>
    typename viennacl::enable_if< viennacl::is_matrix<M1>::value
                                  && viennacl::is_matrix<M2>::value
                                >::type
    inplace_add(M1 & result, M2 const & mat2)
    {
      assert(viennacl::traits::size1(result) == viennacl::traits::size1(mat2));
      assert(viennacl::traits::size2(result) == viennacl::traits::size2(mat2));

      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< M1 >::ResultType    KernelClass;

      std::size_t block_size = 15;

      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "inplace_add");
      k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(viennacl::traits::size1(result), block_size));
      k.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(viennacl::traits::size2(result), block_size));
      k.local_work_size(0, block_size);
      k.local_work_size(1, block_size);

      viennacl::ocl::enqueue(k(viennacl::traits::handle(result),
                               cl_uint(viennacl::traits::start1(result)), cl_uint(viennacl::traits::start2(result)),
                               cl_uint(viennacl::traits::size1(result)), cl_uint(viennacl::traits::size2(result)),
                               cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result)),
                               viennacl::traits::handle(mat2),
                               cl_uint(viennacl::traits::start1(mat2)), cl_uint(viennacl::traits::start2(mat2)),
                               cl_uint(viennacl::traits::size1(mat2)), cl_uint(viennacl::traits::size2(mat2)),
                               cl_uint(viennacl::traits::internal_size1(mat2)), cl_uint(viennacl::traits::internal_size2(mat2))
                              )
                            );
    }

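    // Note: application code does not usually call add()/inplace_add() directly; these kernels are
    // reached through the operator overloads on viennacl::matrix. A minimal usage sketch (assuming a
    // working OpenCL context and matrices already filled, e.g. via viennacl::copy()):
    //
    //   viennacl::matrix<float> m1(64, 64), m2(64, 64), r(64, 64);
    //   r = m1 + m2;    // enqueues the "add" kernel
    //   r += m2;        // enqueues the "inplace_add" kernel
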
    /** @brief Subtracts two dense matrices and writes the result to a third matrix: result = mat1 - mat2
    */
    template<class TYPE, typename F, unsigned int ALIGNMENT>
    void sub(const viennacl::matrix<TYPE, F, ALIGNMENT> & mat1,
             const viennacl::matrix<TYPE, F, ALIGNMENT> & mat2,
             viennacl::matrix<TYPE, F, ALIGNMENT> & result)
    {
      assert(result.size1() == mat1.size1());
      assert(result.size2() == mat1.size2());
      assert(result.size1() == mat2.size1());
      assert(result.size2() == mat2.size2());
      assert( (mat1.internal_size() == mat2.internal_size())
              && "Operands must have same dimension and memory layout in this version of ViennaCL!");

      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<TYPE, F, ALIGNMENT> >::ResultType    KernelClass;

      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "sub");
      cl_uint size = static_cast<cl_uint>(std::min(mat1.internal_size(), mat2.internal_size()));

      viennacl::ocl::enqueue(k(mat1, mat2, result, size));
    }

    /** @brief Subtracts a dense matrix from another one in place: result -= mat2
    */
    template<class TYPE, typename F, unsigned int ALIGNMENT>
    void inplace_sub(viennacl::matrix<TYPE, F, ALIGNMENT> & result,
                     const viennacl::matrix<TYPE, F, ALIGNMENT> & mat2)
    {
      assert(result.size1() == mat2.size1());
      assert(result.size2() == mat2.size2());
      assert( (result.internal_size() == mat2.internal_size())
              && "Operands must have same dimension and memory layout in this version of ViennaCL!");

      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<TYPE, F, ALIGNMENT> >::ResultType    KernelClass;

      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "inplace_sub");
      cl_uint size = static_cast<cl_uint>(std::min(result.internal_size(), mat2.internal_size()));

      viennacl::ocl::enqueue(k(result, mat2, size));
    }

    /** @brief Scales a dense matrix in place by a host-side scalar: result *= val
    */
    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT>
    void inplace_mult(viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & result,
                      SCALARTYPE val)
    {
      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;

      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "cpu_inplace_mult");
      viennacl::ocl::enqueue(k(result, val, cl_uint(result.internal_size())));
    }

    /** @brief Scales a dense matrix in place by a scalar residing on the device: result *= val
    */
    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT>
    void inplace_mult(viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & result,
                      viennacl::scalar<SCALARTYPE> const & val)
    {
      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;

      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "inplace_mult");
      viennacl::ocl::enqueue(k(result, val, cl_uint(result.internal_size())));
    }

    /** @brief Divides a dense matrix in place by a scalar residing on the device: result /= val
    */
    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT>
    void inplace_divide(viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & result,
                        viennacl::scalar<SCALARTYPE> const & val)
    {
      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;

      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "inplace_divide");
      cl_uint size = cl_uint(result.internal_size());

      viennacl::ocl::enqueue(k(result, val, size));
    }

    /** @brief Returns a lazily evaluated proxy describing the matrix-vector product mat * vec
    */
    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
    viennacl::vector_expression<const viennacl::matrix<SCALARTYPE, F, ALIGNMENT>,
                                const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>,
                                op_prod > prod_impl(const viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & mat,
                                                    const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec)
    {
      return viennacl::vector_expression<const viennacl::matrix<SCALARTYPE, F, ALIGNMENT>,
                                         const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>,
                                         op_prod >(mat, vec);
    }

    /** @brief Carries out the matrix-vector product result = mat * vec
    *
    * vec and result must not share the same memory buffer; introduce a temporary otherwise.
    */
    template<class TYPE, typename F, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
    void prod_impl(const viennacl::matrix<TYPE, F, ALIGNMENT> & mat,
                   const viennacl::vector<TYPE, VECTOR_ALIGNMENT> & vec,
                   viennacl::vector<TYPE, VECTOR_ALIGNMENT> & result)
    {
      assert(mat.size2() == vec.size());
      assert(vec.handle() != result.handle() && "No direct inplace matrix-vector product possible. Introduce a temporary!");
      result.resize(mat.size1());

      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<TYPE, F, ALIGNMENT> >::ResultType    KernelClass;

      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "vec_mul");
      viennacl::ocl::enqueue(
                 k(mat, cl_uint(mat.size1()), cl_uint(mat.size2()),
                   cl_uint(mat.internal_size1()), cl_uint(mat.internal_size2()), vec, result));
    }

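    // A minimal usage sketch: user code typically reaches this kernel launch through
    // viennacl::linalg::prod() from viennacl/linalg/prod.hpp rather than calling prod_impl() directly:
    //
    //   viennacl::matrix<float> A(rows, cols);
    //   viennacl::vector<float> x(cols), y(rows);
    //   y = viennacl::linalg::prod(A, x);   // dispatches to prod_impl(A, x, y)
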
    /** @brief Returns a lazily evaluated proxy describing the transposed matrix-vector product trans(mat) * vec
    */
    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
    viennacl::vector_expression<const viennacl::matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
                                                                    const matrix<SCALARTYPE, F, ALIGNMENT>,
                                                                    op_trans>,
                                const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>,
                                op_prod > prod_impl(const viennacl::matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
                                                                                       const matrix<SCALARTYPE, F, ALIGNMENT>,
                                                                                       op_trans> & proxy,
                                                    const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec)
    {
      return viennacl::vector_expression<const viennacl::matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
                                                                             const matrix<SCALARTYPE, F, ALIGNMENT>,
                                                                             op_trans>,
                                         const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT>,
                                         op_prod >(proxy, vec);
    }

    //forward declaration of trans_prod_impl(), which is defined further below but already used here
    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
    void trans_prod_impl(const matrix<SCALARTYPE, F, ALIGNMENT> & mat,
                         const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec,
                         viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & result);

    /** @brief Unwraps the transposed-matrix proxy and forwards to trans_prod_impl(): result = trans(mat) * vec
    */
    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
    void prod_impl(const viennacl::matrix_expression< const matrix<SCALARTYPE, F, ALIGNMENT>,
                                                      const matrix<SCALARTYPE, F, ALIGNMENT>,
                                                      op_trans> & mat,
                   const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec,
                   viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & result)
    {
      trans_prod_impl(mat.lhs(), vec, result);
    }

    /** @brief Carries out the transposed matrix-vector product result = trans(mat) * vec
    */
    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
    void trans_prod_impl(const matrix<SCALARTYPE, F, ALIGNMENT> & mat,
                         const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec,
                         viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & result)
    {
      assert(mat.size1() == vec.size());
      assert(vec.handle() != result.handle() && "No direct inplace matrix-vector product possible. Introduce a temporary!");
      result.resize(mat.size2());

      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;

      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "trans_vec_mul");

      viennacl::ocl::enqueue(k(mat, cl_uint(mat.size1()), cl_uint(mat.size2()),
                               cl_uint(mat.internal_size1()), cl_uint(mat.internal_size2()), vec, result));
    }

    /** @brief Carries out the dense matrix-matrix product C = A * B
    *
    * The global work size is padded to a multiple of block_size in each dimension; C must not share memory with A or B.
    */
    template<class TYPE, typename F1, typename F2, typename F3, unsigned int ALIGNMENT>
    void prod_impl(const viennacl::matrix<TYPE, F1, ALIGNMENT> & A,
                   const viennacl::matrix<TYPE, F2, ALIGNMENT> & B,
                   viennacl::matrix<TYPE, F3, ALIGNMENT> & C,
                   int block_size = 15)
    {
      assert(A.size1() == C.size1());
      assert(A.size2() == B.size1());
      assert(B.size2() == C.size2());

      assert(C.handle() != A.handle()
             && C.handle() != B.handle()
             && "No direct inplace matrix-matrix product possible. Introduce a temporary!");

      typedef typename viennacl::tools::MATRIX_PROD_KERNEL_CLASS_DEDUCER< viennacl::matrix<TYPE, F1, ALIGNMENT>,
                                                                          viennacl::matrix<TYPE, F2, ALIGNMENT>,
                                                                          viennacl::matrix<TYPE, F3, ALIGNMENT> >::ResultType    KernelClass;
      KernelClass::init();

      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "prod_AA");

      k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size1(), block_size));
      k.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size2(), block_size));
      k.local_work_size(0, block_size);
      k.local_work_size(1, block_size);

      viennacl::ocl::enqueue(
              k(A, cl_uint(0), cl_uint(0),
                cl_uint(A.size1()), cl_uint(A.size2()),
                cl_uint(A.internal_size1()), cl_uint(A.internal_size2()),
                B, cl_uint(0), cl_uint(0),
                cl_uint(B.size1()), cl_uint(B.size2()),
                cl_uint(B.internal_size1()), cl_uint(B.internal_size2()),
                C, cl_uint(0), cl_uint(0),
                cl_uint(C.size1()), cl_uint(C.size2()),
                cl_uint(C.internal_size1()), cl_uint(C.internal_size2()),
                viennacl::ocl::local_mem(sizeof(TYPE) * block_size * block_size),
                viennacl::ocl::local_mem(sizeof(TYPE) * block_size * block_size) ));
    }

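    // Minimal usage sketch: dense matrix-matrix products are normally reached through
    // viennacl::linalg::prod() (viennacl/linalg/prod.hpp). The default block_size of 15 is what this
    // version ships with and assumes the device supports 15 x 15 = 225 work items per work group:
    //
    //   viennacl::matrix<float> A(m, k), B(k, n), C(m, n);
    //   C = viennacl::linalg::prod(A, B);   // dispatches to prod_impl(A, B, C)
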
    /** @brief Carries out the matrix-matrix product C = A * B for sub-matrices given by matrix_range objects
    */
    template<typename T1, typename T2, typename T3>
    void prod_impl(const viennacl::matrix_range<T1> & A,
                   const viennacl::matrix_range<T2> & B,
                   viennacl::matrix_range<T3> & C,
                   int block_size = 15)
    {
      typedef typename T1::value_type::value_type   value_type;

      assert(A.size1() == C.size1());
      assert(A.size2() == B.size1());
      assert(B.size2() == C.size2());

      assert(C.get().handle() != A.get().handle()
             && C.get().handle() != B.get().handle()
             && "No direct inplace matrix-matrix product possible. Introduce a temporary!");

      typedef typename viennacl::tools::MATRIX_PROD_KERNEL_CLASS_DEDUCER< T1, T2, T3 >::ResultType    KernelClass;
      KernelClass::init();

      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "prod_AA");

      k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size1(), block_size));
      k.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size2(), block_size));
      k.local_work_size(0, block_size);
      k.local_work_size(1, block_size);

      viennacl::ocl::enqueue(
              k(A.get(), cl_uint(A.start1()), cl_uint(A.start2()),
                cl_uint(A.size1()), cl_uint(A.size2()),
                cl_uint(A.get().internal_size1()), cl_uint(A.get().internal_size2()),
                B.get(), cl_uint(B.start1()), cl_uint(B.start2()),
                cl_uint(B.size1()), cl_uint(B.size2()),
                cl_uint(B.get().internal_size1()), cl_uint(B.get().internal_size2()),
                C.get(), cl_uint(C.start1()), cl_uint(C.start2()),
                cl_uint(C.size1()), cl_uint(C.size2()),
                cl_uint(C.get().internal_size1()), cl_uint(C.get().internal_size2()),
                viennacl::ocl::local_mem(sizeof(value_type) * block_size * block_size),
                viennacl::ocl::local_mem(sizeof(value_type) * block_size * block_size) ));
    }

    /** @brief Carries out the matrix-matrix product C = trans(A) * B
    */
    template<class TYPE, typename F1, typename F2, typename F3, unsigned int ALIGNMENT>
    void prod_impl(const viennacl::matrix_expression< const matrix<TYPE, F1, ALIGNMENT>,
                                                      const matrix<TYPE, F1, ALIGNMENT>,
                                                      op_trans> & A,
                   const viennacl::matrix<TYPE, F2, ALIGNMENT> & B,
                   viennacl::matrix<TYPE, F3, ALIGNMENT> & C)
    {
      assert(A.size2() == C.size1());
      assert(A.size1() == B.size1());
      assert(B.size2() == C.size2());

      assert(C.handle() != A.lhs().handle()
             && C.handle() != B.handle()
             && "No direct inplace matrix-matrix product possible. Introduce a temporary!");

      int block_size = 15;

      typedef typename viennacl::tools::MATRIX_PROD_KERNEL_CLASS_DEDUCER< viennacl::matrix<TYPE, F1, ALIGNMENT>,
                                                                          viennacl::matrix<TYPE, F2, ALIGNMENT>,
                                                                          viennacl::matrix<TYPE, F3, ALIGNMENT> >::ResultType    KernelClass;
      KernelClass::init();

      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "prod_TA");

      k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size1(), block_size));
      k.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size2(), block_size));
      k.local_work_size(0, block_size);
      k.local_work_size(1, block_size);

      viennacl::ocl::enqueue(
              k(A.lhs(), cl_uint(0), cl_uint(0),
                cl_uint(A.lhs().size1()), cl_uint(A.lhs().size2()),
                cl_uint(A.lhs().internal_size1()), cl_uint(A.lhs().internal_size2()),
                B, cl_uint(0), cl_uint(0),
                cl_uint(B.size1()), cl_uint(B.size2()),
                cl_uint(B.internal_size1()), cl_uint(B.internal_size2()),
                C, cl_uint(0), cl_uint(0),
                cl_uint(C.size1()), cl_uint(C.size2()),
                cl_uint(C.internal_size1()), cl_uint(C.internal_size2()),
                viennacl::ocl::local_mem(sizeof(TYPE) * block_size * block_size),
                viennacl::ocl::local_mem(sizeof(TYPE) * block_size * block_size) ));
    }

    /** @brief Carries out the matrix-matrix product C = trans(A) * B for sub-matrices given by matrix_range objects
    */
    template <typename M1, typename M2, typename M3>
    void prod_impl(const viennacl::matrix_expression< const matrix_range<M1>,
                                                      const matrix_range<M1>,
                                                      op_trans> & A_trans,
                   const viennacl::matrix_range<M2> & B,
                   viennacl::matrix_range<M3> & C)
    {
      typedef typename M1::value_type::value_type   value_type;

      assert(A_trans.size2() == C.size1());
      assert(A_trans.size1() == B.size1());
      assert(B.size2() == C.size2());

      assert(C.get().handle() != A_trans.lhs().get().handle()
             && C.get().handle() != B.get().handle()
             && "No direct inplace matrix-matrix product possible. Introduce a temporary!");

      int block_size = 15;

      typedef typename viennacl::tools::MATRIX_PROD_KERNEL_CLASS_DEDUCER< M1, M2, M3 >::ResultType    KernelClass;
      KernelClass::init();

      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "prod_TA");

      k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size1(), block_size));
      k.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size2(), block_size));
      k.local_work_size(0, block_size);
      k.local_work_size(1, block_size);

      const matrix_range<M1> & A = A_trans.lhs();
      viennacl::ocl::enqueue(
              k(A.get(), cl_uint(A.start1()), cl_uint(A.start2()),
                cl_uint(A.size1()), cl_uint(A.size2()),
                cl_uint(A.get().internal_size1()), cl_uint(A.get().internal_size2()),
                B.get(), cl_uint(B.start1()), cl_uint(B.start2()),
                cl_uint(B.size1()), cl_uint(B.size2()),
                cl_uint(B.get().internal_size1()), cl_uint(B.get().internal_size2()),
                C.get(), cl_uint(C.start1()), cl_uint(C.start2()),
                cl_uint(C.size1()), cl_uint(C.size2()),
                cl_uint(C.get().internal_size1()), cl_uint(C.get().internal_size2()),
                viennacl::ocl::local_mem(sizeof(value_type) * block_size * block_size),
                viennacl::ocl::local_mem(sizeof(value_type) * block_size * block_size) ));
    }

    /** @brief Carries out the matrix-matrix product C = A * trans(B)
    */
    template<class TYPE, typename F1, typename F2, typename F3, unsigned int ALIGNMENT>
    void prod_impl(const viennacl::matrix<TYPE, F1, ALIGNMENT> & A,
                   const viennacl::matrix_expression< const matrix<TYPE, F2, ALIGNMENT>,
                                                      const matrix<TYPE, F2, ALIGNMENT>,
                                                      op_trans> & B,
                   viennacl::matrix<TYPE, F3, ALIGNMENT> & C)
    {
      assert(A.size1() == C.size1());
      assert(A.size2() == B.size2());
      assert(B.size1() == C.size2());

      assert(C.handle() != A.handle()
             && C.handle() != B.lhs().handle()
             && "No direct inplace matrix-matrix product possible. Introduce a temporary!");

      int block_size = 15;

      typedef typename viennacl::tools::MATRIX_PROD_KERNEL_CLASS_DEDUCER< viennacl::matrix<TYPE, F1, ALIGNMENT>,
                                                                          viennacl::matrix<TYPE, F2, ALIGNMENT>,
                                                                          viennacl::matrix<TYPE, F3, ALIGNMENT> >::ResultType    KernelClass;
      KernelClass::init();

      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "prod_AT");

      k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size1(), block_size));
      k.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size2(), block_size));
      k.local_work_size(0, block_size);
      k.local_work_size(1, block_size);

      viennacl::ocl::enqueue(
              k(A, cl_uint(0), cl_uint(0),
                cl_uint(A.size1()), cl_uint(A.size2()),
                cl_uint(A.internal_size1()), cl_uint(A.internal_size2()),
                B.lhs(), cl_uint(0), cl_uint(0),
                cl_uint(B.lhs().size1()), cl_uint(B.lhs().size2()),
                cl_uint(B.lhs().internal_size1()), cl_uint(B.lhs().internal_size2()),
                C, cl_uint(0), cl_uint(0),
                cl_uint(C.size1()), cl_uint(C.size2()),
                cl_uint(C.internal_size1()), cl_uint(C.internal_size2()),
                viennacl::ocl::local_mem(sizeof(TYPE) * block_size * block_size),
                viennacl::ocl::local_mem(sizeof(TYPE) * block_size * block_size) ));
    }

    /** @brief Carries out the matrix-matrix product C = A * trans(B) for sub-matrices given by matrix_range objects
    */
    template <typename M1, typename M2, typename M3>
    void prod_impl(const viennacl::matrix_range<M1> & A,
                   const viennacl::matrix_expression< const matrix_range<M2>,
                                                      const matrix_range<M2>,
                                                      op_trans> & B_trans,
                   viennacl::matrix_range<M3> & C)
    {
      typedef typename M1::value_type::value_type   value_type;

      assert(A.size1() == C.size1());
      assert(A.size2() == B_trans.size2());
      assert(B_trans.size1() == C.size2());

      assert(C.get().handle() != A.get().handle()
             && C.get().handle() != B_trans.lhs().get().handle()
             && "No direct inplace matrix-matrix product possible. Introduce a temporary!");

      int block_size = 15;

      typedef typename viennacl::tools::MATRIX_PROD_KERNEL_CLASS_DEDUCER< M1, M2, M3 >::ResultType    KernelClass;
      KernelClass::init();

      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "prod_AT");

      k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size1(), block_size));
      k.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size2(), block_size));
      k.local_work_size(0, block_size);
      k.local_work_size(1, block_size);

      const matrix_range<M2> & B = B_trans.lhs();
      viennacl::ocl::enqueue(
              k(A.get(), cl_uint(A.start1()), cl_uint(A.start2()),
                cl_uint(A.size1()), cl_uint(A.size2()),
                cl_uint(A.get().internal_size1()), cl_uint(A.get().internal_size2()),
                B.get(), cl_uint(B.start1()), cl_uint(B.start2()),
                cl_uint(B.size1()), cl_uint(B.size2()),
                cl_uint(B.get().internal_size1()), cl_uint(B.get().internal_size2()),
                C.get(), cl_uint(C.start1()), cl_uint(C.start2()),
                cl_uint(C.size1()), cl_uint(C.size2()),
                cl_uint(C.get().internal_size1()), cl_uint(C.get().internal_size2()),
                viennacl::ocl::local_mem(sizeof(value_type) * block_size * block_size),
                viennacl::ocl::local_mem(sizeof(value_type) * block_size * block_size) ));
    }

    /** @brief Carries out the matrix-matrix product C = trans(A) * trans(B)
    */
    template<class TYPE, typename F1, typename F2, typename F3, unsigned int ALIGNMENT>
    void prod_impl(const viennacl::matrix_expression< const matrix<TYPE, F1, ALIGNMENT>,
                                                      const matrix<TYPE, F1, ALIGNMENT>,
                                                      op_trans> & A,
                   const viennacl::matrix_expression< const matrix<TYPE, F2, ALIGNMENT>,
                                                      const matrix<TYPE, F2, ALIGNMENT>,
                                                      op_trans> & B,
                   viennacl::matrix<TYPE, F3, ALIGNMENT> & C)
    {
      assert(A.size2() == C.size1());
      assert(A.size1() == B.size2());
      assert(B.size1() == C.size2());

      assert(C.handle() != A.lhs().handle()
             && C.handle() != B.lhs().handle()
             && "No direct inplace matrix-matrix product possible. Introduce a temporary!");

      int block_size = 15;

      typedef typename viennacl::tools::MATRIX_PROD_KERNEL_CLASS_DEDUCER< viennacl::matrix<TYPE, F1, ALIGNMENT>,
                                                                          viennacl::matrix<TYPE, F2, ALIGNMENT>,
                                                                          viennacl::matrix<TYPE, F3, ALIGNMENT> >::ResultType    KernelClass;
      KernelClass::init();

      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "prod_TT");

      k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size1(), block_size));
      k.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size2(), block_size));
      k.local_work_size(0, block_size);
      k.local_work_size(1, block_size);

      viennacl::ocl::enqueue(
              k(A.lhs(), cl_uint(0), cl_uint(0),
                cl_uint(A.lhs().size1()), cl_uint(A.lhs().size2()),
                cl_uint(A.lhs().internal_size1()), cl_uint(A.lhs().internal_size2()),
                B.lhs(), cl_uint(0), cl_uint(0),
                cl_uint(B.lhs().size1()), cl_uint(B.lhs().size2()),
                cl_uint(B.lhs().internal_size1()), cl_uint(B.lhs().internal_size2()),
                C, cl_uint(0), cl_uint(0),
                cl_uint(C.size1()), cl_uint(C.size2()),
                cl_uint(C.internal_size1()), cl_uint(C.internal_size2()),
                viennacl::ocl::local_mem(sizeof(TYPE) * block_size * block_size),
                viennacl::ocl::local_mem(sizeof(TYPE) * block_size * block_size) ));
    }

    /** @brief Carries out the matrix-matrix product C = trans(A) * trans(B) for sub-matrices given by matrix_range objects
    */
    template <typename M1, typename M2, typename M3>
    void prod_impl(const viennacl::matrix_expression< const matrix_range<M1>,
                                                      const matrix_range<M1>,
                                                      op_trans> & A_trans,
                   const viennacl::matrix_expression< const matrix_range<M2>,
                                                      const matrix_range<M2>,
                                                      op_trans> & B_trans,
                   viennacl::matrix_range<M3> & C)
    {
      typedef typename M1::value_type::value_type   value_type;

      assert(A_trans.size2() == C.size1());
      assert(A_trans.size1() == B_trans.size2());
      assert(B_trans.size1() == C.size2());

      assert(C.get().handle() != A_trans.lhs().get().handle()
             && C.get().handle() != B_trans.lhs().get().handle()
             && "No direct inplace matrix-matrix product possible. Introduce a temporary!");

      int block_size = 15;

      typedef typename viennacl::tools::MATRIX_PROD_KERNEL_CLASS_DEDUCER< M1, M2, M3 >::ResultType    KernelClass;
      KernelClass::init();

      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "prod_TT");

      k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size1(), block_size));
      k.global_work_size(1, viennacl::tools::roundUpToNextMultiple<unsigned int>(C.size2(), block_size));
      k.local_work_size(0, block_size);
      k.local_work_size(1, block_size);

      const matrix_range<M1> & A = A_trans.lhs();
      const matrix_range<M2> & B = B_trans.lhs();
      viennacl::ocl::enqueue(
              k(A.get(), cl_uint(A.start1()), cl_uint(A.start2()),
                cl_uint(A.size1()), cl_uint(A.size2()),
                cl_uint(A.get().internal_size1()), cl_uint(A.get().internal_size2()),
                B.get(), cl_uint(B.start1()), cl_uint(B.start2()),
                cl_uint(B.size1()), cl_uint(B.size2()),
                cl_uint(B.get().internal_size1()), cl_uint(B.get().internal_size2()),
                C.get(), cl_uint(C.start1()), cl_uint(C.start2()),
                cl_uint(C.size1()), cl_uint(C.size2()),
                cl_uint(C.get().internal_size1()), cl_uint(C.get().internal_size2()),
                viennacl::ocl::local_mem(sizeof(value_type) * block_size * block_size),
                viennacl::ocl::local_mem(sizeof(value_type) * block_size * block_size) ));
    }

    /** @brief Returns a lazily evaluated proxy describing the outer product vec1 * trans(vec2)
    */
    template<class SCALARTYPE, unsigned int VA1, unsigned int VA2>
    viennacl::matrix_expression< const viennacl::vector<SCALARTYPE, VA1>,
                                 const viennacl::vector<SCALARTYPE, VA2>,
                                 op_prod> outer_prod(const viennacl::vector<SCALARTYPE, VA1> & vec1,
                                                     const viennacl::vector<SCALARTYPE, VA2> & vec2)
    {
      return viennacl::matrix_expression< const viennacl::vector<SCALARTYPE, VA1>,
                                          const viennacl::vector<SCALARTYPE, VA2>,
                                          op_prod>(vec1, vec2);
    }

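    // Minimal usage sketch: the proxy returned by outer_prod() is consumed by the matrix operator
    // overloads, which then call rank_1_update() / scaled_rank_1_update() below:
    //
    //   viennacl::matrix<float> A(v1.size(), v2.size());
    //   A += viennacl::linalg::outer_prod(v1, v2);   // rank-1 update A += v1 * trans(v2)
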
    /** @brief Carries out the rank-1 update mat1 += vec1 * trans(vec2)
    */
    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT>
    void rank_1_update(viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & mat1,
                       const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1,
                       const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2)
    {
      assert(mat1.size1() == vec1.size());
      assert(mat1.size2() == vec2.size());

      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;

      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "rank1_update");

      viennacl::ocl::enqueue(k(mat1, cl_uint(mat1.size1()), cl_uint(mat1.size2()),
                               cl_uint(mat1.internal_size1()), cl_uint(mat1.internal_size2()), vec1, vec2));
    }

    /** @brief Carries out the scaled rank-1 update mat1 += val * vec1 * trans(vec2)
    */
    template<class SCALARTYPE, typename F, unsigned int ALIGNMENT>
    void scaled_rank_1_update(viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & mat1,
                              SCALARTYPE val,
                              const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1,
                              const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2)
    {
      assert(mat1.size1() == vec1.size());
      assert(mat1.size2() == vec2.size());

      typedef typename viennacl::tools::MATRIX_KERNEL_CLASS_DEDUCER< matrix<SCALARTYPE, F, ALIGNMENT> >::ResultType    KernelClass;

      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(KernelClass::program_name(), "scaled_rank1_update");

      viennacl::ocl::enqueue(k(mat1, cl_uint(mat1.size1()), cl_uint(mat1.size2()),
                               cl_uint(mat1.internal_size1()), cl_uint(mat1.internal_size2()),
                               val, vec1, vec2));
    }

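    // Minimal usage sketch (hedged: whether the scaled expression form is accepted depends on the
    // operator overloads available in the ViennaCL version in use); a scaled outer-product update of
    // the form mat1 += alpha * vec1 * trans(vec2) is expected to map to scaled_rank_1_update():
    //
    //   A += alpha * viennacl::linalg::outer_prod(v1, v2);   // scaled_rank_1_update(A, alpha, v1, v2)
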
  } //namespace linalg


  /** @brief Implementation of the operation v1 = A * v2, where A is a matrix
  */
  template <typename SCALARTYPE, unsigned int ALIGNMENT>
  template <typename F, unsigned int MAT_ALIGNMENT>
  viennacl::vector<SCALARTYPE, ALIGNMENT> &
  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator=(const viennacl::vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
                                                                                         const viennacl::vector<SCALARTYPE, ALIGNMENT>,
                                                                                         viennacl::op_prod> & proxy)
  {
    // if this vector also appears on the right-hand side, compute into a temporary first
    if (proxy.rhs().handle() == this->handle())
    {
      viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
      *this = result;
    }
    else
      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
    return *this;
  }

  /** @brief Implementation of the operation v1 += A * v2, where A is a matrix
  */
  template <typename SCALARTYPE, unsigned int ALIGNMENT>
  template <typename F, unsigned int MAT_ALIGNMENT>
  viennacl::vector<SCALARTYPE, ALIGNMENT> &
  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+=(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
                                                                                const vector<SCALARTYPE, ALIGNMENT>,
                                                                                op_prod> & proxy)
  {
    vector<SCALARTYPE, ALIGNMENT> result(proxy.lhs().size1());
    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
    *this += result;
    return *this;
  }

  /** @brief Implementation of the operation v1 -= A * v2, where A is a matrix
  */
  template <typename SCALARTYPE, unsigned int ALIGNMENT>
  template <typename F, unsigned int MAT_ALIGNMENT>
  viennacl::vector<SCALARTYPE, ALIGNMENT> &
  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-=(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
                                                                                const vector<SCALARTYPE, ALIGNMENT>,
                                                                                op_prod> & proxy)
  {
    vector<SCALARTYPE, ALIGNMENT> result(proxy.lhs().size1());
    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
    *this -= result;
    return *this;
  }

  /** @brief Implementation of the operation 'result = v1 + A * v2', where A is a matrix
  */
  template <typename SCALARTYPE, unsigned int ALIGNMENT>
  template <typename F, unsigned int MAT_ALIGNMENT>
  viennacl::vector<SCALARTYPE, ALIGNMENT>
  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
                                                                               const vector<SCALARTYPE, ALIGNMENT>,
                                                                               op_prod> & proxy)
  {
    assert(proxy.lhs().size1() == size());
    vector<SCALARTYPE, ALIGNMENT> result(size());
    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
    result += *this;
    return result;
  }

  /** @brief Implementation of the operation 'result = v1 - A * v2', where A is a matrix
  */
  template <typename SCALARTYPE, unsigned int ALIGNMENT>
  template <typename F, unsigned int MAT_ALIGNMENT>
  viennacl::vector<SCALARTYPE, ALIGNMENT>
  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-(const vector_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
                                                                               const vector<SCALARTYPE, ALIGNMENT>,
                                                                               op_prod> & proxy)
  {
    assert(proxy.lhs().size1() == size());
    vector<SCALARTYPE, ALIGNMENT> result(size());
    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
    result = *this - result;
    return result;
  }


  //transposed variants:

  /** @brief Implementation of the operation v1 = trans(A) * v2, where A is a matrix
  */
  template <typename SCALARTYPE, unsigned int ALIGNMENT>
  template <typename F, unsigned int MAT_ALIGNMENT>
  viennacl::vector<SCALARTYPE, ALIGNMENT> &
  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator=(const viennacl::vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
                                                                                                                  const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
                                                                                                                  op_trans>,
                                                                                         const viennacl::vector<SCALARTYPE, ALIGNMENT>,
                                                                                         viennacl::op_prod> & proxy)
  {
    // if this vector also appears on the right-hand side, compute into a temporary first
    if (proxy.rhs().handle() == this->handle())
    {
      viennacl::vector<SCALARTYPE, ALIGNMENT> result(proxy.rhs().size());
      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
      *this = result;
    }
    else
      viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
    return *this;
  }

  /** @brief Implementation of the operation v1 += trans(A) * v2, where A is a matrix
  */
  template <typename SCALARTYPE, unsigned int ALIGNMENT>
  template <typename F, unsigned int MAT_ALIGNMENT>
  viennacl::vector<SCALARTYPE, ALIGNMENT> &
  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+=(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
                                                                                                         const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
                                                                                                         op_trans>,
                                                                                const vector<SCALARTYPE, ALIGNMENT>,
                                                                                op_prod> & proxy)
  {
    vector<SCALARTYPE, ALIGNMENT> result(proxy.lhs().size1());
    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
    *this += result;
    return *this;
  }

  /** @brief Implementation of the operation v1 -= trans(A) * v2, where A is a matrix
  */
  template <typename SCALARTYPE, unsigned int ALIGNMENT>
  template <typename F, unsigned int MAT_ALIGNMENT>
  viennacl::vector<SCALARTYPE, ALIGNMENT> &
  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-=(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
                                                                                                         const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
                                                                                                         op_trans>,
                                                                                const vector<SCALARTYPE, ALIGNMENT>,
                                                                                op_prod> & proxy)
  {
    vector<SCALARTYPE, ALIGNMENT> result(proxy.lhs().size1());
    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
    *this -= result;
    return *this;
  }

  /** @brief Implementation of the operation 'result = v1 + trans(A) * v2', where A is a matrix
  */
  template <typename SCALARTYPE, unsigned int ALIGNMENT>
  template <typename F, unsigned int MAT_ALIGNMENT>
  viennacl::vector<SCALARTYPE, ALIGNMENT>
  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator+(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
                                                                                                        const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
                                                                                                        op_trans>,
                                                                               const vector<SCALARTYPE, ALIGNMENT>,
                                                                               op_prod> & proxy)
  {
    assert(proxy.lhs().size1() == size());
    vector<SCALARTYPE, ALIGNMENT> result(size());
    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
    result += *this;
    return result;
  }

  /** @brief Implementation of the operation 'result = v1 - trans(A) * v2', where A is a matrix
  */
  template <typename SCALARTYPE, unsigned int ALIGNMENT>
  template <typename F, unsigned int MAT_ALIGNMENT>
  viennacl::vector<SCALARTYPE, ALIGNMENT>
  viennacl::vector<SCALARTYPE, ALIGNMENT>::operator-(const vector_expression< const matrix_expression< const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
                                                                                                        const matrix<SCALARTYPE, F, MAT_ALIGNMENT>,
                                                                                                        op_trans>,
                                                                               const vector<SCALARTYPE, ALIGNMENT>,
                                                                               op_prod> & proxy)
  {
    assert(proxy.lhs().size1() == size());
    vector<SCALARTYPE, ALIGNMENT> result(size());
    viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
    result = *this - result;
    return result;
  }

} //namespace viennacl


#endif