ViennaCL - The Vienna Computing Library: /data/development/ViennaCL/dev/viennacl/linalg/detail/spai/qr.hpp Source File

Go to the documentation of this file.
00001 #ifndef VIENNACL_LINALG_DETAIL_SPAI_QR_HPP
00002 #define VIENNACL_LINALG_DETAIL_SPAI_QR_HPP
00003 
00004 /* =========================================================================
00005    Copyright (c) 2010-2011, Institute for Microelectronics,
00006                             Institute for Analysis and Scientific Computing,
00007                             TU Wien.
00008 
00009                             -----------------
00010                   ViennaCL - The Vienna Computing Library
00011                             -----------------
00012 
00013    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
00014                
00015    (A list of authors and contributors can be found in the PDF manual)
00016 
00017    License:         MIT (X11), see file LICENSE in the base directory
00018 ============================================================================= */
00019 
00026 #include <utility>
00027 #include <iostream>
00028 #include <fstream>
00029 #include <string>
00030 #include <algorithm>
00031 #include <vector>
00032 #include <math.h>
00033 #include <cmath>
00034 #include <sstream>
00035 #include "viennacl/ocl/backend.hpp"
00036 #include "boost/numeric/ublas/vector.hpp"
00037 #include "boost/numeric/ublas/matrix.hpp"
00038 #include "boost/numeric/ublas/matrix_proxy.hpp"
00039 #include "boost/numeric/ublas/storage.hpp"
00040 #include "boost/numeric/ublas/io.hpp"
00041 #include "boost/numeric/ublas/matrix_expression.hpp"
00042 #include "boost/numeric/ublas/detail/matrix_assign.hpp"
00043 //#include "boost/thread/thread.hpp"
00044 
00045 #include "viennacl/vector.hpp"
00046 #include "viennacl/matrix.hpp"
00047 
00048 #include "viennacl/linalg/detail/spai/block_matrix.hpp"
00049 #include "viennacl/linalg/detail/spai/block_vector.hpp"
00050 #include "viennacl/linalg/kernels/spai_source.h"
00051 #include "viennacl/linalg/kernels/spai_kernels.h"
00052 
00053 namespace viennacl
00054 {
00055     namespace linalg
00056     {
00057       namespace detail
00058       {
00059         namespace spai
00060         {
00061         
00062           
00063           
00064           //********** DEBUG FUNCTIONS *****************//
00065           template< typename T, typename InputIterator>
00066           void Print(std::ostream& ostr, InputIterator it_begin, InputIterator it_end){
00067               //std::ostream_iterator<int> it_os(ostr, delimiter);
00068               std::string delimiters = " ";
00069               std::copy(it_begin, it_end, std::ostream_iterator<T>(ostr, delimiters.c_str()));
00070               ostr<<std::endl;
00071           }
00072           
00073           template<typename VectorType, typename MatrixType>
00074           void write_to_block(VectorType& con_A_I_J, unsigned int start_ind,  const std::vector<unsigned int>& I, const std::vector<unsigned int>& J, MatrixType& m){
00075               m.resize(I.size(), J.size(), false);
00076               for(size_t i = 0; i < J.size(); ++i){
00077                   for(size_t j = 0; j < I.size(); ++j){
00078                       m(j,i) = con_A_I_J[start_ind + i*I.size() + j];
00079                   }
00080               }
00081           }
00082           
00083           template<typename VectorType>
00084           void print_continious_matrix(VectorType& con_A_I_J, std::vector<cl_uint>& blocks_ind,
00085                                       const std::vector<std::vector<unsigned int> >& g_I, const std::vector<std::vector<unsigned int> >& g_J){
00086               typedef typename VectorType::value_type ScalarType;
00087               std::vector<boost::numeric::ublas::matrix<ScalarType> > com_A_I_J(g_I.size());
00088               for(size_t i = 0; i < g_I.size(); ++i){
00089                   write_to_block( con_A_I_J, blocks_ind[i], g_I[i], g_J[i], com_A_I_J[i]);
00090                   std::cout<<com_A_I_J[i]<<std::endl;
00091               }
00092           }
00093           template<typename VectorType>
00094           void print_continious_vector(VectorType& con_v, std::vector<cl_uint>& block_ind, const std::vector<std::vector<unsigned int> >& g_J){
00095               typedef typename VectorType::value_type ScalarType;
00096               std::vector<boost::numeric::ublas::vector<ScalarType> > com_v(g_J.size());
00097               //Print<ScalarType>(std::cout, con_v.begin(), con_v.end());
00098               for(size_t i = 0; i < g_J.size(); ++i){
00099                   com_v[i].resize(g_J[i].size());
00100                   for(size_t j = 0; j < g_J[i].size(); ++j){
00101                       com_v[i](j) = con_v[block_ind[i] + j];
00102                   }
00103                   std::cout<<com_v[i]<<std::endl;
00104               }
00105           }
00106           
00108 
00115           void compute_blocks_size(const std::vector<std::vector<unsigned int> >& g_I, const std::vector<std::vector<unsigned int> >& g_J, 
00116                                   unsigned int& sz, std::vector<cl_uint>& blocks_ind, std::vector<cl_uint>& matrix_dims){
00117               sz = 0;
00118               for(size_t i = 0; i < g_I.size(); ++i){
00119                   sz += static_cast<unsigned int>(g_I[i].size()*g_J[i].size());
00120                   matrix_dims[2*i] = static_cast<cl_uint>(g_I[i].size());
00121                   matrix_dims[2*i + 1] = static_cast<cl_uint>(g_J[i].size());
00122                   blocks_ind[i+1] = blocks_ind[i] + static_cast<cl_uint>(g_I[i].size()*g_J[i].size());
00123                   
00124               }
00125           }
00130           void get_size(const std::vector<std::vector<unsigned int> >& inds, unsigned int& size){
00131               size = 0;
00132               for (size_t i = 0; i < inds.size(); ++i) {
00133                   size += static_cast<unsigned int>(inds[i].size());
00134               }
00135           }
00136           
00141           void init_start_inds(const std::vector<std::vector<unsigned int> >& inds, std::vector<cl_uint>& start_inds){
00142               for(size_t i = 0; i < inds.size(); ++i){
00143                   start_inds[i+1] = start_inds[i] + static_cast<cl_uint>(inds[i].size());
00144               }
00145           }
00146 
00147           //*************************************  QR FUNCTIONS  ***************************************//
00153           template<typename MatrixType, typename ScalarType>
00154           void dot_prod(const MatrixType& A,  unsigned int beg_ind, ScalarType& res){
00155               res = static_cast<ScalarType>(0);
00156               for(size_t i = beg_ind; i < A.size1(); ++i){
00157                   res += A(i, beg_ind-1)*A(i, beg_ind-1);
00158               }
00159           }
00167           template<typename MatrixType, typename VectorType, typename ScalarType>
00168           void custom_inner_prod(const MatrixType& A, const VectorType& v, unsigned int col_ind, unsigned int start_ind, ScalarType& res){
00169               res = static_cast<ScalarType>(0);
00170               for(unsigned int i = start_ind; i < static_cast<unsigned int>(A.size1()); ++i){
00171                   res += A(i, col_ind)*v(i);  
00172               }
00173           }
00174           
00180           template<typename MatrixType, typename VectorType>
00181           void copy_vector(const MatrixType & A, VectorType & v, const unsigned int beg_ind){
00182               for(unsigned int i = beg_ind; i < static_cast<unsigned int>(A.size1()); ++i){
00183                   v(i) = A( i, beg_ind-1);
00184               }
00185           }
00186           
00187           //householder reflection c.f. Gene H. Golub, Charles F. Van Loan "Matrix Computations" 3rd edition p.210
00194           template<typename MatrixType, typename VectorType, typename ScalarType>
00195           void householder_vector(const MatrixType& A, unsigned int j, VectorType& v, ScalarType& b){
00196               ScalarType sg;
00197               //
00198               dot_prod(A, j+1, sg); 
00199               copy_vector(A, v, j+1);
00200               ScalarType mu;
00201               v(j) = static_cast<ScalarType>(1.0);
00202               if(sg == 0){
00203                   b = 0;
00204               }
00205               else{
00206                   mu = std::sqrt(A(j,j)*A(j, j) + sg);
00207                   if(A(j, j) <= 0){
00208                       v(j) = A(j, j) - mu;
00209                   }else{
00210                       v(j) = -sg/(A(j, j) + mu);
00211                   }
00212                   b = 2*(v(j)*v(j))/(sg + v(j)*v(j));
00213                   v = v/v(j);
00214               }
00215           }
00222           template<typename MatrixType, typename VectorType, typename ScalarType>
00223           void apply_householder_reflection(MatrixType& A, unsigned int iter_cnt, VectorType& v, ScalarType b){
00224               //update every column of matrix A
00225               ScalarType in_prod_res;
00226               for(unsigned int i = iter_cnt; i < static_cast<unsigned int>(A.size2()); ++i){
00227                   //update each column in a fashion: ai = ai - b*v*(v'*ai)
00228                   custom_inner_prod(A, v, i, iter_cnt, in_prod_res);
00229                   for(unsigned int j = iter_cnt; j < static_cast<unsigned int>(A.size1()); ++j){
00230                       A(j, i) -= b*in_prod_res*v(j);
00231                   }
00232               }
00233           }
00234           
00240           template<typename MatrixType, typename VectorType>
00241           void store_householder_vector(MatrixType& A, unsigned int ind, VectorType& v){
00242               for(unsigned int i = ind; i < static_cast<unsigned int>(A.size1()); ++i){
00243                   A(i, ind-1) = v(i);
00244               }
00245           }
00246           
00247           
00248           //QR algorithm 
00253           template<typename MatrixType, typename VectorType>
00254           void single_qr(MatrixType& R, VectorType& b_v){
00255               typedef typename MatrixType::value_type ScalarType;
00256               if((R.size1() > 0) && (R.size2() > 0)){
00257                   VectorType v = (VectorType)boost::numeric::ublas::zero_vector<ScalarType>(R.size1());
00258                   b_v = (VectorType)boost::numeric::ublas::zero_vector<ScalarType>(R.size2());
00259                   for(unsigned int i = 0; i < static_cast<unsigned int>(R.size2()); ++i){
00260                       householder_vector(R, i, v, b_v[i]);
00261                       apply_householder_reflection(R, i, v, b_v[i]);
00262                       if(i < R.size1()) store_householder_vector(R, i+1, v);
00263                   }
00264               }
00265           }
00266           
00267           //********************** HELP FUNCTIONS FOR GPU-based QR factorization *************************//
00272           void read_kernel_from_file(std::string& file_name, std::string& kernel_source){
00273               std::ifstream ifs(file_name.c_str(), std::ifstream::in);
00274               
00275               if (!ifs)
00276                 std::cerr << "WARNING: Cannot open file " << file_name << std::endl;
00277               
00278               std::string line;
00279               std::ostringstream ost;
00280               while (std::getline(ifs, line)) {
00281                   ost<<line<<std::endl;
00282               }
00283               kernel_source = ost.str();
00284           }
00285           
00290           void get_max_block_size(const std::vector<std::vector<unsigned int> >& inds, unsigned int& max_size){
00291               max_size = 0;
00292               for(unsigned int i = 0; i < inds.size(); ++i){
00293                   if(inds[i].size() > max_size){
00294                       max_size = static_cast<unsigned int>(inds[i].size());
00295                   }
00296               }
00297           }
00298           
00305           template<typename MatrixType, typename VectorType, typename ScalarType>
00306           void custom_dot_prod(const MatrixType& A, const VectorType& v, unsigned int ind, ScalarType& res){
00307               res = static_cast<ScalarType>(0);
00308               for(unsigned int j = ind; j < A.size1(); ++j){
00309                   if(j == ind){
00310                       res += v(j);
00311                   }else{
00312                       res += A(j, ind)*v(j);
00313                   }
00314               }
00315           }
00316           
00322           template<typename MatrixType, typename VectorType>
00323           void apply_q_trans_vec(const MatrixType& R, const VectorType& b_v, VectorType& y){
00324               typedef typename MatrixType::value_type ScalarType;
00325               ScalarType inn_prod = static_cast<ScalarType>(0);
00326               for(size_t i = 0; i < R.size2(); ++i){
00327                   custom_dot_prod(R, y, static_cast<unsigned int>(i), inn_prod);
00328                   for(size_t j = i; j < R.size1(); ++j){
00329                       if(i == j){
00330                           y(j) -= b_v(i)*inn_prod;
00331                       }
00332                       else{
00333                           y(j) -= b_v(i)*inn_prod*R(j,i);
00334                       }
00335                   }
00336               }
00337           }
00338           
00344           template<typename MatrixType, typename VectorType>
00345           void apply_q_trans_mat(const MatrixType& R, const VectorType& b_v, MatrixType& A){
00346               VectorType tmp_v;
00347               for(size_t i = 0; i < A.size2(); ++i){
00348                   tmp_v = (VectorType)column(A,i);
00349                   apply_q_trans_vec(R, b_v, tmp_v);
00350                   column(A,i) = tmp_v;
00351               }
00352           }
00353           
00354           //parallel QR for GPU
00364           template<typename ScalarType>
00365           void block_qr(std::vector<std::vector<unsigned int> >& g_I, 
00366                         std::vector<std::vector<unsigned int> >& g_J, 
00367                         block_matrix& g_A_I_J_vcl,
00368                         block_vector& g_bv_vcl,
00369                         std::vector<cl_uint>& g_is_update,
00370                         const unsigned int cur_iter){
00371               //typedef typename MatrixType::value_type ScalarType;
00372               unsigned int bv_size;
00373               unsigned int v_size;
00374               //set up arguments for GPU
00375               //find maximum size of rows/columns
00376               unsigned int local_r_n, local_c_n;
00377               //find max size for blocks
00378               get_max_block_size(g_I, local_r_n);
00379               get_max_block_size(g_J, local_c_n);
00380               //get size
00381               get_size(g_J, bv_size);
00382               get_size(g_I, v_size);
00383               //get start indices
00384               std::vector<cl_uint> start_bv_inds(g_I.size() + 1, 0);
00385               std::vector<cl_uint> start_v_inds(g_I.size() + 1, 0);
00386               init_start_inds(g_J, start_bv_inds);
00387               init_start_inds(g_I, start_v_inds);
00388               //init arrays
00389               std::vector<ScalarType> b_v(bv_size, static_cast<ScalarType>(0));
00390               std::vector<ScalarType> v(v_size, static_cast<ScalarType>(0));
00391               //call qr program
00392               block_vector v_vcl;
00393               /*if(cur_iter == 0)
00394               {
00395                   //if first run - compile the program
00396                   std::string qr_kernel_file_name = "kernels/spai/qr3_a_n.cl";
00397                   std::string qr_kernel_source;
00398                   read_kernel_from_file(qr_kernel_file_name, qr_kernel_source);
00399                   viennacl::ocl::program & qr_prog = viennacl::ocl::current_context().add_program(qr_kernel_source.c_str(), "qr_kernel_source");
00400                   qr_prog.add_kernel("block_qr");
00401                   //
00402               }*/
00403               
00404               g_bv_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,  
00405                                                                       static_cast<unsigned int>(sizeof(ScalarType)*bv_size), 
00406                                                                       &(b_v[0]));
00407               
00408               v_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,  
00409                                                                                                   static_cast<unsigned int>(sizeof(ScalarType)*v_size), 
00410                                                                                                   &(v[0]));
00411               //the same as j_start_inds
00412               g_bv_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
00413                                                                                 static_cast<unsigned int>(sizeof(cl_uint)*g_I.size()), 
00414                                                                                 &(start_bv_inds[0]));
00415               
00416               v_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
00417                                                                                 static_cast<unsigned int>(sizeof(cl_uint)*g_I.size()), 
00418                                                                                 &(start_v_inds[0]));
00419               viennacl::ocl::handle<cl_mem> g_is_update_vcl = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
00420                                                                                 static_cast<unsigned int>(sizeof(cl_uint)*g_is_update.size()),
00421                                                                                 &(g_is_update[0]));
00422               //local memory
00423               //viennacl::ocl::enqueue(k(vcl_vec, size, viennacl::ocl::local_mem(sizeof(SCALARTYPE) * k.local_work_size()), temp));
00424               viennacl::ocl::kernel& qr_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai<ScalarType, 1>::program_name(), "block_qr");
00425               qr_kernel.local_work_size(0, local_c_n);
00426               qr_kernel.global_work_size(0, 256);
00427               viennacl::ocl::enqueue(qr_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle1(), g_bv_vcl.handle(), 
00428                                               v_vcl.handle(), g_A_I_J_vcl.handle2(), 
00429                                               g_bv_vcl.handle1(), v_vcl.handle1(), g_is_update_vcl,
00430                                               viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(ScalarType)*(local_r_n*local_c_n))),
00431                                               static_cast<cl_uint>(g_I.size())));
00432               
00433           }
00434         }
00435       }
00436     }
00437 }
00438 #endif