ViennaCL - The Vienna Computing Library: /data/development/ViennaCL/dev/viennacl/linalg/detail/spai/spai-dynamic.hpp Source File

Go to the documentation of this file.
00001 #ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_DYNAMIC_HPP
00002 #define VIENNACL_LINALG_DETAIL_SPAI_SPAI_DYNAMIC_HPP
00003 
00004 /* =========================================================================
00005    Copyright (c) 2010-2011, Institute for Microelectronics,
00006                             Institute for Analysis and Scientific Computing,
00007                             TU Wien.
00008 
00009                             -----------------
00010                   ViennaCL - The Vienna Computing Library
00011                             -----------------
00012 
00013    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
00014                
00015    (A list of authors and contributors can be found in the PDF manual)
00016 
00017    License:         MIT (X11), see file LICENSE in the base directory
00018 ============================================================================= */
00019 
00026 #include <utility>
00027 #include <iostream>
00028 #include <fstream>
00029 #include <string>
00030 #include <algorithm>
00031 #include <vector>
00032 #include <math.h>
00033 #include <map>
00034 //#include "block_matrix.hpp"
00035 //#include "block_vector.hpp"
00036 //#include "benchmark-utils.hpp"
00037 #include "boost/numeric/ublas/vector.hpp"
00038 #include "boost/numeric/ublas/matrix.hpp"
00039 #include "boost/numeric/ublas/matrix_proxy.hpp"
00040 #include "boost/numeric/ublas/vector_proxy.hpp"
00041 #include "boost/numeric/ublas/storage.hpp"
00042 #include "boost/numeric/ublas/io.hpp"
00043 #include "boost/numeric/ublas/lu.hpp"
00044 #include "boost/numeric/ublas/triangular.hpp"
00045 #include "boost/numeric/ublas/matrix_expression.hpp"
00046 // ViennaCL includes
00047 #include "viennacl/linalg/prod.hpp"
00048 #include "viennacl/matrix.hpp"
00049 #include "viennacl/compressed_matrix.hpp"
00050 #include "viennacl/linalg/compressed_matrix_operations.hpp"
00051 #include "viennacl/linalg/matrix_operations.hpp"
00052 #include "viennacl/scalar.hpp"
00053 #include "viennacl/linalg/cg.hpp"
00054 #include "viennacl/linalg/inner_prod.hpp"
00055 #include "viennacl/linalg/ilu.hpp"
00056 #include "viennacl/ocl/backend.hpp"
00057 
00058 #include "viennacl/linalg/detail/spai/block_matrix.hpp"
00059 #include "viennacl/linalg/detail/spai/block_vector.hpp"
00060 #include "viennacl/linalg/detail/spai/qr.hpp"
00061 #include "viennacl/linalg/detail/spai/spai_tag.hpp"
00062 #include "viennacl/linalg/kernels/spai_source.h"
00063 #include "viennacl/linalg/kernels/spai_kernels.h"
00064 
00065 namespace viennacl
00066 {
00067     namespace linalg
00068     {
00069       namespace detail
00070       {
00071         namespace spai
00072         {
00073         
00074           typedef std::pair<unsigned int, double> PairT;
00075           struct CompareSecond{
00076               bool operator()(const PairT& left, const PairT& right)
00077               {
00078                   return static_cast<double>(left.second) > static_cast<double>(right.second);
00079               }
00080           };
00081           
00082           
00089           template<typename SparseMatrixType, typename DenseMatrixType>
00090           void initProjectSubMatrix(const SparseMatrixType& A_in, const std::vector<unsigned int>& J, std::vector<unsigned int>& I,
00091                                     DenseMatrixType& A_out){
00092               typedef typename DenseMatrixType::value_type ScalarType;
00093               A_out.resize(I.size(), J.size(), false);
00094               for(size_t j = 0; j < J.size(); ++j){
00095                   for(size_t i = 0; i < I.size(); ++i){
00096                       A_out(i,j) = A_in(I[i],J[j]);
00097                   }
00098               }
00099           }
00100           
00105           bool isInIndexSet(const std::vector<unsigned int>& J, const unsigned int& ind){
00106               return (std::find(J.begin(), J.end(), ind) != J.end());
00107           }
00108           
00114           template<typename MatrixType>
00115           void composeNewR(const MatrixType& A, const MatrixType& R_n, MatrixType& R){
00116               typedef typename MatrixType::value_type ScalarType;
00117               size_t row_n = R_n.size1() - (A.size1() - R.size2()); 
00118               MatrixType C = boost::numeric::ublas::zero_matrix<ScalarType>(R.size1() + row_n, R.size2() + A.size2());
00119               //write original R to new Composite R
00120               boost::numeric::ublas::project(C, boost::numeric::ublas::range(0,R.size1()), boost::numeric::ublas::range(0, R.size2())) += R;
00121               //write upper part of Q'*A_I_\hatJ, all columns and number of rows that equals to R.size2()
00122               boost::numeric::ublas::project(C, boost::numeric::ublas::range(0, R.size2()), boost::numeric::ublas::range(R.size2(), 
00123                                                                                                                         R.size2() + A.size2())) += 
00124               boost::numeric::ublas::project(A, boost::numeric::ublas::range(0, R.size2()), boost::numeric::ublas::range(0, A.size2()));
00125               //adding decomposed(QR) block to Composite R
00126               if(R_n.size1() > 0 && R_n.size2() > 0)
00127                   boost::numeric::ublas::project(C, boost::numeric::ublas::range(R.size2(), R.size1() + row_n),
00128                                                 boost::numeric::ublas::range(R.size2(), R.size2() + A.size2())) += R_n;
00129               R = C;
00130           }
00131           
00136           template<typename VectorType>
00137           void composeNewVector(const VectorType& v_n, VectorType& v){
00138               typedef typename VectorType::value_type ScalarType;
00139               VectorType w  = boost::numeric::ublas::zero_vector<ScalarType>(v.size() + v_n.size());
00140               boost::numeric::ublas::project(w, boost::numeric::ublas::range(0, v.size())) += v;
00141               boost::numeric::ublas::project(w, boost::numeric::ublas::range(v.size(), v.size() + v_n.size())) += v_n;
00142               v = w;
00143           }
00144           
00149           template<typename SparseVectorType, typename ScalarType>
00150           void sparse_norm_2(const SparseVectorType& v, ScalarType& norm){
00151               for(typename SparseVectorType::const_iterator vec_it  = v.begin(); vec_it != v.end(); ++vec_it){
00152                   norm += (vec_it->second)*(vec_it->second);
00153               }
00154               norm = std::sqrt(norm);
00155           }
00156           
00162           template<typename SparseVectorType, typename ScalarType>
00163           void sparse_inner_prod(const SparseVectorType& v1, const SparseVectorType& v2, ScalarType& res_v){
00164               typename SparseVectorType::const_iterator v_it1 = v1.begin();
00165               typename SparseVectorType::const_iterator v_it2 = v2.begin();
00166               while((v_it1 != v1.end())&&(v_it2 != v2.end())){
00167                   if(v_it1->first == v_it2->first){
00168                       res_v += (v_it1->second)*(v_it2->second);
00169                       ++v_it1;
00170                       ++v_it2;
00171                   }
00172                   else if(v_it1->first < v_it2->first){
00173                       ++v_it1;
00174                   }
00175                   else 
00176                       ++v_it2;
00177                       
00178                   
00179               }
00180           }
00181           
00189           template <typename SparseVectorType, typename ScalarType>
00190           bool buildAugmentedIndexSet(const std::vector<SparseVectorType>& A_v_c, 
00191                                       const SparseVectorType& res,
00192                                       std::vector<unsigned int>& J,
00193                                       std::vector<unsigned int>& J_u,
00194                                       const spai_tag& tag){
00195               std::vector<std::pair<unsigned int, ScalarType> > p;
00196               size_t cur_size = 0;
00197               ScalarType inprod, norm2;
00198               //print_sparse_vector(res);
00199               for(typename SparseVectorType::const_iterator res_it = res.begin(); res_it != res.end(); ++res_it){
00200                   if(!isInIndexSet(J, res_it->first) && (std::abs(res_it->second) > tag.getResidualThreshold())){
00201                       inprod = norm2 = 0;
00202                       sparse_inner_prod(res, A_v_c[res_it->first], inprod);
00203                       sparse_norm_2(A_v_c[res_it->first], norm2);
00204                       p.push_back(std::pair<size_t, ScalarType>(res_it->first, (inprod*inprod)/(norm2*norm2)));
00205                   }
00206               }
00207               
00208               std::sort(p.begin(), p.end(), CompareSecond());
00209               while ((cur_size < J.size())&&(p.size() > 0)) {
00210                   J_u.push_back(p[0].first);
00211                   p.erase(p.begin());
00212                   cur_size++;
00213               }
00214               p.clear();
00215               return (cur_size > 0);
00216           }
00217           
00224           template<typename SparseVectorType>
00225           void buildNewRowSet(const std::vector<SparseVectorType>& A_v_c, const std::vector<unsigned int>& I, 
00226                               const std::vector<unsigned int>& J_n, std::vector<unsigned int>& I_n){
00227               for(size_t i = 0; i < J_n.size(); ++i){
00228                   for(typename SparseVectorType::const_iterator col_it = A_v_c[J_n[i]].begin(); col_it!=A_v_c[J_n[i]].end(); ++col_it){
00229                       if(!isInIndexSet(I, col_it->first)&&!isInIndexSet(I_n, col_it->first)){
00230                           I_n.push_back(col_it->first);
00231                       }
00232                   }
00233               }
00234           }
00235           
00241           template<typename MatrixType>
00242           void QRBlockComposition(const MatrixType& A_I_J, const MatrixType& A_I_J_u, MatrixType& A_I_u_J_u){
00243               typedef typename MatrixType::value_type ScalarType;
00244               size_t row_n1 = A_I_J_u.size1() - A_I_J.size2();
00245               size_t row_n2 = A_I_u_J_u.size1();
00246               size_t row_n = row_n1 + row_n2;
00247               size_t col_n = A_I_J_u.size2();
00248               MatrixType C = boost::numeric::ublas::zero_matrix<ScalarType>(row_n, col_n);
00249               boost::numeric::ublas::project(C, boost::numeric::ublas::range(0, row_n1), boost::numeric::ublas::range(0, col_n)) += 
00250               boost::numeric::ublas::project(A_I_J_u, boost::numeric::ublas::range(A_I_J.size2(), A_I_J_u.size1()),
00251                                             boost::numeric::ublas::range(0, col_n));
00252                                             
00253               boost::numeric::ublas::project(C, boost::numeric::ublas::range(row_n1, row_n1 + row_n2),
00254                                             boost::numeric::ublas::range(0, col_n)) += A_I_u_J_u;
00255               A_I_u_J_u = C;
00256           }
00257           
00269           template<typename SparseMatrixType, typename SparseVectorType, typename DenseMatrixType, typename VectorType>
00270           void block_update(const SparseMatrixType& A, const std::vector<SparseVectorType>& A_v_c,
00271                             std::vector<SparseVectorType>& g_res,
00272                             std::vector<bool>& g_is_update,
00273                             std::vector<std::vector<unsigned int> >& g_I,
00274                             std::vector<std::vector<unsigned int> >& g_J, 
00275                             std::vector<VectorType>& g_b_v,
00276                             std::vector<DenseMatrixType>& g_A_I_J,
00277                             spai_tag const & tag){
00278               typedef typename DenseMatrixType::value_type ScalarType;
00279               //set of new column indices
00280               std::vector<std::vector<unsigned int> > g_J_u(g_J.size());
00281               //set of new row indices 
00282               std::vector<std::vector<unsigned int> > g_I_u(g_J.size());
00283               //matrix A(I, \tilde J), cf. Kallischko p.31-32
00284               std::vector<DenseMatrixType> g_A_I_J_u(g_J.size());
00285               //matrix A(\tilde I, \tilde J), cf. Kallischko
00286               std::vector<DenseMatrixType> g_A_I_u_J_u(g_J.size());
00287               //new vector of beta coefficients from QR factorization
00288               std::vector<VectorType> g_b_v_u(g_J.size());
00289 #ifdef _OPENMP
00290               #pragma omp parallel for
00291 #endif              
00292               for(std::size_t i = 0; i < g_J.size(); ++i){
00293                   if(g_is_update[i]){
00294                       if(buildAugmentedIndexSet<SparseVectorType, ScalarType>(A_v_c, g_res[i], g_J[i], g_J_u[i], tag)){
00295                           //initialize matrix A_I_\hatJ
00296                           initProjectSubMatrix(A, g_J_u[i], g_I[i], g_A_I_J_u[i]);
00297                           //multiplication of Q'*A_I_\hatJ
00298                           apply_q_trans_mat(g_A_I_J[i], g_b_v[i], g_A_I_J_u[i]);
00299                           //building new rows index set \hatI
00300                           buildNewRowSet(A_v_c, g_I[i], g_J_u[i], g_I_u[i]);
00301                           initProjectSubMatrix(A, g_J_u[i], g_I_u[i], g_A_I_u_J_u[i]);
00302                           //composition of block for new QR factorization
00303                           QRBlockComposition(g_A_I_J[i], g_A_I_J_u[i], g_A_I_u_J_u[i]);
00304                           //QR factorization
00305                           single_qr(g_A_I_u_J_u[i], g_b_v_u[i]);
00306                           //composition of new R and new vector b_v
00307                           composeNewR(g_A_I_J_u[i], g_A_I_u_J_u[i], g_A_I_J[i]);
00308                           composeNewVector(g_b_v_u[i], g_b_v[i]);
00309                           //composition of new sets: I and J
00310                           g_J[i].insert(g_J[i].end(), g_J_u[i].begin(), g_J_u[i].end());
00311                           g_I[i].insert(g_I[i].end(), g_I_u[i].begin(), g_I_u[i].end());
00312                       }else{
00313                           g_is_update[i] = false;
00314                       }
00315                   }
00316               }
00317           }
00318           /**************************************************** GPU SPAI Update ****************************************************************/
00319           
00320           
00321           //performs Q'*A(I, \tilde J) on GPU
00331           template<typename ScalarType>
00332           void block_q_multiplication(const std::vector<std::vector<unsigned int> >& g_J_u, 
00333                                       const std::vector<std::vector<unsigned int> >& g_I, 
00334                                       block_matrix& g_A_I_J_vcl, 
00335                                       block_vector& g_bv_vcl, 
00336                                       block_matrix& g_A_I_J_u_vcl, 
00337                                       std::vector<cl_uint>& g_is_update,
00338                                       const unsigned int cur_iter){
00339               unsigned int local_r_n, local_c_n, sz_blocks;
00340               get_max_block_size(g_I, local_r_n);
00341               get_max_block_size(g_J_u, local_c_n);
00342               //for debug 
00343               std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
00344               std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
00345               compute_blocks_size(g_I, g_J_u, sz_blocks, blocks_ind, matrix_dims);
00346               std::vector<ScalarType> con_A_I_J(sz_blocks, static_cast<ScalarType>(0));
00347               /*if(cur_iter == 1){
00348                   //if first run - compile the program
00349                   std::string block_q_kernel_file_name = "kernels/spai/block_q.cl";
00350                   std::string block_q_kernel_source;
00351                   read_kernel_from_file(block_q_kernel_file_name, block_q_kernel_source);
00352                   viennacl::ocl::program & block_q_prog = viennacl::ocl::current_context().add_program(block_q_kernel_source.c_str(), "block_q_kernel_source");
00353                   block_q_prog.add_kernel("block_q_mult");
00354                   //
00355               }*/
00356               
00357               viennacl::ocl::handle<cl_mem> g_is_update_vcl = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
00358                                                                                   static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
00359                                                                                                             &(g_is_update[0]));
00360               viennacl::ocl::kernel& block_q_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai<ScalarType, 1>::program_name(), "block_q_mult");
00361               block_q_kernel.local_work_size(0, local_c_n);
00362               block_q_kernel.global_work_size(0, 256);
00363               viennacl::ocl::enqueue(block_q_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(), 
00364                                                     g_bv_vcl.handle(),   
00365                                                     g_bv_vcl.handle1(), g_A_I_J_vcl.handle1(), g_A_I_J_u_vcl.handle1(), g_is_update_vcl,
00366                                                     viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(ScalarType)*(local_r_n*local_c_n))),
00367                                                     static_cast<cl_uint>(g_I.size())));
00368           }
00369           
00376           void assemble_qr_row_inds(const std::vector<std::vector<unsigned int> >& g_I, const std::vector<std::vector<unsigned int> > g_J, 
00377                                     const std::vector<std::vector<unsigned int> >& g_I_u, 
00378                                     std::vector<std::vector<unsigned int> >& g_I_q){
00379 #ifdef _OPENMP
00380               #pragma omp parallel for
00381 #endif              
00382               for(std::size_t i = 0; i < g_I.size(); ++i){
00383                   for(std::size_t j = g_J[i].size(); j < g_I[i].size(); ++j){
00384                       g_I_q[i].push_back(g_I[i][j]);
00385                   }
00386                   
00387                   for(std::size_t j = 0; j < g_I_u[i].size(); ++j){
00388                       g_I_q[i].push_back(g_I_u[i][j]);
00389                   }
00390               }
00391           }
00392 
00407           template<typename ScalarType>
00408           void assemble_qr_block(
00409                                 const std::vector<std::vector<unsigned int> >& g_J, 
00410                                 const std::vector<std::vector<unsigned int> >& g_I, 
00411                                 const std::vector<std::vector<unsigned int> >& g_J_u,
00412                                 const std::vector<std::vector<unsigned int> >& g_I_u, 
00413                                 std::vector<std::vector<unsigned int> >& g_I_q,
00414                                 block_matrix& g_A_I_J_u_vcl, 
00415                                 viennacl::ocl::handle<cl_mem>& matrix_dimensions, 
00416                                 block_matrix& g_A_I_u_J_u_vcl, 
00417                                 std::vector<cl_uint>& g_is_update,
00418                                 const bool is_empty_block,
00419                                 const unsigned int cur_iter){
00420               //std::vector<std::vector<unsigned int> > g_I_q(g_I.size());
00421               assemble_qr_row_inds(g_I, g_J, g_I_u, g_I_q);
00422               unsigned int sz_blocks;
00423               std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
00424               std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
00425               compute_blocks_size(g_I_q, g_J_u, sz_blocks, blocks_ind, matrix_dims);
00426               std::vector<ScalarType> con_A_I_J_q(sz_blocks, static_cast<ScalarType>(0));
00427               
00428               /*if(cur_iter == 1){
00429                   std::string qr_block_asm_file_name = "kernels/spai/qr_block_assembly_g.cl";
00430                   std::string qr_block_asm_source;
00431                   read_kernel_from_file(qr_block_asm_file_name, qr_block_asm_source);
00432                   viennacl::ocl::program & qr_block_asm_prog = viennacl::ocl::current_context().add_program(qr_block_asm_source.c_str(), 
00433                                                                                                         "qr_block_assembly_kernel_source");
00434                   
00435                   qr_block_asm_prog.add_kernel("block_qr_assembly");
00436                   
00437                   
00438                   //extra kernel in case of empty block A_I_u_J_u
00439                   std::string qr_block_asm_file_name_1 = "kernels/spai/qr_block_assembly_1_g.cl";
00440                   std::string qr_block_asm_source_1;
00441                   read_kernel_from_file(qr_block_asm_file_name_1, qr_block_asm_source_1);
00442                   viennacl::ocl::program & qr_block_asm_prog_1 = viennacl::ocl::current_context().add_program(qr_block_asm_source_1.c_str(), 
00443                                                                                                             "qr_block_assembly_kernel_source_1");
00444                   
00445                   qr_block_asm_prog_1.add_kernel("block_qr_assembly_1");
00446                   
00447               }*/
00448               block_matrix g_A_I_J_q_vcl;
00449               //need to allocate memory for QR block
00450               g_A_I_J_q_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
00451                                                                                       static_cast<unsigned int>(sizeof(ScalarType)*sz_blocks),
00452                                                                                       &(con_A_I_J_q[0]));
00453               g_A_I_J_q_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
00454                                                                                 static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size())), 
00455                                                                                 &(matrix_dims[0]));
00456               g_A_I_J_q_vcl.handle2() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
00457                                                                   static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size() + 1)),
00458                                                                                       &(blocks_ind[0]));
00459               viennacl::ocl::handle<cl_mem> g_is_update_vcl = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
00460                                                                                                             static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
00461                                                                                                             &(g_is_update[0]));
00462               
00463               if(!is_empty_block){
00464                   viennacl::ocl::kernel& qr_assembly_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai<ScalarType, 1>::program_name(), "block_qr_assembly");
00465                   qr_assembly_kernel.local_work_size(0, 1);
00466                   qr_assembly_kernel.global_work_size(0, 256);
00467                   viennacl::ocl::enqueue(qr_assembly_kernel(matrix_dimensions, 
00468                                                             g_A_I_J_u_vcl.handle(), 
00469                                                             g_A_I_J_u_vcl.handle2(), 
00470                                                             g_A_I_J_u_vcl.handle1(), 
00471                                                             g_A_I_u_J_u_vcl.handle(),
00472                                                             g_A_I_u_J_u_vcl.handle2(), 
00473                                                             g_A_I_u_J_u_vcl.handle1(), 
00474                                                             g_A_I_J_q_vcl.handle(), 
00475                                                             g_A_I_J_q_vcl.handle2(), 
00476                                                             g_A_I_J_q_vcl.handle1(),
00477                                                             g_is_update_vcl,
00478                                                             static_cast<unsigned int>(g_I.size())));
00479               }else{
00480                   viennacl::ocl::kernel& qr_assembly_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai<ScalarType, 1>::program_name(), "block_qr_assembly_1");
00481                   qr_assembly_kernel.local_work_size(0, 1);
00482                   qr_assembly_kernel.global_work_size(0, 256);
00483                   viennacl::ocl::enqueue(qr_assembly_kernel(matrix_dimensions, g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(), 
00484                                                             g_A_I_J_u_vcl.handle1(),
00485                                                             g_A_I_J_q_vcl.handle(), 
00486                                                             g_A_I_J_q_vcl.handle2(), g_A_I_J_q_vcl.handle1(),
00487                                                             g_is_update_vcl,
00488                                                             static_cast<unsigned int>(g_I.size())));
00489               }
00490               g_A_I_u_J_u_vcl.handle() = g_A_I_J_q_vcl.handle();
00491               g_A_I_u_J_u_vcl.handle1() = g_A_I_J_q_vcl.handle1();
00492               g_A_I_u_J_u_vcl.handle2() = g_A_I_J_q_vcl.handle2();
00493           }
00494 
00506           template<typename ScalarType>
00507           void assemble_r(std::vector<std::vector<unsigned int> >& g_I, std::vector<std::vector<unsigned int> >& g_J,
00508                           block_matrix& g_A_I_J_vcl, 
00509                           block_matrix& g_A_I_J_u_vcl,
00510                           block_matrix& g_A_I_u_J_u_vcl, 
00511                           block_vector& g_bv_vcl, 
00512                           block_vector& g_bv_vcl_u,
00513                           std::vector<cl_uint>& g_is_update,
00514                           const unsigned int cur_iter){
00515               std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
00516               std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
00517               std::vector<cl_uint> start_bv_r_inds(g_I.size() + 1, 0);
00518               unsigned int sz_blocks, bv_size;
00519               compute_blocks_size(g_I, g_J, sz_blocks, blocks_ind, matrix_dims);
00520               get_size(g_J, bv_size);
00521               init_start_inds(g_J, start_bv_r_inds);
00522               std::vector<ScalarType> con_A_I_J_r(sz_blocks, static_cast<ScalarType>(0));
00523               std::vector<ScalarType> b_v_r(bv_size, static_cast<ScalarType>(0));
00524               /*if(cur_iter == 1){
00525                   std::string r_block_asm_file_name = "kernels/spai/r_block_assembly_g.cl";
00526                   std::string r_block_asm_source;
00527                   read_kernel_from_file(r_block_asm_file_name, r_block_asm_source);
00528                   viennacl::ocl::program & r_block_asm_prog = viennacl::ocl::current_context().add_program(r_block_asm_source.c_str(), 
00529                                                                                                             "r_block_assembly_kernel_source");
00530                   r_block_asm_prog.add_kernel("block_r_assembly");
00531                   
00532                   std::string bv_block_asm_file_name = "kernels/spai/bv_block_assembly_g.cl";
00533                   std::string bv_block_asm_source;
00534                   read_kernel_from_file(bv_block_asm_file_name, bv_block_asm_source);
00535                   viennacl::ocl::program & bv_block_asm_prog = viennacl::ocl::current_context().add_program(bv_block_asm_source.c_str(), 
00536                                                                                                           "bv_block_assembly_kernel_source");
00537                   bv_block_asm_prog.add_kernel("block_bv_assembly");
00538               }*/
00539               block_matrix g_A_I_J_r_vcl;
00540               block_vector g_bv_r_vcl;
00541               g_A_I_J_r_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
00542                                                                                       static_cast<unsigned int>(sizeof(ScalarType)*sz_blocks),
00543                                                                                       &(con_A_I_J_r[0]));
00544               g_A_I_J_r_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
00545                                                           static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size())), 
00546                                                                                       &(matrix_dims[0]));
00547               g_A_I_J_r_vcl.handle2() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
00548                                                           static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size() + 1)),
00549                                                                                       &(blocks_ind[0]));
00550               g_bv_r_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,  
00551                                                                         static_cast<unsigned int>(sizeof(ScalarType)*bv_size), 
00552                                                                         &(b_v_r[0]));
00553               g_bv_r_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
00554                                                                                 static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)), 
00555                                                                                 &(start_bv_r_inds[0]));
00556               viennacl::ocl::handle<cl_mem> g_is_update_vcl = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
00557                                                                                           static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
00558                                                                                                             &(g_is_update[0]));
00559               viennacl::ocl::kernel& r_assembly_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai<ScalarType, 1>::program_name(), "block_r_assembly");
00560               r_assembly_kernel.local_work_size(0, 1);
00561               r_assembly_kernel.global_work_size(0, 256);
00562               
00563               viennacl::ocl::enqueue(r_assembly_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_A_I_J_vcl.handle1(), 
00564                                                       g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(), g_A_I_J_u_vcl.handle1(), 
00565                                                       g_A_I_u_J_u_vcl.handle(), g_A_I_u_J_u_vcl.handle2(), g_A_I_u_J_u_vcl.handle1(), 
00566                                                       g_A_I_J_r_vcl.handle(), g_A_I_J_r_vcl.handle2(), g_A_I_J_r_vcl.handle1(),
00567                                                       g_is_update_vcl, static_cast<cl_uint>(g_I.size())));
00568               
00569               viennacl::ocl::kernel & bv_assembly_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai<ScalarType, 1>::program_name(), "block_bv_assembly");
00570               bv_assembly_kernel.local_work_size(0, 1);
00571               bv_assembly_kernel.global_work_size(0, 256);
00572               viennacl::ocl::enqueue(bv_assembly_kernel(g_bv_vcl.handle(), g_bv_vcl.handle1(), g_A_I_J_vcl.handle1(), g_bv_vcl_u.handle(),
00573                                                         g_bv_vcl_u.handle1(), g_A_I_J_u_vcl.handle1(),
00574                                                         g_bv_r_vcl.handle(), g_bv_r_vcl.handle1(), g_A_I_J_r_vcl.handle1(), g_is_update_vcl,
00575                                                         static_cast<cl_uint>(g_I.size())));
00576               g_bv_vcl.handle() = g_bv_r_vcl.handle();
00577               g_bv_vcl.handle1() = g_bv_r_vcl.handle1();
00578               
00579               g_A_I_J_vcl.handle() = g_A_I_J_r_vcl.handle();
00580               g_A_I_J_vcl.handle2() = g_A_I_J_r_vcl.handle2();
00581               g_A_I_J_vcl.handle1() = g_A_I_J_r_vcl.handle1();
00582           }
00583           
00596           template<typename ScalarType, unsigned int MAT_ALIGNMENT, typename SparseVectorType>
00597           void block_update(const viennacl::compressed_matrix<ScalarType, MAT_ALIGNMENT>& A, const std::vector<SparseVectorType>& A_v_c,
00598                             std::vector<cl_uint>& g_is_update,
00599                             std::vector<SparseVectorType>& g_res,
00600                             std::vector<std::vector<unsigned int> >& g_J, 
00601                             std::vector<std::vector<unsigned int> >& g_I,
00602                             block_matrix& g_A_I_J_vcl, 
00603                             block_vector& g_bv_vcl,
00604                             spai_tag const & tag,
00605                             const unsigned int cur_iter){
00606               //updated index set for columns
00607               std::vector<std::vector<unsigned int> > g_J_u(g_J.size());
00608               //updated index set for rows
00609               std::vector<std::vector<unsigned int> > g_I_u(g_J.size());
00610               //mixed index set of old and updated indices for rows 
00611               std::vector<std::vector<unsigned int> > g_I_q(g_J.size());
00612               //GPU memory for A_I_\hatJ
00613               block_matrix g_A_I_J_u_vcl;
00614               //GPU memory for A_\hatI_\hatJ
00615               block_matrix g_A_I_u_J_u_vcl;
00616               bool is_empty_block;
00617               //GPU memory for new b_v
00618               block_vector g_bv_u_vcl;
00619 #ifdef _OPENMP
00620               #pragma omp parallel for
00621 #endif              
00622               for(std::size_t i = 0; i < g_J.size(); ++i){
00623                   if(g_is_update[i]){
00624                       if(buildAugmentedIndexSet<SparseVectorType, ScalarType>(A_v_c, g_res[i], g_J[i], g_J_u[i], tag)){
00625                           buildNewRowSet(A_v_c, g_I[i], g_J_u[i], g_I_u[i]);
00626                       }
00627                   }
00628               }
00629               //assemble new A_I_J_u blocks on GPU and multiply them with Q'
00630               block_assembly(A, g_J_u, g_I, g_A_I_J_u_vcl, g_is_update, is_empty_block, cur_iter);
00631               //I have matrix A_I_J_u ready..
00632               block_q_multiplication<ScalarType>(g_J_u, g_I, g_A_I_J_vcl, g_bv_vcl, g_A_I_J_u_vcl, g_is_update, cur_iter);
00633               //assemble A_\hatI_\hatJ
00634               block_assembly(A, g_J_u, g_I_u, g_A_I_u_J_u_vcl, g_is_update, is_empty_block, cur_iter);
00635               assemble_qr_block<ScalarType>(g_J, g_I, g_J_u, g_I_u, g_I_q, g_A_I_J_u_vcl, g_A_I_J_vcl.handle1(),
00636                                             g_A_I_u_J_u_vcl, g_is_update, is_empty_block, cur_iter);
00637               
00638               block_qr<ScalarType>(g_I_q, g_J_u, g_A_I_u_J_u_vcl, g_bv_u_vcl, g_is_update, cur_iter);
00639               //concatanation of new and old indices
00640 #ifdef _OPENMP
00641               #pragma omp parallel for
00642 #endif              
00643               for(std::size_t i = 0; i < g_J.size(); ++i){
00644                   g_J[i].insert(g_J[i].end(), g_J_u[i].begin(), g_J_u[i].end());
00645                   g_I[i].insert(g_I[i].end(), g_I_u[i].begin(), g_I_u[i].end());
00646               }
00647               assemble_r<ScalarType>(g_I, g_J, g_A_I_J_vcl, g_A_I_J_u_vcl, g_A_I_u_J_u_vcl,  g_bv_vcl,  g_bv_u_vcl, g_is_update, cur_iter);
00648           }
00649         
00650         }        
00651       }        
00652     }
00653 }
00654 #endif