/data/development/ViennaCL/dev/viennacl/linalg/detail/spai/spai.hpp

#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_HPP
#define VIENNACL_LINALG_DETAIL_SPAI_SPAI_HPP

/* =========================================================================
   Copyright (c) 2010-2011, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the PDF manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

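/** @file spai.hpp
    @brief Implementation of the sparse approximate inverse (SPAI) preconditioner setup routines.
*/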
#include <utility>
#include <iostream>
#include <fstream>
#include <string>
#include <algorithm>
#include <vector>
#include <math.h>
#include <map>

//local includes
#include "viennacl/linalg/detail/spai/spai_tag.hpp"
#include "viennacl/linalg/qr.hpp"
#include "viennacl/linalg/detail/spai/spai-dynamic.hpp"
#include "viennacl/linalg/detail/spai/spai-static.hpp"
#include "viennacl/linalg/detail/spai/sparse_vector.hpp"
#include "viennacl/linalg/detail/spai/block_matrix.hpp"
#include "viennacl/linalg/detail/spai/block_vector.hpp"

//boost includes
#include "boost/numeric/ublas/vector.hpp"
#include "boost/numeric/ublas/matrix.hpp"
#include "boost/numeric/ublas/matrix_proxy.hpp"
#include "boost/numeric/ublas/vector_proxy.hpp"
#include "boost/numeric/ublas/storage.hpp"
#include "boost/numeric/ublas/io.hpp"
#include "boost/numeric/ublas/lu.hpp"
#include "boost/numeric/ublas/triangular.hpp"
#include "boost/numeric/ublas/matrix_expression.hpp"

// ViennaCL includes
#include "viennacl/linalg/prod.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/linalg/compressed_matrix_operations.hpp"
#include "viennacl/linalg/matrix_operations.hpp"
#include "viennacl/scalar.hpp"
#include "viennacl/linalg/inner_prod.hpp"
#include "viennacl/linalg/ilu.hpp"
#include "viennacl/ocl/backend.hpp"
#include "viennacl/linalg/kernels/spai_source.h"
#include "viennacl/linalg/kernels/spai_kernels.h"



#define VIENNACL_SPAI_K_b 20

namespace viennacl
{
  namespace linalg
  {
    namespace detail
    {
      namespace spai
      {
        
        //debug helpers for printing a sparse vector and a dense matrix
        template<typename SparseVectorType>
        void print_sparse_vector(const SparseVectorType& v){
            for(typename SparseVectorType::const_iterator vec_it = v.begin(); vec_it!= v.end(); ++vec_it){
                std::cout<<"[ "<<vec_it->first<<" ]:"<<vec_it->second<<std::endl;
            }
        }
        template<typename DenseMatrixType>
        void print_matrix(DenseMatrixType & m){
            for(std::size_t i = 0; i < m.size2(); ++i){
                for(std::size_t j = 0; j < m.size1(); ++j){
                    std::cout<<m(j, i)<<" ";
                }
                std::cout<<std::endl;
            }
        }
        
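        /** @brief Computes res_v += b * v for sparse vectors
        * @param v input sparse vector
        * @param b scaling factor
        * @param res_v the sparse vector the scaled entries are added to
        */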
        template<typename SparseVectorType, typename ScalarType>
        void add_sparse_vectors(const SparseVectorType& v, const ScalarType b,  SparseVectorType& res_v){
            for(typename SparseVectorType::const_iterator v_it = v.begin(); v_it != v.end(); ++v_it){
                res_v[v_it->first] += b*v_it->second;
            }
        }
        //sparse matrix-vector product
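        /** @brief Computes the residual res = A*v - e_ind, where e_ind is the ind-th unit vector
        * @param A_v_c column representation of the sparse matrix A
        * @param v sparse vector (current column of the approximate inverse)
        * @param ind index of the unit vector entry that is subtracted
        * @param res resulting sparse residual vector
        */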
        template<typename SparseVectorType, typename ScalarType>
        void compute_spai_residual(const std::vector<SparseVectorType>& A_v_c, const SparseVectorType& v,
                                   const unsigned int ind, SparseVectorType& res){
            for(typename SparseVectorType::const_iterator v_it = v.begin(); v_it != v.end(); ++v_it){
                add_sparse_vectors(A_v_c[v_it->first], v_it->second, res);
            }
            res[ind] -= static_cast<ScalarType>(1);
        }
        
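        /** @brief Composes the column index set J and the corresponding row index set I for one column of the preconditioner
        * @param A_v_c column representation of the system matrix
        * @param v current sparse column of the approximate inverse
        * @param J resulting column index set
        * @param I resulting row index set
        */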
        template<typename SparseVectorType>
        void build_index_set(const std::vector<SparseVectorType>& A_v_c, const SparseVectorType& v, std::vector<unsigned int>& J, 
                             std::vector<unsigned int>& I){
            buildColumnIndexSet(v, J);
            projectRows(A_v_c, J, I);
        }
        
        /************************************************** CPU BLOCK SET UP ***************************************/
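        /** @brief CPU-based block set-up: builds the index sets, extracts the dense blocks A(I, J) and QR-factorizes them
        * @param A system matrix
        * @param A_v_c column representation of A
        * @param M_v initial sparse columns of the preconditioner
        * @param g_I container of row index sets
        * @param g_J container of column index sets
        * @param g_A_I_J container of the dense blocks A(I, J); holds the QR factors after the call
        * @param g_b_v container of auxiliary vectors filled by single_qr()
        */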
        template<typename SparseMatrixType, typename DenseMatrixType, typename SparseVectorType, typename VectorType>
        void block_set_up(const SparseMatrixType& A,
                          const std::vector<SparseVectorType>& A_v_c,
                          const std::vector<SparseVectorType>& M_v, 
                          std::vector<std::vector<unsigned int> >& g_I, 
                          std::vector<std::vector<unsigned int> >& g_J,
                          std::vector<DenseMatrixType>& g_A_I_J,
                          std::vector<VectorType>& g_b_v){
#ifdef _OPENMP
            #pragma omp parallel for
#endif
            for(std::size_t i = 0; i < M_v.size(); ++i){
                build_index_set(A_v_c, M_v[i], g_J[i], g_I[i]);
                initProjectSubMatrix(A, g_J[i], g_I[i], g_A_I_J[i]);
                //print_matrix(g_A_I_J[i]);
                single_qr(g_A_I_J[i], g_b_v[i]);
                //print_matrix(g_A_I_J[i]);
            }
        }
        
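        /** @brief Builds the index sets I and J for all preconditioner columns (parallelized with OpenMP if available)
        * @param A_v_c column representation of the system matrix
        * @param M_v sparse columns of the preconditioner
        * @param g_J container of column index sets
        * @param g_I container of row index sets
        */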
        template<typename SparseVectorType>
        void index_set_up(const std::vector<SparseVectorType>& A_v_c,
                          const std::vector<SparseVectorType>& M_v, 
                          std::vector<std::vector<unsigned int> >& g_J, 
                          std::vector<std::vector<unsigned int> >& g_I)
        {
#ifdef _OPENMP
            #pragma omp parallel for
#endif
            for(std::size_t i = 0; i < M_v.size(); ++i){
                build_index_set(A_v_c, M_v[i], g_J[i], g_I[i]);
            }
        }
        
        /************************************************** GPU BLOCK SET UP ***************************************/
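        /** @brief GPU-based block set-up: builds the index sets on the host, then assembles and QR-factorizes all blocks on the device
        * @param A system matrix (device)
        * @param A_v_c column representation of A (host)
        * @param M_v sparse columns of the preconditioner
        * @param g_is_update flags indicating which columns are still being processed
        * @param g_I container of row index sets
        * @param g_J container of column index sets
        * @param g_A_I_J packed storage of all blocks A(I, J) on the device
        * @param g_bv packed storage of the QR auxiliary vectors on the device
        * @param cur_iter current outer iteration
        */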
        template<typename ScalarType, unsigned int MAT_ALIGNMENT, typename SparseVectorType>
        void block_set_up(const viennacl::compressed_matrix<ScalarType, MAT_ALIGNMENT>& A,
                          const std::vector<SparseVectorType>& A_v_c, 
                          const std::vector<SparseVectorType>& M_v,
                          std::vector<cl_uint> g_is_update,
                          std::vector<std::vector<unsigned int> >& g_I, 
                          std::vector<std::vector<unsigned int> >& g_J, 
                          block_matrix & g_A_I_J, 
                          block_vector & g_bv,
                          const unsigned int cur_iter)
        {
            bool is_empty_block;
            //build index set
            index_set_up(A_v_c, M_v, g_J, g_I);
            block_assembly(A, g_J, g_I, g_A_I_J, g_is_update, is_empty_block, cur_iter);
            block_qr<ScalarType>(g_I, g_J, g_A_I_J, g_bv, g_is_update, cur_iter);
            
        }
        
        
        /***************************************************************************************************/
        /******************************** SOLVING LS PROBLEMS ON GPU ***************************************/
        /***************************************************************************************************/
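        /** @brief Scatters one contiguous chunk of the packed solution vector back into a sparse column
        * @param m_in packed solution values for all blocks
        * @param start_m_ind start index of this column's chunk inside m_in
        * @param J column index set of this block
        * @param m sparse column that receives the values
        */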
        template<typename ScalarType, typename SparseVectorType>
        void custom_fan_out(const std::vector<ScalarType> & m_in,
                            unsigned int start_m_ind,
                            const std::vector<unsigned int> & J,
                            SparseVectorType & m)
        {
            unsigned int  cnt = 0;
            for (std::size_t i = 0; i < J.size(); ++i) {
                m[J[i]] = m_in[start_m_ind + cnt++];
            }
        }
        
        

        //GPU-based least squares problems
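        /** @brief Solves the least squares problems for all blocks on the GPU and updates the preconditioner columns
        * @param A_v_c column representation of the system matrix
        * @param M_v sparse columns of the preconditioner (updated in place)
        * @param g_I container of row index sets
        * @param g_J container of column index sets
        * @param g_A_I_J_vcl packed QR-factorized blocks on the device
        * @param g_bv_vcl packed QR auxiliary vectors on the device
        * @param g_res per-column residuals, recomputed after the solve
        * @param g_is_update flags indicating which columns still need updating
        * @param tag SPAI configuration tag
        * @param cur_iter current outer iteration
        */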
        template<typename SparseVectorType, typename ScalarType>
        void least_square_solve(std::vector<SparseVectorType> & A_v_c, 
                                std::vector<SparseVectorType> & M_v,
                                std::vector<std::vector<unsigned int> >& g_I, 
                                std::vector<std::vector<unsigned int> > & g_J,
                                block_matrix & g_A_I_J_vcl,
                                block_vector & g_bv_vcl,
                                std::vector<SparseVectorType> & g_res,
                                std::vector<cl_uint> & g_is_update,
                                const spai_tag & tag,
                                const unsigned int cur_iter){
            unsigned int y_sz, m_sz;
            std::vector<cl_uint> y_inds(M_v.size() + 1, static_cast<cl_uint>(0));
            std::vector<cl_uint> m_inds(M_v.size() + 1, static_cast<cl_uint>(0));
            get_size(g_I, y_sz);
            init_start_inds(g_I, y_inds);
            init_start_inds(g_J, m_inds);
            //create y_v
            std::vector<ScalarType> y_v(y_sz, static_cast<ScalarType>(0));
            for(std::size_t i = 0; i < M_v.size(); ++i){
                for(std::size_t j = 0; j < g_I[i].size(); ++j){
                    if(g_I[i][j] == i)
                        y_v[y_inds[i] + j] = static_cast<ScalarType>(1.0);
                }
            }
            //compute m_v
            get_size(g_J, m_sz);
            std::vector<ScalarType> m_v(m_sz, static_cast<ScalarType>(0));
            
            //acquire kernel
            /*if(cur_iter == 0){
                std::string ls_kernel_file_name = "kernels/spai/ls_g.cl";
                std::string ls_kernel_source;
                read_kernel_from_file(ls_kernel_file_name, ls_kernel_source);
                //compilation of a kernel
                viennacl::ocl::program & ls_prog = viennacl::ocl::current_context().add_program(ls_kernel_source.c_str(), "ls_kernel_source");
                //least squares kernel
                ls_prog.add_kernel("block_least_squares");
            }*/
            block_vector y_v_vcl;
            block_vector m_v_vcl;
            //preparing memory for the least squares problems on the GPU
            y_v_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
                                                                              static_cast<unsigned int>(sizeof(ScalarType)*y_v.size()), 
                                                                              &(y_v[0]));
            m_v_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
                                                                              static_cast<unsigned int>(sizeof(ScalarType)*m_v.size()),
                                                                              &(m_v[0]));
            y_v_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
                                                                               static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)), 
                                                                               &(y_inds[0]));
            viennacl::ocl::handle<cl_mem> g_is_update_vcl = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
                                                                                    static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
                                                                                        &(g_is_update[0]));
            viennacl::ocl::kernel& ls_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai<ScalarType, 1>::program_name(), "block_least_squares");
            ls_kernel.local_work_size(0, 1);
            ls_kernel.global_work_size(0, 256);
            viennacl::ocl::enqueue(ls_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_bv_vcl.handle(), g_bv_vcl.handle1(), m_v_vcl.handle(), 
                                             y_v_vcl.handle(), y_v_vcl.handle1(), 
                                             g_A_I_J_vcl.handle1(), g_is_update_vcl,
                                             //viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(ScalarType)*(local_r_n*local_c_n))),
                                             static_cast<unsigned int>(M_v.size())));
            //copy vector m_v back from GPU to CPU
            cl_int vcl_err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(),
                                                 m_v_vcl.handle(), CL_TRUE, 0, 
                                                 sizeof(ScalarType)*(m_v.size()),
                                                 &(m_v[0]), 0, NULL, NULL);
            VIENNACL_ERR_CHECK(vcl_err);
            //fan out the solution vector in parallel
            //#pragma omp parallel for
            for(std::size_t i = 0; i < M_v.size(); ++i){
                if(g_is_update[i]){
                    //fan the solution chunk out into the sparse column
                    custom_fan_out(m_v, m_inds[i], g_J[i], M_v[i]);
                    g_res[i].clear();
                    compute_spai_residual<SparseVectorType, ScalarType>(A_v_c,  M_v[i], static_cast<unsigned int>(i), g_res[i]);
                    ScalarType res_norm = 0;
                    //compute the norm of the residual - just to make sure that this implementation works correctly
                    sparse_norm_2(g_res[i], res_norm);
                    //std::cout<<"Residual norm of column #: "<<i<<std::endl;
                    //std::cout<<res_norm<<std::endl;
                    //std::cout<<"************************"<<std::endl;
                    g_is_update[i] = (res_norm > tag.getResidualNormThreshold()) && (!tag.getIsStatic()) ? (1) : (0);
                    
                }
            }
        }
        
        //CPU-based least squares problems
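        /** @brief Solves the least squares problems for all blocks on the CPU and updates the preconditioner columns
        * @param A_v_c column representation of the system matrix
        * @param g_R QR-factorized blocks as produced by single_qr()
        * @param g_b_v auxiliary vectors from the QR factorization
        * @param g_I container of row index sets
        * @param g_J container of column index sets
        * @param g_res per-column residuals, recomputed after the solve
        * @param g_is_update flags indicating which columns still need updating
        * @param M_v sparse columns of the preconditioner (updated in place)
        * @param tag SPAI configuration tag
        */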
        template<typename SparseVectorType, typename DenseMatrixType, typename VectorType>
        void least_square_solve(const std::vector<SparseVectorType>& A_v_c,
                                std::vector<DenseMatrixType>& g_R, 
                                std::vector<VectorType>& g_b_v,
                                std::vector<std::vector<unsigned int> >& g_I, 
                                std::vector<std::vector<unsigned int> >& g_J,
                                std::vector<SparseVectorType>& g_res, 
                                std::vector<bool>& g_is_update, 
                                std::vector<SparseVectorType>& M_v,
                                const spai_tag& tag){
            typedef typename DenseMatrixType::value_type ScalarType;
            //VectorType m_new, y;
#ifdef _OPENMP
            #pragma omp parallel for
#endif
            for(std::size_t i = 0; i < M_v.size(); ++i){
                if(g_is_update[i]){
                    VectorType y = boost::numeric::ublas::zero_vector<ScalarType>(g_I[i].size());
                    //std::cout<<y<<std::endl;
                    projectI<VectorType, ScalarType>(g_I[i], y, static_cast<unsigned int>(tag.getBegInd() + i));
                    apply_q_trans_vec(g_R[i], g_b_v[i], y);
                    VectorType m_new =  boost::numeric::ublas::zero_vector<ScalarType>(g_R[i].size2());
                    backwardSolve(g_R[i], y, m_new);
                    fanOutVector(m_new, g_J[i], M_v[i]);
                    g_res[i].clear();
                    compute_spai_residual<SparseVectorType, ScalarType>(A_v_c,  M_v[i], static_cast<unsigned int>(tag.getBegInd() + i), g_res[i]); 
                    ScalarType res_norm = 0;
                    sparse_norm_2(g_res[i], res_norm);
//                    std::cout<<"Residual norm of column #: "<<i<<std::endl;
//                    std::cout<<res_norm<<std::endl;
//                    std::cout<<"************************"<<std::endl;
                    g_is_update[i] = (res_norm > tag.getResidualNormThreshold()) && (!tag.getIsStatic());
                }
            }
        }
        
        //************************************ UPDATE CHECK ***************************************************//
        //returns true as long as at least one column is still flagged for updating
        template<typename VectorType>
        bool is_all_update(VectorType& parallel_is_update){
            
            for(unsigned int i = 0; i < parallel_is_update.size(); ++i){
                if(parallel_is_update[i])
                    return true;
            }
            return false;
        }
        
        //********************************** MATRIX VECTORIZATION ***********************************************//
        //Matrix vectorization, column-based approach
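        /** @brief Stores a sparse matrix as a vector of sparse columns: M_v[j][i] = M_in(i, j)
        * @param M_in input sparse matrix
        * @param M_v output vector of sparse columns
        */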
        template<typename SparseMatrixType, typename SparseVectorType>
        void vectorize_column_matrix(const SparseMatrixType& M_in, std::vector<SparseVectorType>& M_v){
            for(typename SparseMatrixType::const_iterator1 row_it = M_in.begin1(); row_it!= M_in.end1(); ++row_it){
                for(typename SparseMatrixType::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it){
                    M_v[static_cast<unsigned int>(col_it.index2())][static_cast<unsigned int>(col_it.index1())] = *col_it; 
                }
            }
        }
        
        //Matrix vectorization, row-based approach: M_v[i][j] = M_in(i, j)
        template<typename SparseMatrixType, typename SparseVectorType>
        void vectorize_row_matrix(const SparseMatrixType& M_in, std::vector<SparseVectorType>& M_v){
            for(typename SparseMatrixType::const_iterator1 row_it = M_in.begin1(); row_it!= M_in.end1(); ++row_it){
                for(typename SparseMatrixType::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it){
                    M_v[static_cast<unsigned int>(col_it.index1())][static_cast<unsigned int>(col_it.index2())] = *col_it; 
                }
            }
        }
        
        //************************************* BLOCK ASSEMBLY CODE *********************************************//
        
        
        
        //writes the index sets contiguously into a flat array for transfer to the GPU
        inline void write_set_to_array(const std::vector<std::vector<unsigned int> >& ind_set, std::vector<cl_uint>& a){
            unsigned int cnt = 0;
            for(size_t i = 0; i < ind_set.size(); ++i){
                for(size_t j = 0; j < ind_set[i].size(); ++j){
                    a[cnt++] = static_cast<cl_uint>(ind_set[i][j]); 
                }
            }
        }
        
        
        
        //assembling blocks on the GPU
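        /** @brief Assembles all dense blocks A(I, J) in packed form on the GPU
        * @param A system matrix (device)
        * @param g_J container of column index sets
        * @param g_I container of row index sets
        * @param g_A_I_J_vcl packed block storage on the device (output)
        * @param g_is_update flags indicating which columns are still being processed
        * @param is_empty_block set to true if there are no blocks to assemble
        * @param cur_iter current outer iteration
        */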
        template<typename ScalarType, unsigned int MAT_ALIGNMENT>
        void block_assembly(const viennacl::compressed_matrix<ScalarType, MAT_ALIGNMENT>& A, const std::vector<std::vector<unsigned int> >& g_J, 
                            const std::vector<std::vector<unsigned int> >& g_I,
                            block_matrix& g_A_I_J_vcl, 
                            std::vector<cl_uint>& g_is_update,
                            bool& is_empty_block,
                            const unsigned int cur_iter){
            //computing start indices for index sets and start indices for block matrices
            unsigned int sz_I, sz_J, sz_blocks;
            std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
            std::vector<cl_uint> i_ind(g_I.size() + 1, static_cast<cl_uint>(0));
            std::vector<cl_uint> j_ind(g_I.size() + 1, static_cast<cl_uint>(0));
            std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
            //
            init_start_inds(g_J, j_ind);
            init_start_inds(g_I, i_ind);
            //
            get_size(g_J, sz_J);
            get_size(g_I, sz_I);
            std::vector<cl_uint> I_set(sz_I, static_cast<cl_uint>(0));
            //
            std::vector<cl_uint> J_set(sz_J, static_cast<cl_uint>(0));
            // computing size for blocks
            // writing sets to flat arrays
            write_set_to_array(g_I, I_set);
            write_set_to_array(g_J, J_set);
            // proceed only if there is at least one block to assemble
            if(I_set.size() > 0 && J_set.size() > 0){
                compute_blocks_size(g_I, g_J, sz_blocks, blocks_ind, matrix_dims);
                std::vector<ScalarType> con_A_I_J(sz_blocks, static_cast<ScalarType>(0));
                
                /*if(cur_iter == 0){
                    std::string block_asm_file_name = "kernels/spai/block_assembly_g.cl";
                    std::string block_asm_source;
                    read_kernel_from_file(block_asm_file_name, block_asm_source);
                    viennacl::ocl::program & block_asm_prog = viennacl::ocl::current_context().add_program(block_asm_source.c_str(), 
                                                                                                           "block_assembly_kernel_source");
                    
                    block_asm_prog.add_kernel("assemble_blocks");
                }*/
                block_vector set_I_vcl, set_J_vcl;
                //init memory on GPU
                //contiguous g_A_I_J
                g_A_I_J_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
                                                                                      static_cast<unsigned int>(sizeof(ScalarType)*(sz_blocks)), 
                                                                                      &(con_A_I_J[0]));
                //matrix_dimensions
                g_A_I_J_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
                                                                                   static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<cl_uint>(g_I.size())), 
                                                                                   &(matrix_dims[0]));
                //start_block inds
                g_A_I_J_vcl.handle2() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
                                                                                       static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)), 
                                                                                       &(blocks_ind[0]));
                //set_I
                set_I_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
                                                                                    static_cast<unsigned int>(sizeof(cl_uint)*sz_I), 
                                                                                    &(I_set[0]));
                //set_J
                set_J_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
                                                                                    static_cast<unsigned int>(sizeof(cl_uint)*sz_J), 
                                                                                    &(J_set[0]));
                //i_ind
                set_I_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
                                                                                     static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)), 
                                                                                     &(i_ind[0]));
                //j_ind
                set_J_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
                                                                                     static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)), 
                                                                                     &(j_ind[0]));
                
                viennacl::ocl::handle<cl_mem> g_is_update_vcl = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, 
                                                                                                               static_cast<unsigned int>(sizeof(cl_uint)*g_is_update.size()),
                                                                                                               &(g_is_update[0]));
                viennacl::ocl::kernel& assembly_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai<ScalarType, 1>::program_name(), "assemble_blocks");
                assembly_kernel.local_work_size(0, 1);
                assembly_kernel.global_work_size(0, 256);
                viennacl::ocl::enqueue(assembly_kernel(A.handle1(), A.handle2(), A.handle(), 
                                                       set_I_vcl.handle(), set_J_vcl.handle(), set_I_vcl.handle1(), 
                                                       set_J_vcl.handle1(), 
                                                       g_A_I_J_vcl.handle2(), g_A_I_J_vcl.handle1(), g_A_I_J_vcl.handle(),
                                                       g_is_update_vcl,
                                                       static_cast<unsigned int>(g_I.size())));
                is_empty_block = false;
            }else{ 
                is_empty_block = true;
            }
        }
        
        /************************************************************************************************************************/
        
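        /** @brief Writes the computed sparse columns into the preconditioner matrix M
        * @param M_v sparse columns of the approximate inverse
        * @param M output sparse matrix
        * @param is_right if true, entries are inserted column-wise (right preconditioner); otherwise the transposed pattern is filled
        */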
        template<typename SparseMatrixType, typename SparseVectorType>
        void insert_sparse_columns(const std::vector<SparseVectorType>& M_v,
                                   SparseMatrixType& M,
                                   bool is_right){
            if (is_right)
            {
              for(unsigned int i = 0; i < M_v.size(); ++i){
                  for(typename SparseVectorType::const_iterator vec_it = M_v[i].begin(); vec_it!=M_v[i].end(); ++vec_it){
                      M(vec_it->first, i) = vec_it->second;
                  }
              }
            }
            else  //transposed fill of M
            {
              for(unsigned int i = 0; i < M_v.size(); ++i){
                  for(typename SparseVectorType::const_iterator vec_it = M_v[i].begin(); vec_it!=M_v[i].end(); ++vec_it){
                      M(i, vec_it->first) = vec_it->second;
                  }
              }
            }
        }
        
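        /** @brief Computes the transpose of a sparse matrix
        * @param A_in input sparse matrix
        * @param A output matrix; resized and filled with the transpose of A_in
        */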
        template<typename MatrixType>
        void sparse_transpose(const MatrixType& A_in, MatrixType& A){
            typedef typename MatrixType::value_type ScalarType;
            std::vector<std::map<size_t, ScalarType> >   temp_A(A_in.size2());
            A.resize(A_in.size2(), A_in.size1(), false);
            
            for (typename MatrixType::const_iterator1 row_it = A_in.begin1();
                 row_it != A_in.end1();
                 ++row_it)
            {
                for (typename MatrixType::const_iterator2 col_it = row_it.begin();
                     col_it != row_it.end();
                     ++col_it)
                {
                    temp_A[col_it.index2()][col_it.index1()] = *col_it;
                }
            }
            
            for (size_t i=0; i<temp_A.size(); ++i)
            {
                for (typename std::map<size_t, ScalarType>::const_iterator it = temp_A[i].begin();
                     it != temp_A[i].end();
                     ++it)
                    A(i, it->first) = it->second;
            }
        }
        
        
        
        
//        template<typename SparseVectorType>
//        void custom_copy(std::vector<SparseVectorType> & M_v, std::vector<SparseVectorType> & l_M_v, const unsigned int beg_ind){
//            for(int i = 0; i < l_M_v.size(); ++i){
//                l_M_v[i] = M_v[i + beg_ind];
//            }
//        }
        
        //CPU version
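        /** @brief Main CPU routine: computes a sparse approximate inverse M of A, processing the columns in chunks of VIENNACL_SPAI_K_b
        * @param A system matrix
        * @param M preconditioner matrix; its initial sparsity pattern serves as the starting pattern and it receives the result
        * @param tag SPAI configuration tag (residual threshold, iteration limit, static/dynamic pattern, right/left application)
        */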
        template <typename MatrixType>
        void computeSPAI(const MatrixType & A, MatrixType & M, spai_tag & tag){
            typedef typename MatrixType::value_type ScalarType;
            typedef typename boost::numeric::ublas::vector<ScalarType> VectorType;
            typedef typename viennacl::linalg::detail::spai::sparse_vector<ScalarType> SparseVectorType;
            typedef typename boost::numeric::ublas::matrix<ScalarType> DenseMatrixType;
            //sparse matrix transpose...
            unsigned int cur_iter = 0;
            tag.setBegInd(0); tag.setEndInd(VIENNACL_SPAI_K_b);
            bool go_on = true;
            std::vector<SparseVectorType> A_v_c(M.size2());
            std::vector<SparseVectorType> M_v(M.size2());
            vectorize_column_matrix(A, A_v_c);
            vectorize_column_matrix(M, M_v);
            
            
            while(go_on){
                go_on = (tag.getEndInd() < static_cast<long>(M.size2()));
                cur_iter = 0;
                unsigned int l_sz = tag.getEndInd() - tag.getBegInd();
                //std::vector<bool> g_is_update(M.size2(), true);
                std::vector<bool> g_is_update(l_sz, true);
                //init is update
                //init_parallel_is_update(g_is_update);
                //std::vector<SparseVectorType> A_v_c(K);
                //std::vector<SparseVectorType> M_v(K);
                //vectorization of matrices
                //print_matrix(M_v);
                std::vector<SparseVectorType> l_M_v(l_sz);
                //custom_copy(M_v, l_M_v, beg_ind);
                std::copy(M_v.begin() + tag.getBegInd(), M_v.begin() + tag.getEndInd(), l_M_v.begin()); 
                //print_matrix(l_M_v);
                //std::vector<SparseVectorType> l_A_v_c(K);
                //custom_copy(A_v_c, l_A_v_c, beg_ind);
                //std::copy(A_v_c.begin() + beg_ind, A_v_c.begin() + end_ind, l_A_v_c.begin());
                //print_matrix(l_A_v_c);
                //vectorize_row_matrix(A, A_v_r);
                //working blocks
                //std::vector<DenseMatrixType> g_A_I_J(M.size2());
                std::vector<DenseMatrixType> g_A_I_J(l_sz);
                //std::vector<VectorType> g_b_v(M.size2());
                std::vector<VectorType> g_b_v(l_sz);
                //std::vector<SparseVectorType> g_res(M.size2());
                std::vector<SparseVectorType> g_res(l_sz);
                //std::vector<std::vector<unsigned int> > g_I(M.size2());
                std::vector<std::vector<unsigned int> > g_I(l_sz);
                //std::vector<std::vector<unsigned int> > g_J(M.size2());
                std::vector<std::vector<unsigned int> > g_J(l_sz);
                while((cur_iter < tag.getIterationLimit()) && is_all_update(g_is_update)){
                    // SET UP THE BLOCKS..
                    // PHASE ONE
                    if(cur_iter == 0) block_set_up(A, A_v_c, l_M_v,  g_I, g_J, g_A_I_J, g_b_v);
                    else block_update(A, A_v_c, g_res, g_is_update, g_I, g_J, g_b_v, g_A_I_J, tag);
                    
                    //PHASE TWO, LEAST SQUARES SOLUTION
                    least_square_solve(A_v_c, g_A_I_J, g_b_v, g_I, g_J, g_res, g_is_update, l_M_v, tag);
                    
                    if(tag.getIsStatic()) break;
                    cur_iter++;
                    
                    
                }
                std::copy(l_M_v.begin(), l_M_v.end(), M_v.begin() + tag.getBegInd());
                tag.setBegInd(tag.getEndInd());//beg_ind = end_ind;
                tag.setEndInd(std::min(static_cast<long>(tag.getBegInd() + VIENNACL_SPAI_K_b), static_cast<long>(M.size2())));
                //std::copy(l_M_v.begin(), l_M_v.end(), M_v.begin() + tag.getBegInd()); 
                
            }
            M.resize(M.size1(), M.size2(), false);
            insert_sparse_columns(M_v, M, tag.getIsRight());
        }

        
        //GPU-based version
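        /** @brief Main GPU routine: computes a sparse approximate inverse M of A with block assembly, QR factorization and least squares solves on the device
        * @param A system matrix (device)
        * @param cpu_A host copy of the system matrix
        * @param cpu_M host matrix providing the initial sparsity pattern; receives the result
        * @param M preconditioner matrix on the device (output)
        * @param tag SPAI configuration tag
        */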
        template <typename ScalarType, unsigned int MAT_ALIGNMENT>
        void computeSPAI(const viennacl::compressed_matrix<ScalarType, MAT_ALIGNMENT>& A, //input
                         const boost::numeric::ublas::compressed_matrix<ScalarType>& cpu_A,
                         boost::numeric::ublas::compressed_matrix<ScalarType>& cpu_M, //output
                         viennacl::compressed_matrix<ScalarType, MAT_ALIGNMENT>& M,
                         const spai_tag& tag){
            typedef typename boost::numeric::ublas::vector<ScalarType> VectorType;
            typedef typename viennacl::linalg::detail::spai::sparse_vector<ScalarType> SparseVectorType;
            typedef typename boost::numeric::ublas::matrix<ScalarType> DenseMatrixType;
            typedef typename boost::numeric::ublas::compressed_matrix<ScalarType> CPUMatrixType;
            //typedef typename viennacl::compressed_matrix<ScalarType> GPUSparseMatrixType;
            //sparse matrix transpose...
            unsigned int cur_iter = 0;
            std::vector<cl_uint> g_is_update(cpu_M.size2(), static_cast<cl_uint>(1));
            //init is update
            //init_parallel_is_update(g_is_update);
            std::vector<SparseVectorType> A_v_c(cpu_M.size2());
            std::vector<SparseVectorType> M_v(cpu_M.size2());
            vectorize_column_matrix(cpu_A, A_v_c);
            vectorize_column_matrix(cpu_M, M_v);
            std::vector<SparseVectorType> g_res(cpu_M.size2());
            std::vector<std::vector<unsigned int> > g_I(cpu_M.size2());
            std::vector<std::vector<unsigned int> > g_J(cpu_M.size2());
            
            //OpenCL variables
            block_matrix g_A_I_J_vcl;
            block_vector g_bv_vcl;
            while((cur_iter < tag.getIterationLimit()) && is_all_update(g_is_update)){
                // SET UP THE BLOCKS..
                // PHASE ONE..
                //timer.start();
                //index set up on CPU
                if(cur_iter == 0) block_set_up(A, A_v_c, M_v, g_is_update, g_I, g_J, g_A_I_J_vcl, g_bv_vcl, cur_iter);
                else block_update(A, A_v_c, g_is_update, g_res, g_J, g_I, g_A_I_J_vcl, g_bv_vcl, tag, cur_iter);
                //std::cout<<"Phase 2 timing: "<<timer.get()<<std::endl;
                //PERFORM LEAST SQUARES SOLUTION
                //PHASE TWO
                //timer.start();
                least_square_solve<SparseVectorType, ScalarType>(A_v_c, M_v, g_I, g_J, g_A_I_J_vcl, g_bv_vcl, g_res, g_is_update, tag, cur_iter);
                //std::cout<<"Phase 3 timing: "<<timer.get()<<std::endl;
                if(tag.getIsStatic()) break;
                cur_iter++;
            }
            cpu_M.resize(cpu_M.size1(), cpu_M.size2(), false);
            insert_sparse_columns(M_v, cpu_M, tag.getIsRight());
            //copy the result back to the GPU
            M.resize(static_cast<unsigned int>(cpu_M.size1()), static_cast<unsigned int>(cpu_M.size2()));
            viennacl::copy(cpu_M, M);
        }
        
      } //namespace spai
    } //namespace detail
  } //namespace linalg
} //namespace viennacl
#endif
