#define USE_COLUMN_MAJOR
//#undef USE_COLUMN_MAJOR

#include "libscl.h"

#include "viennacl/version.hpp"

#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/linalg/amg.hpp"
#include "viennacl/linalg/prod.hpp"
#include "viennacl/linalg/direct_solve.hpp"
#include "viennacl/linalg/norm_2.hpp"

using namespace scl;
using namespace std;

const INTEGER Arows = 1000;
const INTEGER Acols = 10000;
const INTEGER Bcols = 1000;

const INTEGER Brows = Acols;
const INTEGER Crows = Arows;
const INTEGER Ccols = Bcols;

class cpu_matrix {  // vclmat in libscl is a wrapper with more features
private:
  realmat a;
public:
  cpu_matrix() : a() {}
  cpu_matrix(const realmat& x) : a(x) {}
  unsigned int size() const { return a.size(); }
  unsigned int size1() const { return a.nrow(); }
  unsigned int size2() const { return a.ncol(); }
  REAL operator()(unsigned int i, unsigned int j) const { return a(i+1,j+1); }
  REAL& operator()(unsigned int i, unsigned int j) { return a(i+1,j+1); }
  operator realmat() const { return a; }
};

class cycle {
private:
  int i;
public:
  cycle() : i(0) {}
  float operator()()  { return float(++i % 1000); }
};


int main(int argc, char** argp, char** envp)
{
  cout << '\n';
  #if defined USE_COLUMN_MAJOR      
    cout << *argp << " is using column major" << '\n';
  #else
    cout << *argp << " is not using column major" << '\n';
  #endif

  realmat A(Arows,Acols);
  realmat B(Brows,Bcols);
  realmat C(Crows,Ccols);

  cycle fill;  
  for (INTEGER i=1; i<=A.size(); ++i) A[i] = fill();
  for (INTEGER i=1; i<=B.size(); ++i) B[i] = fill();

  stopwatch timer;

  C = A*B;

  REAL cpu_time = timer.time();

  cout << '\n';
  cout << "libscl_float mult time = " << cpu_time << '\n';

  realmat Answer = C;
  
  timer.reset(); 

  #if defined USE_COLUMN_MAJOR
    viennacl::matrix<float,viennacl::column_major> gpu_A(Arows,Acols);
    viennacl::matrix<float,viennacl::column_major> gpu_B(Brows,Bcols);
    viennacl::matrix<float,viennacl::column_major> gpu_C(Crows,Ccols);
  #else
    viennacl::matrix<float> gpu_A(Arows,Acols);
    viennacl::matrix<float> gpu_B(Brows,Bcols);
    viennacl::matrix<float> gpu_C(Crows,Ccols);
  #endif

  cpu_matrix cpu_A(A);
  cpu_matrix cpu_B(B);
  cpu_matrix cpu_C(C);

  timer.reset();

  viennacl::copy(cpu_A,gpu_A);
  viennacl::copy(cpu_B,gpu_B);

  REAL vcl_time_1 = timer.time();

  cout << "viennacl A & B copy time = " << vcl_time_1 << '\n';

  timer.reset();

  gpu_C = viennacl::linalg::prod(gpu_A, gpu_B);

  REAL vcl_time_2 = timer.time();
  
  cout << "viennacl mult time = " << vcl_time_2 << '\n';

  timer.reset();

  viennacl::copy(gpu_C,cpu_C);

/*

  REAL vcl_time_3 = timer.time();

  cout << "viennacl C copy time = " << vcl_time_3 << '\n';


  unsigned int count = 0;

  C = cpu_C;

  for (INTEGER i=1; i<=Answer.size(); ++ i) {
      float err = C[i] - Answer[i]; 
      float tol =  Answer[i]*1.0e-4;
      err = err > 0 ? err : -err;
      tol = tol > 0 ? tol : -tol;
      if (err > tol) ++count;
  }

  REAL vcl_time = vcl_time_1 + vcl_time_2 + vcl_time_3;

  cout << "viennacl total time = " << vcl_time << '\n';

  cout << "GPU/CPU total time = " 
       << fmt('f',8,4,100.0*vcl_time/cpu_time) << " per cent" << '\n';
  cout << "GPU/CPU mult time  = " 
       << fmt('f',8,4,100.0*vcl_time_2/cpu_time) << " per cent" << '\n';

  cout << "viennacl errors = " << count << '\n';
  cout << '\n';

*/

  return 0;
}
