#ifndef VIENNACL_LINALG_CUDA_SPGEMM_RMERGE_HPP_
#define VIENNACL_LINALG_CUDA_SPGEMM_RMERGE_HPP_

/** @file viennacl/linalg/cuda/spgemm_rmerge.hpp
    @brief Sparse matrix-matrix multiplication (C = A * B) for CSR matrices on CUDA, based on row merging (RMerge).
*/

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/tools/timer.hpp"
#include "viennacl/linalg/host_based/common.hpp"
#include "viennacl/linalg/cuda/common.hpp"
#include "viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp"

namespace viennacl
{
namespace linalg
{
namespace cuda
{
/** @brief Loads the value at the given address, using the read-only data cache (__ldg) on devices of compute capability 3.5 and higher. */
template<typename NumericT>
static inline __device__ NumericT load_and_cache(const NumericT *address)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
  return __ldg(address);
#else
  return *address;
#endif
}
/** @brief Rounds the provided value up to the next power of two (used for selecting the subwarp size per thread block). */
template<typename IndexT>
__device__ IndexT round_to_next_power_of_2(IndexT val)
{
  IndexT result = 1;
  while (result < val)
    result *= 2;
  return result;
}
//
// Stage 1: Obtain an upper bound for the required subwarp size and the work per row:
//
template<typename IndexT>
__global__ void compressed_matrix_gemm_stage_1(
          const IndexT * A_row_indices,
          const IndexT * A_col_indices,
          IndexT A_size1,
          const IndexT * B_row_indices,
          IndexT *subwarpsize_per_group,
          IndexT *max_nnz_row_A_per_group,
          IndexT *max_nnz_row_B_per_group)
{
  unsigned int subwarpsize_in_thread = 0;
  unsigned int max_nnz_row_A = 0;
  unsigned int max_nnz_row_B = 0;

  unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
  unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));

  for (unsigned int row = rows_per_group * blockIdx.x + threadIdx.x; row < row_per_group_end; row += blockDim.x)
  {
    unsigned int A_row_start = A_row_indices[row];
    unsigned int A_row_end   = A_row_indices[row+1];
    unsigned int row_num     = A_row_end - A_row_start;
    subwarpsize_in_thread = max(A_row_end - A_row_start, subwarpsize_in_thread);
    max_nnz_row_A         = max(max_nnz_row_A, row_num);
    for (unsigned int j = A_row_start; j < A_row_end; ++j)
    {
      unsigned int col = A_col_indices[j];
      unsigned int row_len_B = B_row_indices[col + 1] - B_row_indices[col];
      max_nnz_row_B = max(row_len_B, max_nnz_row_B);
    }
  }
  // reduction to obtain the maxima within the thread block:
  __shared__ unsigned int shared_subwarpsize[256];
  __shared__ unsigned int shared_max_nnz_row_A[256];
  __shared__ unsigned int shared_max_nnz_row_B[256];

  shared_subwarpsize[threadIdx.x]   = subwarpsize_in_thread;
  shared_max_nnz_row_A[threadIdx.x] = max_nnz_row_A;
  shared_max_nnz_row_B[threadIdx.x] = max_nnz_row_B;
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      shared_subwarpsize[threadIdx.x]   = max( shared_subwarpsize[threadIdx.x],  shared_subwarpsize[threadIdx.x + stride]);
      shared_max_nnz_row_A[threadIdx.x] = max(shared_max_nnz_row_A[threadIdx.x], shared_max_nnz_row_A[threadIdx.x + stride]);
      shared_max_nnz_row_B[threadIdx.x] = max(shared_max_nnz_row_B[threadIdx.x], shared_max_nnz_row_B[threadIdx.x + stride]);
    }
  }

  if (threadIdx.x == 0)
  {
    subwarpsize_per_group[blockIdx.x]   = round_to_next_power_of_2(shared_subwarpsize[0]);
    max_nnz_row_A_per_group[blockIdx.x] = shared_max_nnz_row_A[0];
    max_nnz_row_B_per_group[blockIdx.x] = shared_max_nnz_row_B[0];
  }
}
// Stage 2 helpers: minimum of a value over all threads of a subwarp (shuffle variant):
template<unsigned int SubWarpSizeV, typename IndexT>
__device__ IndexT subwarp_minimum_shuffle(IndexT min_index)
{
  for (unsigned int i = SubWarpSizeV/2; i >= 1; i /= 2)
    min_index = min(min_index, __shfl_xor((int)min_index, (int)i));
  return min_index;
}
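// Illustration (assumption: standard XOR-butterfly behaviour of __shfl_xor, not part of the original header):
// in each pass, lane l exchanges its value with lane l^i. For SubWarpSizeV == 8 and lane values
// {5,3,9,1,7,2,8,4}, the passes i = 4, 2, 1 produce {5,2,8,1,5,2,8,1}, then {5,1,5,1,5,1,5,1},
// and finally {1,1,1,1,1,1,1,1}, so every lane of the subwarp ends up holding the minimum.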
// Shared-memory variant of the subwarp minimum:
template<unsigned int SubWarpSizeV, typename IndexT>
__device__ IndexT subwarp_minimum_shared(IndexT min_index, IndexT id_in_warp, IndexT *shared_buffer)
{
  shared_buffer[threadIdx.x] = min_index;
  for (unsigned int i = SubWarpSizeV/2; i >= 1; i /= 2)
    shared_buffer[threadIdx.x] = min(shared_buffer[threadIdx.x], shared_buffer[(threadIdx.x + i) % 512]);
  return shared_buffer[threadIdx.x - id_in_warp];
}
// Stage 2: Determine the sparsity pattern of C.
// One subwarp merges the rows of B selected by the nonzeros of one row of A and counts the distinct column indices.
template<unsigned int SubWarpSizeV, typename IndexT>
__global__ void compressed_matrix_gemm_stage_2(
          const IndexT * A_row_indices,
          const IndexT * A_col_indices,
          IndexT A_size1,
          const IndexT * B_row_indices,
          const IndexT * B_col_indices,
          IndexT B_size2,
          IndexT * C_row_indices)
{
  __shared__ unsigned int shared_buffer[512];

  unsigned int num_warps  = blockDim.x / SubWarpSizeV;
  unsigned int warp_id    = threadIdx.x / SubWarpSizeV;
  unsigned int id_in_warp = threadIdx.x % SubWarpSizeV;

  unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
  unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));

  for (unsigned int row = rows_per_group * blockIdx.x + warp_id; row < row_per_group_end; row += num_warps)
  {
    unsigned int row_A_start = A_row_indices[row];
    unsigned int row_A_end   = A_row_indices[row+1];

    // each lane of the subwarp processes one row of B referenced by this row of A:
    unsigned int my_row_B    = row_A_start + id_in_warp;
    unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
    unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index) : 0;
    unsigned int row_B_end   = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;

    unsigned int num_nnz = 0;
    if (row_A_end - row_A_start > 1)
    {
      // merge the column indices of the selected rows of B:
      unsigned int current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;

      while (1)
      {
        // determine the minimum column index over all lanes of the subwarp:
        unsigned int min_index = current_front_index;
        min_index = subwarp_minimum_shared<SubWarpSizeV>(min_index, id_in_warp, shared_buffer);

        if (min_index == B_size2) // all lanes have exhausted their row of B
          break;

        // advance the front of each lane that contributed the minimum:
        if (current_front_index == min_index)
        {
          ++row_B_start;
          current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;
        }

        ++num_nnz;
      }
    }
    else // only a single row of B contributes, no merge required:
      num_nnz = row_B_end - row_B_start;

    if (id_in_warp == 0)
      C_row_indices[row] = num_nnz;
  }
}
//
// Stage 3: Fill C with the computed values
//

// Subwarp sum of a value over all threads of a subwarp (shuffle variant):
template<unsigned int SubWarpSizeV, typename NumericT>
__device__ NumericT subwarp_accumulate_shuffle(NumericT output_value)
{
  for (unsigned int i = SubWarpSizeV/2; i >= 1; i /= 2)
    output_value += __shfl_xor((int)output_value, (int)i);
  return output_value;
}
// Shared-memory variant of the subwarp sum:
template<unsigned int SubWarpSizeV, typename NumericT>
__device__ NumericT subwarp_accumulate_shared(NumericT output_value, unsigned int id_in_warp, NumericT *shared_buffer)
{
  shared_buffer[threadIdx.x] = output_value;
  for (unsigned int i = SubWarpSizeV/2; i >= 1; i /= 2)
    shared_buffer[threadIdx.x] += shared_buffer[(threadIdx.x + i) % 512];
  return shared_buffer[threadIdx.x - id_in_warp];
}
// Each subwarp merges the rows of B selected by one row of A and writes the accumulated entries to C:
template<unsigned int SubWarpSizeV, typename IndexT, typename NumericT>
__global__ void compressed_matrix_gemm_stage_3(
          const IndexT * A_row_indices,
          const IndexT * A_col_indices,
          const NumericT * A_elements,
          IndexT A_size1,
          const IndexT * B_row_indices,
          const IndexT * B_col_indices,
          const NumericT * B_elements,
          IndexT B_size2,
          IndexT const * C_row_indices,
          IndexT * C_col_indices,
          NumericT * C_elements)
{
  __shared__ unsigned int shared_indices[512];
  __shared__ NumericT     shared_values[512];

  unsigned int num_warps  = blockDim.x / SubWarpSizeV;
  unsigned int warp_id    = threadIdx.x / SubWarpSizeV;
  unsigned int id_in_warp = threadIdx.x % SubWarpSizeV;

  unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
  unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));

  for (unsigned int row = rows_per_group * blockIdx.x + warp_id; row < row_per_group_end; row += num_warps)
  {
    unsigned int row_A_start = A_row_indices[row];
    unsigned int row_A_end   = A_row_indices[row+1];

    unsigned int my_row_B    = row_A_start + ((row_A_end - row_A_start > 1) ? id_in_warp : 0); // single-nonzero rows are handled by all lanes
    unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
    unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index) : 0;
    unsigned int row_B_end   = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;
    NumericT val_A = (my_row_B < row_A_end) ? A_elements[my_row_B] : 0;

    unsigned int index_in_C = C_row_indices[row];

    if (row_A_end - row_A_start > 1)
    {
      unsigned int current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;
      NumericT     current_front_value = (row_B_start < row_B_end) ? load_and_cache(B_elements    + row_B_start) : 0;

      unsigned int index_buffer = 0;
      NumericT     value_buffer = 0;
      unsigned int buffer_size  = 0;

      while (1)
      {
        // determine the minimum column index over all lanes of the subwarp:
        unsigned int min_index = subwarp_minimum_shared<SubWarpSizeV>(current_front_index, id_in_warp, shared_indices);

        if (min_index == B_size2) // all lanes have exhausted their row of B
          break;

        // accumulate the contributions of all lanes currently holding the minimum index:
        NumericT output_value = (current_front_index == min_index) ? val_A * current_front_value : 0;
        output_value = subwarp_accumulate_shared<SubWarpSizeV>(output_value, id_in_warp, shared_values);

        // advance the front of each lane that contributed the minimum:
        if (current_front_index == min_index)
        {
          ++row_B_start;
          current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;
          current_front_value = (row_B_start < row_B_end) ? load_and_cache(B_elements    + row_B_start) : 0;
        }

        // buffer the new entry of C in registers (one entry per lane):
        index_buffer = (id_in_warp == buffer_size) ? min_index    : index_buffer;
        value_buffer = (id_in_warp == buffer_size) ? output_value : value_buffer;
        ++buffer_size;

        // flush the register buffer to C once it is full:
        if (buffer_size == SubWarpSizeV)
        {
          C_col_indices[index_in_C + id_in_warp] = index_buffer;
          C_elements[index_in_C + id_in_warp]    = value_buffer;
        }
        index_in_C += (buffer_size == SubWarpSizeV) ? SubWarpSizeV : 0;
        buffer_size = (buffer_size == SubWarpSizeV) ?            0 : buffer_size;
      }

      // write the remaining buffered entries:
      if (id_in_warp < buffer_size)
      {
        C_col_indices[index_in_C + id_in_warp] = index_buffer;
        C_elements[index_in_C + id_in_warp]    = value_buffer;
      }
    }
    else // only a single row of B contributes, no merge required:
    {
      for (unsigned int i = row_B_start + id_in_warp; i < row_B_end; i += SubWarpSizeV)
      {
        C_col_indices[index_in_C + id_in_warp] = load_and_cache(B_col_indices + i);
        C_elements[index_in_C + id_in_warp]    = val_A * load_and_cache(B_elements + i);
        index_in_C += SubWarpSizeV;
      }
    }
  }
}
// Decomposition kernels for splitting A = A2 * G1 when rows of A are too long for a single subwarp:
template<typename IndexT>
__global__ void compressed_matrix_gemm_decompose_1(const IndexT * A_row_indices,
                                                   IndexT A_size1,
                                                   IndexT max_per_row,
                                                   IndexT *chunks_per_row)
{
  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_size1; i += blockDim.x * gridDim.x)
  {
    IndexT num_entries = A_row_indices[i+1] - A_row_indices[i];
    chunks_per_row[i] = (num_entries < max_per_row) ? 1 : ((num_entries - 1) / max_per_row + 1);
  }
}
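// Worked example for the chunk count above (illustration only): with max_per_row == 8, a row of A
// holding 19 entries yields (19 - 1) / 8 + 1 == 3 chunks, covering 8, 8 and 3 entries respectively.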
// A2 sums the chunks of each row again: one entry of value one per chunk.
template<typename IndexT, typename NumericT>
__global__ void compressed_matrix_gemm_A2(IndexT * A2_row_indices,
                                          IndexT * A2_col_indices,
                                          NumericT * A2_elements,
                                          IndexT A2_size1,
                                          IndexT *new_row_buffer)
{
  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A2_size1; i += blockDim.x * gridDim.x)
  {
    unsigned int index_start = new_row_buffer[i];
    unsigned int index_stop  = new_row_buffer[i+1];

    A2_row_indices[i] = index_start;

    for (IndexT j = index_start; j < index_stop; ++j)
    {
      A2_col_indices[j] = j;
      A2_elements[j]    = NumericT(1);
    }
  }

  // thread 0 writes the closing entry of the row array:
  if (threadIdx.x == 0 && blockIdx.x == 0)
    A2_row_indices[A2_size1] = new_row_buffer[A2_size1];
}
// G1 holds the entries of A, with long rows split into chunks of at most max_per_row entries each.
template<typename IndexT, typename NumericT>
__global__ void compressed_matrix_gemm_G1(IndexT * G1_row_indices,
                                          IndexT * G1_col_indices,
                                          NumericT * G1_elements,
                                          IndexT G1_size1,
                                          IndexT const *A_row_indices,
                                          IndexT const *A_col_indices,
                                          NumericT const *A_elements,
                                          IndexT A_size1,
                                          IndexT A_nnz,
                                          IndexT max_per_row,
                                          IndexT *new_row_buffer)
{
  // Part 1: Copy column indices and entries:
  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_nnz; i += blockDim.x * gridDim.x)
  {
    G1_col_indices[i] = A_col_indices[i];
    G1_elements[i]    = A_elements[i];
  }

  // Part 2: Derive the new row start indices:
  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_size1; i += blockDim.x * gridDim.x)
  {
    unsigned int old_start  = A_row_indices[i];
    unsigned int new_start  = new_row_buffer[i];
    unsigned int row_chunks = new_row_buffer[i+1] - new_start;

    for (IndexT j=0; j<row_chunks; ++j)
      G1_row_indices[new_start + j] = old_start + j * max_per_row;
  }

  // thread 0 writes the closing entry of the row array:
  if (threadIdx.x == 0 && blockIdx.x == 0)
    G1_row_indices[G1_size1] = A_row_indices[A_size1];
}
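// Illustration of the decomposition built by the two kernels above (not part of the original header):
// a row of A with 19 entries and max_per_row == 8 becomes three consecutive rows of G1 holding its
// first 8, next 8 and last 3 entries; the corresponding row of A2 holds a single one for each of these
// three chunks, so A2 * G1 reproduces A while every row of G1 fits into one subwarp.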
/** @brief Carries out the sparse matrix-sparse matrix product C = A * B for CSR matrices using the RMerge approach. */
template<class NumericT, unsigned int AlignmentV>
void prod_impl(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
               viennacl::compressed_matrix<NumericT, AlignmentV> const & B,
               viennacl::compressed_matrix<NumericT, AlignmentV> & C)
{
  C.resize(A.size1(), B.size2(), false);

  unsigned int blocknum  = 256;
  unsigned int threadnum = 128;

  viennacl::vector<unsigned int> subwarp_sizes(blocknum, viennacl::traits::context(A)); // subwarp size required per group
  viennacl::vector<unsigned int> max_nnz_row_A(blocknum, viennacl::traits::context(A)); // maximum nonzeros per row of A in each group
  viennacl::vector<unsigned int> max_nnz_row_B(blocknum, viennacl::traits::context(A)); // maximum nonzeros per row of B in each group

  //
  // Stage 1: Determine the required subwarp size and the work per row:
  //
  compressed_matrix_gemm_stage_1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                          viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                          static_cast<unsigned int>(A.size1()),
                                                          viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                          viennacl::cuda_arg(subwarp_sizes),
                                                          viennacl::cuda_arg(max_nnz_row_A),
                                                          viennacl::cuda_arg(max_nnz_row_B));
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_1");
  subwarp_sizes.switch_memory_context(viennacl::context(viennacl::MAIN_MEMORY));
  unsigned int * subwarp_sizes_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(subwarp_sizes.handle());

  max_nnz_row_A.switch_memory_context(viennacl::context(viennacl::MAIN_MEMORY));
  unsigned int const * max_nnz_row_A_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(max_nnz_row_A.handle());

  max_nnz_row_B.switch_memory_context(viennacl::context(viennacl::MAIN_MEMORY));
  unsigned int const * max_nnz_row_B_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(max_nnz_row_B.handle());

  unsigned int max_subwarp_size = 0;
  for (std::size_t i=0; i<subwarp_sizes.size(); ++i)
    max_subwarp_size = std::max(max_subwarp_size, subwarp_sizes_ptr[i]);

  unsigned int A_max_nnz_per_row = 0;
  for (std::size_t i=0; i<max_nnz_row_A.size(); ++i)
    A_max_nnz_per_row = std::max(A_max_nnz_per_row, max_nnz_row_A_ptr[i]);
  //
  // If any row of A has more than 32 nonzeros, decompose A = A2 * G1 so that all rows become short enough:
  //
  if (max_subwarp_size > 32)
  {
    // determine the chunk length for the decomposition:
    unsigned int max_entries_in_G = 32;
    if (A_max_nnz_per_row <= 256)
      max_entries_in_G = 16;
    if (A_max_nnz_per_row <= 64)
      max_entries_in_G = 8;

    viennacl::vector<unsigned int> exclusive_scan_helper(A.size1() + 1, viennacl::traits::context(A));
    compressed_matrix_gemm_decompose_1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                                static_cast<unsigned int>(A.size1()),
                                                                static_cast<unsigned int>(max_entries_in_G),
                                                                viennacl::cuda_arg(exclusive_scan_helper));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_decompose_1");

    // exclusive scan over the chunk counts yields the row array of G1 and the number of rows of G1:
    viennacl::linalg::exclusive_scan(exclusive_scan_helper);
    unsigned int augmented_size = exclusive_scan_helper[A.size1()];
    // A2 has size (A.size1() x augmented_size), G1 has size (augmented_size x A.size2()):
    viennacl::compressed_matrix<NumericT, AlignmentV> A2(A.size1(), augmented_size, augmented_size, viennacl::traits::context(A));
    viennacl::compressed_matrix<NumericT, AlignmentV> G1(augmented_size, A.size2(), A.nnz(), viennacl::traits::context(A));

    // fill A2:
    compressed_matrix_gemm_A2<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A2.handle1()),
                                                       viennacl::cuda_arg<unsigned int>(A2.handle2()),
                                                       viennacl::cuda_arg<NumericT>(A2.handle()),
                                                       static_cast<unsigned int>(A2.size1()),
                                                       viennacl::cuda_arg(exclusive_scan_helper));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_A2");

    // fill G1:
    compressed_matrix_gemm_G1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(G1.handle1()),
                                                       viennacl::cuda_arg<unsigned int>(G1.handle2()),
                                                       viennacl::cuda_arg<NumericT>(G1.handle()),
                                                       static_cast<unsigned int>(G1.size1()),
                                                       viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                       viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                       viennacl::cuda_arg<NumericT>(A.handle()),
                                                       static_cast<unsigned int>(A.size1()),
                                                       static_cast<unsigned int>(A.nnz()),
                                                       static_cast<unsigned int>(max_entries_in_G),
                                                       viennacl::cuda_arg(exclusive_scan_helper));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_G1");

    // compute C = A * B as C = A2 * (G1 * B), where both factors now have short rows:
    viennacl::compressed_matrix<NumericT, AlignmentV> tmp(G1.size1(), B.size2(), 0, viennacl::traits::context(A));
    prod_impl(G1, B, tmp);
    prod_impl(A2, tmp, C);
    return;
  }
  //
  // Stage 2: Determine the sparsity pattern of C:
  //
  if (max_subwarp_size == 32)
  {
    compressed_matrix_gemm_stage_2<32><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                                static_cast<unsigned int>(A.size1()),
                                                                viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(B.handle2()),
                                                                static_cast<unsigned int>(B.size2()),
                                                                viennacl::cuda_arg<unsigned int>(C.handle1()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_2");
  }
  else if (max_subwarp_size == 16)
  {
    compressed_matrix_gemm_stage_2<16><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                                static_cast<unsigned int>(A.size1()),
                                                                viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(B.handle2()),
                                                                static_cast<unsigned int>(B.size2()),
                                                                viennacl::cuda_arg<unsigned int>(C.handle1()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_2");
  }
  else
  {
    compressed_matrix_gemm_stage_2<8><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                               viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                               static_cast<unsigned int>(A.size1()),
                                                               viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                               viennacl::cuda_arg<unsigned int>(B.handle2()),
                                                               static_cast<unsigned int>(B.size2()),
                                                               viennacl::cuda_arg<unsigned int>(C.handle1()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_2");
  }
  // exclusive scan on the row lengths written to C.handle1() to obtain the row start offsets of C:
  viennacl::backend::typesafe_host_array<unsigned int> row_buffer(C.handle1(), C.size1() + 1);
  viennacl::backend::memory_read(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
  unsigned int current_offset = 0;
  for (std::size_t i=0; i<C.size1(); ++i)
  {
    unsigned int tmp = row_buffer[i];
    row_buffer.set(i, current_offset);
    current_offset += tmp;
  }
  row_buffer.set(C.size1(), current_offset);
  viennacl::backend::memory_write(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get());

  //
  // Stage 3: Compute the entries of C:
  //
  C.reserve(current_offset, false);
  if (max_subwarp_size == 32)
  {
    compressed_matrix_gemm_stage_3<32><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                                viennacl::cuda_arg<NumericT>(A.handle()),
                                                                static_cast<unsigned int>(A.size1()),
                                                                viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(B.handle2()),
                                                                viennacl::cuda_arg<NumericT>(B.handle()),
                                                                static_cast<unsigned int>(B.size2()),
                                                                viennacl::cuda_arg<unsigned int>(C.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(C.handle2()),
                                                                viennacl::cuda_arg<NumericT>(C.handle()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_3");
  }
  else if (max_subwarp_size == 16)
  {
    compressed_matrix_gemm_stage_3<16><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                                viennacl::cuda_arg<NumericT>(A.handle()),
                                                                static_cast<unsigned int>(A.size1()),
                                                                viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(B.handle2()),
                                                                viennacl::cuda_arg<NumericT>(B.handle()),
                                                                static_cast<unsigned int>(B.size2()),
                                                                viennacl::cuda_arg<unsigned int>(C.handle1()),
                                                                viennacl::cuda_arg<unsigned int>(C.handle2()),
                                                                viennacl::cuda_arg<NumericT>(C.handle()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_3");
  }
  else
  {
    compressed_matrix_gemm_stage_3<8><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                                               viennacl::cuda_arg<unsigned int>(A.handle2()),
                                                               viennacl::cuda_arg<NumericT>(A.handle()),
                                                               static_cast<unsigned int>(A.size1()),
                                                               viennacl::cuda_arg<unsigned int>(B.handle1()),
                                                               viennacl::cuda_arg<unsigned int>(B.handle2()),
                                                               viennacl::cuda_arg<NumericT>(B.handle()),
                                                               static_cast<unsigned int>(B.size2()),
                                                               viennacl::cuda_arg<unsigned int>(C.handle1()),
                                                               viennacl::cuda_arg<unsigned int>(C.handle2()),
                                                               viennacl::cuda_arg<NumericT>(C.handle()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_3");
  }
}

} // namespace cuda
} // namespace linalg
} // namespace viennacl

#endif
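// Usage sketch (illustration only, assuming the usual ViennaCL convenience API; not part of this header):
//
//   viennacl::compressed_matrix<float> A, B, C;
//   // ... fill A and B, e.g. via viennacl::copy() from host data ...
//   C = viennacl::linalg::prod(A, B);   // dispatches to the CUDA prod_impl() above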