#ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_

// Implementations of CUDA kernels and host wrappers for sparse matrix operations
// (matrix-vector products, sparse-times-dense products, and triangular solves)
// on CSR, compressed-CSR, COO, ELL, sliced-ELL and HYB storage formats.

namespace viennacl
{
namespace linalg
{
namespace cuda
{
template<typename NumericT>
__global__ void csr_row_info_extractor_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          NumericT * result,
          unsigned int size,
          unsigned int option)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < size;
                    row += gridDim.x * blockDim.x)
  {
    NumericT value = 0;
    unsigned int row_end = row_indices[row+1];

    switch (option)
    {
      case 0: //inf-norm
        for (unsigned int i = row_indices[row]; i < row_end; ++i)
          value = max(value, fabs(elements[i]));
        break;

      case 1: //1-norm
        for (unsigned int i = row_indices[row]; i < row_end; ++i)
          value += fabs(elements[i]);
        break;

      case 2: //2-norm
        for (unsigned int i = row_indices[row]; i < row_end; ++i)
          value += elements[i] * elements[i];
        value = sqrt(value);
        break;

      case 3: //diagonal entry
        for (unsigned int i = row_indices[row]; i < row_end; ++i)
        {
          if (column_indices[i] == row)
          {
            value = elements[i];
            break;
          }
        }
        break;

      default:
        break;
    }

    result[row] = value;
  }
}
template<typename NumericT, unsigned int AlignmentV>
void row_info(compressed_matrix<NumericT, AlignmentV> const & mat,
              vector_base<NumericT> & vec,
              viennacl::linalg::detail::row_info_types info_selector)
{
  csr_row_info_extractor_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                              viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                              viennacl::cuda_arg<NumericT>(mat.handle()),
                                              viennacl::cuda_arg(vec),
                                              static_cast<unsigned int>(mat.size1()),
                                              static_cast<unsigned int>(info_selector));
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_row_info_extractor_kernel");
}
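
/* Usage sketch: the 'option' switch in the kernel above corresponds to the
   values of viennacl::linalg::detail::row_info_types (0: inf-norm, 1: 1-norm,
   2: 2-norm, 3: diagonal entry). Extracting the diagonal of a CSR matrix, for
   example (assumes A is an initialized device matrix):

     viennacl::compressed_matrix<float> A;                 // assume initialized
     viennacl::vector<float> d(A.size1());
     viennacl::linalg::cuda::row_info(A, d, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
*/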
template<unsigned int SubWarpSizeV, typename NumericT>
__global__ void compressed_matrix_vec_mul_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          const NumericT * x,
          unsigned int start_x,
          unsigned int inc_x,
          NumericT * result,
          unsigned int start_result,
          unsigned int inc_result,
          unsigned int size_result)
{
  __shared__ NumericT shared_elements[512];

  const unsigned int id_in_row = threadIdx.x % SubWarpSizeV;
  const unsigned int block_increment = blockDim.x * ((size_result - 1) / (gridDim.x * blockDim.x) + 1);
  const unsigned int block_start = blockIdx.x * block_increment;
  const unsigned int block_stop  = min(block_start + block_increment, size_result);

  for (unsigned int row  = block_start + threadIdx.x / SubWarpSizeV;
                    row  < block_stop;
                    row += blockDim.x / SubWarpSizeV)
  {
    NumericT dot_prod = NumericT(0);
    unsigned int row_end = row_indices[row+1];
    for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += SubWarpSizeV)
      dot_prod += elements[i] * x[column_indices[i] * inc_x + start_x];

    // sub-warp butterfly reduction (every lane ends up with the full row sum):
    shared_elements[threadIdx.x] = dot_prod;
    if (1  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  1];
    if (2  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  2];
    if (4  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  4];
    if (8  < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^  8];
    if (16 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 16];

    if (id_in_row == 0)
      result[row * inc_result + start_result] = shared_elements[threadIdx.x];
  }
}
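
/* The XOR ladder above is a butterfly all-reduce within a sub-warp: after the
   step with offset s, each lane holds the sum over its group of 2*s lanes, so
   after log2(SubWarpSizeV) steps every lane holds the complete row sum. The
   in-place form is safe on the GPU because the lanes of a sub-warp execute in
   lockstep; a sequential illustration needs a temporary per step:

     // Butterfly all-reduce over 8 values (illustrative host-side sketch):
     float v[8] = {1, 2, 3, 4, 5, 6, 7, 8};
     for (unsigned int s = 1; s < 8; s *= 2)
     {
       float t[8];
       for (unsigned int lane = 0; lane < 8; ++lane)
         t[lane] = v[lane] + v[lane ^ s];   // all reads happen "simultaneously"
       for (unsigned int lane = 0; lane < 8; ++lane)
         v[lane] = t[lane];
     }
     // every v[lane] now equals 36
*/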
template<typename NumericT>
__global__ void compressed_matrix_vec_mul_adaptive_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const unsigned int * row_blocks,
          const NumericT * elements,
          unsigned int num_blocks,
          const NumericT * x,
          unsigned int start_x,
          unsigned int inc_x,
          NumericT * result,
          unsigned int start_result,
          unsigned int inc_result,
          unsigned int size_result)
{
  __shared__ NumericT shared_elements[1024];

  for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)
  {
    unsigned int row_start = row_blocks[block_id];
    unsigned int row_stop  = row_blocks[block_id + 1];
    unsigned int element_start = row_indices[row_start];
    unsigned int element_stop = row_indices[row_stop];
    unsigned int rows_to_process = row_stop - row_start;

    if (rows_to_process > 1)  // CSR stream: several short rows per block
    {
      // load to shared buffer:
      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
        shared_elements[i - element_start] = elements[i] * x[column_indices[i] * inc_x + start_x];

      __syncthreads();

      // use one thread per row to sum:
      for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x)
      {
        NumericT dot_prod = 0;
        unsigned int thread_row_start = row_indices[row]     - element_start;
        unsigned int thread_row_stop  = row_indices[row + 1] - element_start;
        for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
          dot_prod += shared_elements[i];
        result[row * inc_result + start_result] = dot_prod;
      }
    }
    else  // CSR vector: one long row processed by the whole block
    {
      // load and sum to shared buffer:
      shared_elements[threadIdx.x] = 0;
      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
        shared_elements[threadIdx.x] += elements[i] * x[column_indices[i] * inc_x + start_x];

      // reduction to obtain final result:
      for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
      {
        __syncthreads();
        if (threadIdx.x < stride)
          shared_elements[threadIdx.x] += shared_elements[threadIdx.x + stride];
      }

      if (threadIdx.x == 0)
        result[row_start * inc_result + start_result] = shared_elements[0];
    }

    __syncthreads();
  }
}
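
/* This is the CSR-adaptive scheme: row_blocks partitions the rows so that the
   nonzeros of each partition fit into the 1024-entry shared buffer. A partition
   covering many short rows runs in "CSR stream" mode (products staged in shared
   memory, then one thread per row), while a partition consisting of a single
   long row falls into the "CSR vector" branch (block-wide reduction).
   Illustrative partition (hypothetical numbers):

     // row_indices = {0, 2, 4, 5, 9, 2000}  -> nnz per row: 2, 2, 1, 4, 1991
     // row_blocks  = {0, 4, 5}
     // block 0: rows 0..3 (9 nonzeros)   -> CSR stream
     // block 1: row 4 (1991 nonzeros)    -> CSR vector, chunked by blockDim.x
*/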
template<class NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
               viennacl::vector_base<NumericT> & result)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 500
  if (double(mat.nnz()) / double(mat.size1()) > 6.4)  // less than 10% of threads expected to idle
  {
    compressed_matrix_vec_mul_kernel<8,  NumericT><<<512, 256>>>(  // sub-warp size 8 on Maxwell and later
#else
  if (double(mat.nnz()) / double(mat.size1()) > 12.0) // less than 25% of threads expected to idle
  {
    compressed_matrix_vec_mul_kernel<16, NumericT><<<512, 256>>>(  // sub-warp size 16 (half-warp) otherwise
#endif
                                                                 viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                                                 viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                                                 viennacl::cuda_arg<NumericT>(mat.handle()),
                                                                 viennacl::cuda_arg(vec),
                                                                 static_cast<unsigned int>(vec.start()),
                                                                 static_cast<unsigned int>(vec.stride()),
                                                                 viennacl::cuda_arg(result),
                                                                 static_cast<unsigned int>(result.start()),
                                                                 static_cast<unsigned int>(result.stride()),
                                                                 static_cast<unsigned int>(result.size()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_kernel");
  }
  else
  {
    compressed_matrix_vec_mul_adaptive_kernel<<<512, 256>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                                            viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                                            viennacl::cuda_arg<unsigned int>(mat.handle3()),
                                                            viennacl::cuda_arg<NumericT>(mat.handle()),
                                                            static_cast<unsigned int>(mat.blocks1()),
                                                            viennacl::cuda_arg(vec),
                                                            static_cast<unsigned int>(vec.start()),
                                                            static_cast<unsigned int>(vec.stride()),
                                                            viennacl::cuda_arg(result),
                                                            static_cast<unsigned int>(result.start()),
                                                            static_cast<unsigned int>(result.stride()),
                                                            static_cast<unsigned int>(result.size()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_adaptive_kernel");
  }
}
/** @brief Helper struct for accessing an element of a row- or column-major matrix. */
template<typename LayoutT>
struct mat_mult_matrix_index
{
  static __device__ unsigned int apply(unsigned int i, unsigned int j,
                                       unsigned int row_start, unsigned int row_inc,
                                       unsigned int col_start, unsigned int col_inc,
                                       unsigned int internal_rows, unsigned int internal_cols)
  {
    return (row_start + i * row_inc) * internal_cols + col_start + j * col_inc;
  }
};

template<>
struct mat_mult_matrix_index<viennacl::column_major>
{
  static __device__ unsigned int apply(unsigned int i, unsigned int j,
                                       unsigned int row_start, unsigned int row_inc,
                                       unsigned int col_start, unsigned int col_inc,
                                       unsigned int internal_rows, unsigned int internal_cols)
  {
    return (row_start + i * row_inc) + (col_start + j * col_inc) * internal_rows;
  }
};
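
/* Worked example: element (i, j) = (1, 0) of a matrix padded to
   internal_rows = 2, internal_cols = 3, with trivial ranges
   (row_start = col_start = 0, row_inc = col_inc = 1):

     row-major:    (0 + 1*1) * 3 + 0 + 0*1     = 3
     column-major: (0 + 1*1) + (0 + 0*1) * 2   = 1

   The start/inc parameters let the same kernels operate on sub-ranges and
   strided slices of a padded buffer without copying. */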
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void compressed_matrix_d_mat_mul_kernel(
          const unsigned int * sp_mat_row_indices,
          const unsigned int * sp_mat_col_indices,
          const NumericT * sp_mat_elements,
          const NumericT * d_mat,
          unsigned int d_mat_row_start,
          unsigned int d_mat_col_start,
          unsigned int d_mat_row_inc,
          unsigned int d_mat_col_inc,
          unsigned int d_mat_row_size,
          unsigned int d_mat_col_size,
          unsigned int d_mat_internal_rows,
          unsigned int d_mat_internal_cols,
          NumericT * result,
          unsigned int result_row_start,
          unsigned int result_col_start,
          unsigned int result_row_inc,
          unsigned int result_col_inc,
          unsigned int result_row_size,
          unsigned int result_col_size,
          unsigned int result_internal_rows,
          unsigned int result_internal_cols)
{
  // one block per sparse row, one thread per result column:
  for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x)
  {
    unsigned int row_start = sp_mat_row_indices[row];
    unsigned int row_end   = sp_mat_row_indices[row+1];

    for (unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
    {
      NumericT r = 0;

      for (unsigned int k = row_start; k < row_end; k++)
      {
        unsigned int j = sp_mat_col_indices[k];
        NumericT x = sp_mat_elements[k];
        NumericT y = d_mat[ DMatIndexT::apply(j, col,
                                              d_mat_row_start, d_mat_row_inc,
                                              d_mat_col_start, d_mat_col_inc,
                                              d_mat_internal_rows, d_mat_internal_cols) ];

        r += x * y;
      }

      result[ResultIndexT::apply(row, col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols)] = r;
    }
  }
}
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_base<NumericT> & d_mat,
               viennacl::matrix_base<NumericT> & result)
{
  // One kernel instantiation per combination of row-/column-major layouts of
  // d_mat and result; each call passes the CSR handles below, followed by the
  // d_mat and result range/stride arguments (elided in this fragment):
  if (d_mat.row_major() && result.row_major())
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::row_major>, NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle())
       /* , ... d_mat and result arguments ... */);
  }
  else if (d_mat.row_major() && !result.row_major())
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::column_major>, NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle())
       /* , ... d_mat and result arguments ... */);
  }
  else if (!d_mat.row_major() && result.row_major())
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::row_major>, NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle())
       /* , ... d_mat and result arguments ... */);
  }
  else
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::column_major>, NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle())
       /* , ... d_mat and result arguments ... */);
  }
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
}
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void compressed_matrix_d_tr_mat_mul_kernel(
          const unsigned int * sp_mat_row_indices,
          const unsigned int * sp_mat_col_indices,
          const NumericT * sp_mat_elements,
          const NumericT * d_mat,
          unsigned int d_mat_row_start,
          unsigned int d_mat_col_start,
          unsigned int d_mat_row_inc,
          unsigned int d_mat_col_inc,
          unsigned int d_mat_row_size,
          unsigned int d_mat_col_size,
          unsigned int d_mat_internal_rows,
          unsigned int d_mat_internal_cols,
          NumericT * result,
          unsigned int result_row_start,
          unsigned int result_col_start,
          unsigned int result_row_inc,
          unsigned int result_col_inc,
          unsigned int result_row_size,
          unsigned int result_col_size,
          unsigned int result_internal_rows,
          unsigned int result_internal_cols)
{
  for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x)
  {
    unsigned int row_start = sp_mat_row_indices[row];
    unsigned int row_end   = sp_mat_row_indices[row+1];

    for (unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
    {
      NumericT r = 0;

      for (unsigned int k = row_start; k < row_end; k++)
      {
        unsigned int j = sp_mat_col_indices[k];
        NumericT x = sp_mat_elements[k];
        // d_mat is transposed, hence the swapped (col, j) index pair:
        NumericT y = d_mat[ DMatIndexT::apply(col, j,
                                              d_mat_row_start, d_mat_row_inc,
                                              d_mat_col_start, d_mat_col_inc,
                                              d_mat_internal_rows, d_mat_internal_cols) ];

        r += x * y;
      }

      result[ ResultIndexT::apply(row, col,
                                  result_row_start, result_row_inc,
                                  result_col_start, result_col_inc,
                                  result_internal_rows, result_internal_cols) ] = r;
    }
  }
}
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(viennacl::compressed_matrix<NumericT, AlignmentV> const & sp_mat,
               viennacl::matrix_expression<const viennacl::matrix_base<NumericT>,
                                           const viennacl::matrix_base<NumericT>,
                                           viennacl::op_trans> const & d_mat,
               viennacl::matrix_base<NumericT> & result)
{
  // Layout dispatch as above; d_mat and result range/stride arguments elided:
  if (d_mat.lhs().row_major() && result.row_major())
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::row_major>, NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle())
       /* , ... d_mat and result arguments ... */);
  }
  else if (d_mat.lhs().row_major() && !result.row_major())
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::column_major>, NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle())
       /* , ... d_mat and result arguments ... */);
  }
  else if (!d_mat.lhs().row_major() && result.row_major())
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::row_major>, NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle())
       /* , ... d_mat and result arguments ... */);
  }
  else
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::column_major>, NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle())
       /* , ... d_mat and result arguments ... */);
  }
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
}
template<typename NumericT>
__global__ void compressed_matrix_diagonal_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          NumericT * result,
          unsigned int size)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < size;
                    row += gridDim.x * blockDim.x)
  {
    NumericT diag = NumericT(0);
    unsigned int row_end = row_indices[row+1];
    for (unsigned int i = row_indices[row]; i < row_end; ++i)
    {
      unsigned int col_index = column_indices[i];
      if (col_index == row)
      {
        diag = elements[i];
        break;
      }
    }
    result[row] = diag;
  }
}
// Triangular solves for CSR matrices. SFINAE restricts the overloads to sparse matrix types.
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if<viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat, viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_lower_tag)
{
  csr_unit_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                         viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                         viennacl::cuda_arg<NumericT>(mat.handle()),
                                         viennacl::cuda_arg(vec),
                                         static_cast<unsigned int>(mat.size1()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_forward_kernel");
}
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if<viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat, viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::lower_tag)
{
  csr_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                    viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                    viennacl::cuda_arg<NumericT>(mat.handle()),
                                    viennacl::cuda_arg(vec),
                                    static_cast<unsigned int>(mat.size1()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_forward_kernel");
}
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if<viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat, viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_upper_tag)
{
  csr_unit_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                          viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                          viennacl::cuda_arg<NumericT>(mat.handle()),
                                          viennacl::cuda_arg(vec),
                                          static_cast<unsigned int>(mat.size1()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_backward_kernel");
}
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if<viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat, viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::upper_tag)
{
  csr_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                     viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                     viennacl::cuda_arg<NumericT>(mat.handle()),
                                     viennacl::cuda_arg(vec),
                                     static_cast<unsigned int>(mat.size1()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_backward_kernel");
}
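
/* Usage sketch: the tag argument selects the kernel, e.g. solving L*x = b in
   place for a lower triangular CSR matrix (assumes initialized device objects):

     viennacl::compressed_matrix<float> L;
     viennacl::vector<float> b;
     viennacl::linalg::inplace_solve(L, b, viennacl::linalg::lower_tag());
     // b now holds the solution

   Note the <<<1, 128>>> launch: the forward/backward dependency chain of a
   triangular solve leaves little parallelism, so a single block is used. */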
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if<viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(viennacl::matrix_expression<const SparseMatrixT, const SparseMatrixT, viennacl::op_trans> const & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_lower_tag)
{
  csr_trans_unit_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
                                               viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
                                               viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
                                               viennacl::cuda_arg(vec),
                                               static_cast<unsigned int>(mat.lhs().size1()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_forward_kernel");
}
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if<viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(viennacl::matrix_expression<const SparseMatrixT, const SparseMatrixT, viennacl::op_trans> const & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::lower_tag)
{
  // extract the diagonal first, since the non-unit solve kernel divides by it:
  viennacl::vector<NumericT> diagonal(vec.size());
  compressed_matrix_diagonal_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
                                                viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
                                                viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
                                                viennacl::cuda_arg(diagonal),
                                                static_cast<unsigned int>(mat.size1()));

  csr_trans_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
                                          viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
                                          viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
                                          viennacl::cuda_arg(diagonal),
                                          viennacl::cuda_arg(vec),
                                          static_cast<unsigned int>(mat.lhs().size1()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_forward_kernel");
}
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if<viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(viennacl::matrix_expression<const SparseMatrixT, const SparseMatrixT, viennacl::op_trans> const & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_upper_tag)
{
  csr_trans_unit_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
                                                viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
                                                viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
                                                viennacl::cuda_arg(vec),
                                                static_cast<unsigned int>(mat.lhs().size1()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_backward_kernel");
}
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if<viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(viennacl::matrix_expression<const SparseMatrixT, const SparseMatrixT, viennacl::op_trans> const & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::upper_tag)
{
  // extract the diagonal first, since the non-unit solve kernel divides by it:
  viennacl::vector<NumericT> diagonal(vec.size());
  compressed_matrix_diagonal_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
                                                viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
                                                viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
                                                viennacl::cuda_arg(diagonal),
                                                static_cast<unsigned int>(mat.size1()));

  csr_trans_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
                                           viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
                                           viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
                                           viennacl::cuda_arg(diagonal),
                                           viennacl::cuda_arg(vec),
                                           static_cast<unsigned int>(mat.lhs().size1()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_backward_kernel");
}
template<typename NumericT, unsigned int AlignmentV>
void block_inplace_solve(const viennacl::matrix_expression<const viennacl::compressed_matrix<NumericT, AlignmentV>,
                                                           const viennacl::compressed_matrix<NumericT, AlignmentV>, viennacl::op_trans> & L,
                         viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
                         viennacl::vector_base<NumericT> const & /* L_diagonal, ignored */,
                         viennacl::vector_base<NumericT> & vec, viennacl::linalg::unit_lower_tag)
{
  csr_block_trans_unit_lu_forward<<<num_blocks, 128>>>(viennacl::cuda_arg<unsigned int>(L.lhs().handle1()),
                                                       viennacl::cuda_arg<unsigned int>(L.lhs().handle2()),
                                                       viennacl::cuda_arg<NumericT>(L.lhs().handle()),
                                                       viennacl::cuda_arg<unsigned int>(block_indices),
                                                       viennacl::cuda_arg(vec),
                                                       static_cast<unsigned int>(L.lhs().size1()));
}
template<typename NumericT, unsigned int AlignmentV>
void block_inplace_solve(const viennacl::matrix_expression<const viennacl::compressed_matrix<NumericT, AlignmentV>,
                                                           const viennacl::compressed_matrix<NumericT, AlignmentV>, viennacl::op_trans> & U,
                         viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
                         viennacl::vector_base<NumericT> const & U_diagonal,
                         viennacl::vector_base<NumericT> & vec, viennacl::linalg::upper_tag)
{
  csr_block_trans_lu_backward<<<num_blocks, 128>>>(viennacl::cuda_arg<unsigned int>(U.lhs().handle1()),
                                                   viennacl::cuda_arg<unsigned int>(U.lhs().handle2()),
                                                   viennacl::cuda_arg<NumericT>(U.lhs().handle()),
                                                   viennacl::cuda_arg(U_diagonal),
                                                   viennacl::cuda_arg<unsigned int>(block_indices),
                                                   viennacl::cuda_arg(vec),
                                                   static_cast<unsigned int>(U.lhs().size1()));
}
template<typename NumericT>
__global__ void compressed_compressed_matrix_vec_mul_kernel(
          const unsigned int * row_jumper,
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          unsigned int nonzero_rows,
          const NumericT * x,
          unsigned int start_x,
          unsigned int inc_x,
          NumericT * result,
          unsigned int start_result,
          unsigned int inc_result,
          unsigned int size_result)
{
  // zero the full result vector first:
  for (unsigned int i  = blockDim.x * blockIdx.x + threadIdx.x;
                    i  < size_result;
                    i += gridDim.x * blockDim.x)
  {
    result[i * inc_result + start_result] = 0;
  }

  // then scatter the row sums of the nonzero rows:
  for (unsigned int i  = blockDim.x * blockIdx.x + threadIdx.x;
                    i  < nonzero_rows;
                    i += gridDim.x * blockDim.x)
  {
    NumericT dot_prod = NumericT(0);
    unsigned int row_end = row_jumper[i+1];
    for (unsigned int j = row_jumper[i]; j < row_end; ++j)
      dot_prod += elements[j] * x[column_indices[j] * inc_x + start_x];
    result[row_indices[i] * inc_result + start_result] = dot_prod;
  }
}
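
/* compressed_compressed_matrix stores only rows that contain nonzeros: row_jumper
   has one entry per nonzero row and row_indices maps them back to global row
   numbers, which is why the kernel zeros the full result first and then scatters.
   Illustrative encoding (hypothetical 5x5 matrix with entries only in rows 1 and 4):

     // row_indices = {1, 4}     -> global indices of the nonzero rows
     // row_jumper  = {0, 2, 5}  -> nonzeros [0,2) belong to row 1, [2,5) to row 4
*/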
template<typename NumericT>
void prod_impl(const viennacl::compressed_compressed_matrix<NumericT> & mat,
               const viennacl::vector_base<NumericT> & vec,
               viennacl::vector_base<NumericT> & result)
{
  compressed_compressed_matrix_vec_mul_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                                            viennacl::cuda_arg<unsigned int>(mat.handle3()),
                                                            viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                                            viennacl::cuda_arg<NumericT>(mat.handle()),
                                                            static_cast<unsigned int>(mat.nnz1()),
                                                            viennacl::cuda_arg(vec),
                                                            static_cast<unsigned int>(vec.start()),
                                                            static_cast<unsigned int>(vec.stride()),
                                                            viennacl::cuda_arg(result),
                                                            static_cast<unsigned int>(result.start()),
                                                            static_cast<unsigned int>(result.stride()),
                                                            static_cast<unsigned int>(result.size()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_compressed_matrix_vec_mul_kernel");
}
template<typename NumericT>
__global__ void coo_row_info_extractor(const unsigned int * coords, //(row_index, column_index)
                                       const NumericT * elements,
                                       const unsigned int * group_boundaries,
                                       NumericT * result,
                                       unsigned int option)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int last_index  = blockDim.x - 1;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;

  unsigned int local_index = 0;

  for (unsigned int k = 0; k < k_end; ++k)
  {
    local_index = group_start + k * blockDim.x + threadIdx.x;

    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
    val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0;

    //check for carry from previous loop run:
    if (threadIdx.x == 0 && k > 0)
    {
      if (tmp.x == shared_rows[last_index])
      {
        switch (option)
        {
          case 0: case 3: val = max(val, fabs(inter_results[last_index]));    break; //inf-norm, diagonal
          case 1:         val = fabs(val) + inter_results[last_index];        break; //1-norm
          case 2:         val = sqrt(val * val + inter_results[last_index]);  break; //2-norm
          default: break;
        }
      }
      else //row completed in the previous run, write it out:
      {
        switch (option)
        {
          case 0: case 1: case 3: result[shared_rows[last_index]] = inter_results[last_index];       break;
          case 2:                 result[shared_rows[last_index]] = sqrt(inter_results[last_index]); break;
          default: break;
        }
      }
    }

    //segmented parallel reduction begin
    __syncthreads();
    shared_rows[threadIdx.x] = tmp.x;
    switch (option)
    {
      case 0: case 3: inter_results[threadIdx.x] = val;       break;
      case 1:         inter_results[threadIdx.x] = fabs(val); break;
      case 2:         inter_results[threadIdx.x] = val * val; break;
      default: break;
    }
    __syncthreads();

    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
    {
      NumericT left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
      __syncthreads();
      switch (option)
      {
        case 0: case 3: inter_results[threadIdx.x] = max(inter_results[threadIdx.x], left); break; //inf-norm, diagonal
        case 1: case 2: inter_results[threadIdx.x] += left;                                 break; //1-norm, 2-norm
        default: break;
      }
      __syncthreads();
    }
    //segmented parallel reduction end

    if (threadIdx.x != last_index &&
        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1] &&
        inter_results[threadIdx.x] != 0)
    {
      result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
    }

    __syncthreads();
  } //for k

  if (local_index + 1 == group_end && inter_results[threadIdx.x] != 0)
    result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
}
template<typename NumericT, unsigned int AlignmentV>
void row_info(coordinate_matrix<NumericT, AlignmentV> const & mat,
              vector_base<NumericT> & vec,
              viennacl::linalg::detail::row_info_types info_selector)
{
  coo_row_info_extractor<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle12()),
                                      viennacl::cuda_arg<NumericT>(mat.handle()),
                                      viennacl::cuda_arg<unsigned int>(mat.handle3()),
                                      viennacl::cuda_arg(vec),
                                      static_cast<unsigned int>(info_selector));
  VIENNACL_CUDA_LAST_ERROR_CHECK("coo_row_info_extractor");
}
template<typename NumericT>
__global__ void coordinate_matrix_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index)
                                                 const NumericT * elements,
                                                 const unsigned int * group_boundaries,
                                                 const NumericT * x,
                                                 unsigned int start_x,
                                                 unsigned int inc_x,
                                                 NumericT * result,
                                                 unsigned int start_result,
                                                 unsigned int inc_result)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;

  unsigned int local_index = 0;

  for (unsigned int k = 0; k < k_end; ++k)
  {
    local_index = group_start + k * blockDim.x + threadIdx.x;

    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
    val = (local_index < group_end) ? elements[local_index] * x[tmp.y * inc_x + start_x] : 0;

    //check for carry from previous loop run:
    if (threadIdx.x == 0 && k > 0)
    {
      if (tmp.x == shared_rows[blockDim.x-1])
        val += inter_results[blockDim.x-1];
      else
        result[shared_rows[blockDim.x-1] * inc_result + start_result] = inter_results[blockDim.x-1];
    }

    //segmented parallel reduction begin
    __syncthreads();
    shared_rows[threadIdx.x] = tmp.x;
    inter_results[threadIdx.x] = val;
    NumericT left = 0;
    __syncthreads();

    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
    {
      left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
      __syncthreads();
      inter_results[threadIdx.x] += left;
      __syncthreads();
    }
    //segmented parallel reduction end

    if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
    {
      result[tmp.x * inc_result + start_result] = inter_results[threadIdx.x];
    }

    __syncthreads();
  } //for k

  if (local_index + 1 == group_end)
    result[tmp.x * inc_result + start_result] = inter_results[threadIdx.x];
}
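
/* The COO kernels above use a segmented inclusive scan in shared memory: each
   thread adds its left neighbour's partial sum only while both entries belong
   to the same row (shared_rows matches), so row boundaries act as scan
   barriers. Worked example for one 4-thread step with

     shared_rows   = {0, 0, 1, 1}
     inter_results = {a, b, c, d}

   stride 1 yields {a, a+b, c, c+d}; stride 2 changes nothing because the
   entries two slots to the left belong to different rows. The last thread of
   each row segment then writes its total to result[]. */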
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
               viennacl::vector_base<NumericT> & result)
{
  result.clear();  // rows without nonzeros are never written by the kernel

  coordinate_matrix_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle12()),
                                                viennacl::cuda_arg<NumericT>(mat.handle()),
                                                viennacl::cuda_arg<unsigned int>(mat.handle3()),
                                                viennacl::cuda_arg(vec),
                                                static_cast<unsigned int>(vec.start()),
                                                static_cast<unsigned int>(vec.stride()),
                                                viennacl::cuda_arg(result),
                                                static_cast<unsigned int>(result.start()),
                                                static_cast<unsigned int>(result.stride()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_vec_mul_kernel");
}
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void coordinate_matrix_d_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
                                                   const NumericT * elements,
                                                   const unsigned int * group_boundaries,
                                                   const NumericT * d_mat,
                                                   unsigned int d_mat_row_start,
                                                   unsigned int d_mat_col_start,
                                                   unsigned int d_mat_row_inc,
                                                   unsigned int d_mat_col_inc,
                                                   unsigned int d_mat_row_size,
                                                   unsigned int d_mat_col_size,
                                                   unsigned int d_mat_internal_rows,
                                                   unsigned int d_mat_internal_cols,
                                                   NumericT * result,
                                                   unsigned int result_row_start,
                                                   unsigned int result_col_start,
                                                   unsigned int result_row_inc,
                                                   unsigned int result_col_inc,
                                                   unsigned int result_row_size,
                                                   unsigned int result_col_size,
                                                   unsigned int result_internal_rows,
                                                   unsigned int result_internal_cols)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;

  unsigned int local_index = 0;

  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
  {
    for (unsigned int k = 0; k < k_end; ++k)
    {
      local_index = group_start + k * blockDim.x + threadIdx.x;

      tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
      val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(tmp.y, result_col,
                                                                                        d_mat_row_start, d_mat_row_inc,
                                                                                        d_mat_col_start, d_mat_col_inc,
                                                                                        d_mat_internal_rows, d_mat_internal_cols) ] : 0;

      //check for carry from previous loop run:
      if (threadIdx.x == 0 && k > 0)
      {
        if (tmp.x == shared_rows[blockDim.x-1])
          val += inter_results[blockDim.x-1];
        else
          result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
                                     result_row_start, result_row_inc,
                                     result_col_start, result_col_inc,
                                     result_internal_rows, result_internal_cols)] = inter_results[blockDim.x-1];
      }

      //segmented parallel reduction begin
      __syncthreads();
      shared_rows[threadIdx.x] = tmp.x;
      inter_results[threadIdx.x] = val;
      NumericT left = 0;
      __syncthreads();

      for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
      {
        left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
        __syncthreads();
        inter_results[threadIdx.x] += left;
        __syncthreads();
      }
      //segmented parallel reduction end

      if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
          shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
      {
        result[ResultIndexT::apply(tmp.x, result_col,
                                   result_row_start, result_row_inc,
                                   result_col_start, result_col_inc,
                                   result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
      }

      __syncthreads();
    } //for k

    if (local_index + 1 == group_end)
      result[ResultIndexT::apply(tmp.x, result_col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
  } //for result_col
}
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_base<NumericT> & d_mat,
               viennacl::matrix_base<NumericT> & result)
{
  // One branch per layout combination; each call passes the COO handles below,
  // followed by the d_mat and result range/stride arguments (elided here):
  if (d_mat.row_major() && result.row_major())
  {
    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::row_major>, NumericT><<<64, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle3())
       /* , ... d_mat and result arguments ... */);
  }
  else if (d_mat.row_major() && !result.row_major())
  {
    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::column_major>, NumericT><<<64, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle3())
       /* , ... d_mat and result arguments ... */);
  }
  else if (!d_mat.row_major() && result.row_major())
  {
    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::row_major>, NumericT><<<64, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle3())
       /* , ... d_mat and result arguments ... */);
  }
  else
  {
    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::column_major>, NumericT><<<64, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle3())
       /* , ... d_mat and result arguments ... */);
  }
  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
}
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void coordinate_matrix_d_tr_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
                                                      const NumericT * elements,
                                                      const unsigned int * group_boundaries,
                                                      const NumericT * d_mat,
                                                      unsigned int d_mat_row_start,
                                                      unsigned int d_mat_col_start,
                                                      unsigned int d_mat_row_inc,
                                                      unsigned int d_mat_col_inc,
                                                      unsigned int d_mat_row_size,
                                                      unsigned int d_mat_col_size,
                                                      unsigned int d_mat_internal_rows,
                                                      unsigned int d_mat_internal_cols,
                                                      NumericT * result,
                                                      unsigned int result_row_start,
                                                      unsigned int result_col_start,
                                                      unsigned int result_row_inc,
                                                      unsigned int result_col_inc,
                                                      unsigned int result_row_size,
                                                      unsigned int result_col_size,
                                                      unsigned int result_internal_rows,
                                                      unsigned int result_internal_cols)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;

  unsigned int local_index = 0;

  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
  {
    for (unsigned int k = 0; k < k_end; ++k)
    {
      local_index = group_start + k * blockDim.x + threadIdx.x;

      tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
      // d_mat is transposed, hence the swapped (result_col, tmp.y) index pair:
      val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(result_col, tmp.y,
                                                                                        d_mat_row_start, d_mat_row_inc,
                                                                                        d_mat_col_start, d_mat_col_inc,
                                                                                        d_mat_internal_rows, d_mat_internal_cols)] : 0;

      //check for carry from previous loop run:
      if (threadIdx.x == 0 && k > 0)
      {
        if (tmp.x == shared_rows[blockDim.x-1])
          val += inter_results[blockDim.x-1];
        else
          result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
                                     result_row_start, result_row_inc,
                                     result_col_start, result_col_inc,
                                     result_internal_rows, result_internal_cols) ] = inter_results[blockDim.x-1];
      }

      //segmented parallel reduction begin
      __syncthreads();
      shared_rows[threadIdx.x] = tmp.x;
      inter_results[threadIdx.x] = val;
      NumericT left = 0;
      __syncthreads();

      for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
      {
        left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
        __syncthreads();
        inter_results[threadIdx.x] += left;
        __syncthreads();
      }
      //segmented parallel reduction end

      if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
          shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
      {
        result[ ResultIndexT::apply(tmp.x, result_col,
                                    result_row_start, result_row_inc,
                                    result_col_start, result_col_inc,
                                    result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
      }

      __syncthreads();
    } //for k

    if (local_index + 1 == group_end)
      result[ ResultIndexT::apply(tmp.x, result_col,
                                  result_row_start, result_row_inc,
                                  result_col_start, result_col_inc,
                                  result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
  } //for result_col
}
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(viennacl::coordinate_matrix<NumericT, AlignmentV> const & sp_mat,
               viennacl::matrix_expression<const viennacl::matrix_base<NumericT>,
                                           const viennacl::matrix_base<NumericT>,
                                           viennacl::op_trans> const & d_mat,
               viennacl::matrix_base<NumericT> & result)
{
  // Layout dispatch as above; d_mat and result range/stride arguments elided:
  if (d_mat.lhs().row_major() && result.row_major())
  {
    coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::row_major>, NumericT><<<64, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle3())
       /* , ... d_mat and result arguments ... */);
  }
  else if (d_mat.lhs().row_major() && !result.row_major())
  {
    coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::column_major>, NumericT><<<64, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle3())
       /* , ... d_mat and result arguments ... */);
  }
  else if (!d_mat.lhs().row_major() && result.row_major())
  {
    coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::row_major>, NumericT><<<64, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle3())
       /* , ... d_mat and result arguments ... */);
  }
  else
  {
    coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::column_major>, NumericT><<<64, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       viennacl::cuda_arg<unsigned int>(sp_mat.handle3())
       /* , ... d_mat and result arguments ... */);
  }
  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
}
template<typename NumericT>
__global__ void ell_matrix_vec_mul_kernel(const unsigned int * coords,
                                          const NumericT * elements,
                                          const NumericT * x,
                                          unsigned int start_x,
                                          unsigned int inc_x,
                                          NumericT * result,
                                          unsigned int start_result,
                                          unsigned int inc_result,
                                          unsigned int row_num,
                                          unsigned int col_num,
                                          unsigned int internal_row_num,
                                          unsigned int items_per_row,
                                          unsigned int aligned_items_per_row)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
  {
    NumericT sum = 0;

    unsigned int offset = row_id;
    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
    {
      NumericT val = elements[offset];

      if (val != NumericT(0))
      {
        int col = coords[offset];
        sum += x[col * inc_x + start_x] * val;
      }
    }

    result[row_id * inc_result + start_result] = sum;
  }
}
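
/* ELL stores a fixed number of entries per row (items_per_row = maximum number
   of nonzeros in any row), zero-padded and laid out column-by-column so that
   consecutive threads read consecutive addresses. Illustrative layout for

     A = [ 1 0 2 ]
         [ 0 3 0 ]

   with items_per_row = 2 and internal_row_num = 2:

     // elements = { 1, 3,  2, 0 }  // item 0 of all rows, then item 1 of all rows
     // coords   = { 0, 1,  2, 0 }  // padded entries carry val == 0 and are skipped
*/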
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
               viennacl::vector_base<NumericT> & result)
{
  ell_matrix_vec_mul_kernel<<<256, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                          viennacl::cuda_arg<NumericT>(mat.handle()),
                                          viennacl::cuda_arg(vec),
                                          static_cast<unsigned int>(vec.start()),
                                          static_cast<unsigned int>(vec.stride()),
                                          viennacl::cuda_arg(result),
                                          static_cast<unsigned int>(result.start()),
                                          static_cast<unsigned int>(result.stride()),
                                          static_cast<unsigned int>(mat.size1()),
                                          static_cast<unsigned int>(mat.size2()),
                                          static_cast<unsigned int>(mat.internal_size1()),
                                          static_cast<unsigned int>(mat.maxnnz()),
                                          static_cast<unsigned int>(mat.internal_maxnnz()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_vec_mul_kernel");
}
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void ell_matrix_d_mat_mul_kernel(const unsigned int * sp_mat_coords,
                                            const NumericT * sp_mat_elements,
                                            unsigned int sp_mat_row_num,
                                            unsigned int sp_mat_col_num,
                                            unsigned int sp_mat_internal_row_num,
                                            unsigned int sp_mat_items_per_row,
                                            unsigned int sp_mat_aligned_items_per_row,
                                            const NumericT * d_mat,
                                            unsigned int d_mat_row_start,
                                            unsigned int d_mat_col_start,
                                            unsigned int d_mat_row_inc,
                                            unsigned int d_mat_col_inc,
                                            unsigned int d_mat_row_size,
                                            unsigned int d_mat_col_size,
                                            unsigned int d_mat_internal_rows,
                                            unsigned int d_mat_internal_cols,
                                            NumericT * result,
                                            unsigned int result_row_start,
                                            unsigned int result_col_start,
                                            unsigned int result_row_inc,
                                            unsigned int result_col_inc,
                                            unsigned int result_row_size,
                                            unsigned int result_col_size,
                                            unsigned int result_internal_rows,
                                            unsigned int result_internal_cols)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  // one thread per result entry; rc enumerates the (row, col) pairs:
  for (unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_col_size); rc += glb_sz)
  {
    unsigned int row = rc % sp_mat_row_num;
    unsigned int col = rc / sp_mat_row_num;

    unsigned int offset = row;
    NumericT r = 0;

    for (unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num)
    {
      unsigned int j = sp_mat_coords[offset];
      NumericT x = sp_mat_elements[offset];

      if (x != NumericT(0))
      {
        NumericT y = d_mat[ DMatIndexT::apply(j, col,
                                              d_mat_row_start, d_mat_row_inc,
                                              d_mat_col_start, d_mat_col_inc,
                                              d_mat_internal_rows, d_mat_internal_cols) ];
        r += x * y;
      }
    }
    result[ ResultIndexT::apply(row, col,
                                result_row_start, result_row_inc,
                                result_col_start, result_col_inc,
                                result_internal_rows, result_internal_cols) ] = r;
  }
}
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_base<NumericT> & d_mat,
               viennacl::matrix_base<NumericT> & result)
{
  // Layout dispatch; d_mat and result range/stride arguments elided:
  if (d_mat.row_major() && result.row_major())
  {
    ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::row_major>, NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       static_cast<unsigned int>(sp_mat.size1()),
       static_cast<unsigned int>(sp_mat.size2()),
       static_cast<unsigned int>(sp_mat.internal_size1()),
       static_cast<unsigned int>(sp_mat.maxnnz()),
       static_cast<unsigned int>(sp_mat.internal_maxnnz())
       /* , ... d_mat and result arguments ... */);
  }
  else if (d_mat.row_major() && !result.row_major())
  {
    ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::column_major>, NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       static_cast<unsigned int>(sp_mat.size1()),
       static_cast<unsigned int>(sp_mat.size2()),
       static_cast<unsigned int>(sp_mat.internal_size1()),
       static_cast<unsigned int>(sp_mat.maxnnz()),
       static_cast<unsigned int>(sp_mat.internal_maxnnz())
       /* , ... d_mat and result arguments ... */);
  }
  else if (!d_mat.row_major() && result.row_major())
  {
    ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::row_major>, NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       static_cast<unsigned int>(sp_mat.size1()),
       static_cast<unsigned int>(sp_mat.size2()),
       static_cast<unsigned int>(sp_mat.internal_size1()),
       static_cast<unsigned int>(sp_mat.maxnnz()),
       static_cast<unsigned int>(sp_mat.internal_maxnnz())
       /* , ... d_mat and result arguments ... */);
  }
  else
  {
    ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::column_major>, NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       static_cast<unsigned int>(sp_mat.size1()),
       static_cast<unsigned int>(sp_mat.size2()),
       static_cast<unsigned int>(sp_mat.internal_size1()),
       static_cast<unsigned int>(sp_mat.maxnnz()),
       static_cast<unsigned int>(sp_mat.internal_maxnnz())
       /* , ... d_mat and result arguments ... */);
  }
  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
}
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void ell_matrix_d_tr_mat_mul_kernel(const unsigned int * sp_mat_coords,
                                               const NumericT * sp_mat_elements,
                                               unsigned int sp_mat_row_num,
                                               unsigned int sp_mat_col_num,
                                               unsigned int sp_mat_internal_row_num,
                                               unsigned int sp_mat_items_per_row,
                                               unsigned int sp_mat_aligned_items_per_row,
                                               const NumericT * d_mat,
                                               unsigned int d_mat_row_start,
                                               unsigned int d_mat_col_start,
                                               unsigned int d_mat_row_inc,
                                               unsigned int d_mat_col_inc,
                                               unsigned int d_mat_row_size,
                                               unsigned int d_mat_col_size,
                                               unsigned int d_mat_internal_rows,
                                               unsigned int d_mat_internal_cols,
                                               NumericT * result,
                                               unsigned int result_row_start,
                                               unsigned int result_col_start,
                                               unsigned int result_row_inc,
                                               unsigned int result_col_inc,
                                               unsigned int result_row_size,
                                               unsigned int result_col_size,
                                               unsigned int result_internal_rows,
                                               unsigned int result_internal_cols)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  // d_mat is transposed, so its rows correspond to result columns:
  for (unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_row_size); rc += glb_sz)
  {
    unsigned int row = rc % sp_mat_row_num;
    unsigned int col = rc / sp_mat_row_num;

    unsigned int offset = row;
    NumericT r = 0;

    for (unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num)
    {
      unsigned int j = sp_mat_coords[offset];
      NumericT x = sp_mat_elements[offset];

      if (x != NumericT(0))
      {
        NumericT y = d_mat[ DMatIndexT::apply(col, j,
                                              d_mat_row_start, d_mat_row_inc,
                                              d_mat_col_start, d_mat_col_inc,
                                              d_mat_internal_rows, d_mat_internal_cols) ];
        r += x * y;
      }
    }
    result[ ResultIndexT::apply(row, col,
                                result_row_start, result_row_inc,
                                result_col_start, result_col_inc,
                                result_internal_rows, result_internal_cols) ] = r;
  }
}
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(viennacl::ell_matrix<NumericT, AlignmentV> const & sp_mat,
               viennacl::matrix_expression<const viennacl::matrix_base<NumericT>,
                                           const viennacl::matrix_base<NumericT>,
                                           viennacl::op_trans> const & d_mat,
               viennacl::matrix_base<NumericT> & result)
{
  // Layout dispatch; d_mat and result range/stride arguments elided:
  if (d_mat.lhs().row_major() && result.row_major())
  {
    ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::row_major>, NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       static_cast<unsigned int>(sp_mat.size1()),
       static_cast<unsigned int>(sp_mat.size2()),
       static_cast<unsigned int>(sp_mat.internal_size1()),
       static_cast<unsigned int>(sp_mat.maxnnz()),
       static_cast<unsigned int>(sp_mat.internal_maxnnz())
       /* , ... d_mat and result arguments ... */);
  }
  else if (d_mat.lhs().row_major() && !result.row_major())
  {
    ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::column_major>, NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       static_cast<unsigned int>(sp_mat.size1()),
       static_cast<unsigned int>(sp_mat.size2()),
       static_cast<unsigned int>(sp_mat.internal_size1()),
       static_cast<unsigned int>(sp_mat.maxnnz()),
       static_cast<unsigned int>(sp_mat.internal_maxnnz())
       /* , ... d_mat and result arguments ... */);
  }
  else if (!d_mat.lhs().row_major() && result.row_major())
  {
    ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::row_major>, NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       static_cast<unsigned int>(sp_mat.size1()),
       static_cast<unsigned int>(sp_mat.size2()),
       static_cast<unsigned int>(sp_mat.internal_size1()),
       static_cast<unsigned int>(sp_mat.maxnnz()),
       static_cast<unsigned int>(sp_mat.internal_maxnnz())
       /* , ... d_mat and result arguments ... */);
  }
  else
  {
    ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::column_major>, NumericT><<<128, 128>>>
      (viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
       viennacl::cuda_arg<NumericT>(sp_mat.handle()),
       static_cast<unsigned int>(sp_mat.size1()),
       static_cast<unsigned int>(sp_mat.size2()),
       static_cast<unsigned int>(sp_mat.internal_size1()),
       static_cast<unsigned int>(sp_mat.maxnnz()),
       static_cast<unsigned int>(sp_mat.internal_maxnnz())
       /* , ... d_mat and result arguments ... */);
  }
  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
}
template<typename NumericT>
__global__ void sliced_ell_matrix_vec_mul_kernel(const unsigned int * columns_per_block,
                                                 const unsigned int * column_indices,
                                                 const unsigned int * block_start,
                                                 const NumericT * elements,
                                                 const NumericT * x,
                                                 unsigned int start_x,
                                                 unsigned int inc_x,
                                                 unsigned int size_x,
                                                 NumericT * result,
                                                 unsigned int start_result,
                                                 unsigned int inc_result,
                                                 unsigned int size_result,
                                                 unsigned int block_size)
{
  unsigned int blocks_per_threadblock = blockDim.x / block_size;
  unsigned int id_in_block = threadIdx.x % block_size;
  unsigned int num_blocks = (size_result - 1) / block_size + 1;
  unsigned int global_warp_count = blocks_per_threadblock * gridDim.x;
  unsigned int global_warp_id = blocks_per_threadblock * blockIdx.x + threadIdx.x / block_size;

  for (unsigned int block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count)
  {
    unsigned int row         = block_idx * block_size + id_in_block;
    unsigned int offset      = block_start[block_idx];
    unsigned int num_columns = columns_per_block[block_idx];

    NumericT sum = 0;
    for (unsigned int item_id = 0; item_id < num_columns; item_id++)
    {
      unsigned int index = offset + item_id * block_size + id_in_block;
      NumericT val = elements[index];

      sum += val ? (x[column_indices[index] * inc_x + start_x] * val) : 0;
    }

    if (row < size_result)
      result[row * inc_result + start_result] = sum;
  }
}
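
/* Sliced ELL: rows are grouped into blocks of block_size consecutive rows, and
   each block is stored ELL-like with its own width (columns_per_block) starting
   at offset block_start. Padding is therefore paid per block rather than for
   the whole matrix. Illustrative layout for block_size = 2 (hypothetical numbers):

     // columns_per_block = {2, 5}   -> block 0 is 2 items wide, block 1 is 5 wide
     // block_start       = {0, 4}   -> block 0 occupies entries [0, 2*2),
     //                                 block 1 starts at offset 4
*/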
template<typename NumericT, typename IndexT>
void prod_impl(const viennacl::sliced_ell_matrix<NumericT, IndexT> & mat,
               const viennacl::vector_base<NumericT> & vec,
               viennacl::vector_base<NumericT> & result)
{
  sliced_ell_matrix_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
                                                 viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                                 viennacl::cuda_arg<unsigned int>(mat.handle3()),
                                                 viennacl::cuda_arg<NumericT>(mat.handle()),
                                                 viennacl::cuda_arg(vec),
                                                 static_cast<unsigned int>(vec.start()),
                                                 static_cast<unsigned int>(vec.stride()),
                                                 static_cast<unsigned int>(vec.size()),
                                                 viennacl::cuda_arg(result),
                                                 static_cast<unsigned int>(result.start()),
                                                 static_cast<unsigned int>(result.stride()),
                                                 static_cast<unsigned int>(result.size()),
                                                 static_cast<unsigned int>(mat.rows_per_block()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("sliced_ell_matrix_vec_mul_kernel");
}
template<typename NumericT>
__global__ void hyb_matrix_vec_mul_kernel(const unsigned int * ell_coords,
                                          const NumericT * ell_elements,
                                          const unsigned int * csr_rows,
                                          const unsigned int * csr_cols,
                                          const NumericT * csr_elements,
                                          const NumericT * x,
                                          unsigned int start_x,
                                          unsigned int inc_x,
                                          NumericT * result,
                                          unsigned int start_result,
                                          unsigned int inc_result,
                                          unsigned int row_num,
                                          unsigned int internal_row_num,
                                          unsigned int items_per_row,
                                          unsigned int aligned_items_per_row)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
  {
    NumericT sum = 0;

    // ELL part of the row:
    unsigned int offset = row_id;
    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
    {
      NumericT val = ell_elements[offset];

      if (val != NumericT(0))
      {
        int col = ell_coords[offset];
        sum += (x[col * inc_x + start_x] * val);
      }
    }

    // CSR overflow part of the row:
    unsigned int col_begin = csr_rows[row_id];
    unsigned int col_end   = csr_rows[row_id + 1];

    for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
      sum += x[csr_cols[item_id] * inc_x + start_x] * csr_elements[item_id];

    result[row_id * inc_result + start_result] = sum;
  }
}
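
/* HYB = ELL + CSR: the regular part of each row (up to items_per_row entries)
   lives in the ELL arrays for coalesced access, and the overflow of unusually
   long rows is swept into a small CSR structure. Illustrative split with
   items_per_row = 2 (hypothetical numbers):

     // row 0: 2 nonzeros -> entirely in ELL
     // row 1: 5 nonzeros -> first 2 in ELL, remaining 3 in the CSR part
     //        (csr_rows = {0, 0, 3} assigns CSR entries [0,3) to row 1)
*/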
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
               viennacl::vector_base<NumericT> & result)
{
  hyb_matrix_vec_mul_kernel<<<256, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle2()),
                                          viennacl::cuda_arg<NumericT>(mat.handle()),
                                          viennacl::cuda_arg<unsigned int>(mat.handle3()),
                                          viennacl::cuda_arg<unsigned int>(mat.handle4()),
                                          viennacl::cuda_arg<NumericT>(mat.handle5()),
                                          viennacl::cuda_arg(vec),
                                          static_cast<unsigned int>(vec.start()),
                                          static_cast<unsigned int>(vec.stride()),
                                          viennacl::cuda_arg(result),
                                          static_cast<unsigned int>(result.start()),
                                          static_cast<unsigned int>(result.stride()),
                                          static_cast<unsigned int>(mat.size1()),
                                          static_cast<unsigned int>(mat.internal_size1()),
                                          static_cast<unsigned int>(mat.ell_nnz()),
                                          static_cast<unsigned int>(mat.internal_ellnnz()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_vec_mul_kernel");
}
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void hyb_matrix_d_mat_mul_kernel(const unsigned int * ell_coords,
                                            const NumericT * ell_elements,
                                            const unsigned int * csr_rows,
                                            const unsigned int * csr_cols,
                                            const NumericT * csr_elements,
                                            unsigned int row_num,
                                            unsigned int internal_row_num,
                                            unsigned int items_per_row,
                                            unsigned int aligned_items_per_row,
                                            const NumericT * d_mat,
                                            unsigned int d_mat_row_start,
                                            unsigned int d_mat_col_start,
                                            unsigned int d_mat_row_inc,
                                            unsigned int d_mat_col_inc,
                                            unsigned int d_mat_row_size,
                                            unsigned int d_mat_col_size,
                                            unsigned int d_mat_internal_rows,
                                            unsigned int d_mat_internal_cols,
                                            NumericT * result,
                                            unsigned int result_row_start,
                                            unsigned int result_col_start,
                                            unsigned int result_row_inc,
                                            unsigned int result_col_inc,
                                            unsigned int result_row_size,
                                            unsigned int result_col_size,
                                            unsigned int result_internal_rows,
                                            unsigned int result_internal_cols)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
  {
    for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
    {
      NumericT sum = 0;

      // ELL part of the row:
      unsigned int offset = row_id;
      for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
      {
        NumericT val = ell_elements[offset];

        if (val != NumericT(0))
          sum += d_mat[DMatIndexT::apply(ell_coords[offset], result_col,
                                         d_mat_row_start, d_mat_row_inc,
                                         d_mat_col_start, d_mat_col_inc,
                                         d_mat_internal_rows, d_mat_internal_cols)] * val;
      }

      // CSR overflow part of the row:
      unsigned int col_begin = csr_rows[row_id];
      unsigned int col_end   = csr_rows[row_id + 1];

      for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
        sum += d_mat[DMatIndexT::apply(csr_cols[item_id], result_col,
                                       d_mat_row_start, d_mat_row_inc,
                                       d_mat_col_start, d_mat_col_inc,
                                       d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];

      result[ResultIndexT::apply(row_id, result_col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols)] = sum;
    }
  }
}
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
               const viennacl::matrix_base<NumericT> & d_mat,
               viennacl::matrix_base<NumericT> & result)
{
  // Layout dispatch; d_mat and result range/stride arguments elided:
  if (d_mat.row_major() && result.row_major())
  {
    hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::row_major>, NumericT><<<256, 128>>>(
      viennacl::cuda_arg<unsigned int>(mat.handle2()),
      viennacl::cuda_arg<NumericT>(mat.handle()),
      viennacl::cuda_arg<unsigned int>(mat.handle3()),
      viennacl::cuda_arg<unsigned int>(mat.handle4()),
      viennacl::cuda_arg<NumericT>(mat.handle5()),
      static_cast<unsigned int>(mat.size1()),
      static_cast<unsigned int>(mat.internal_size1()),
      static_cast<unsigned int>(mat.ell_nnz()),
      static_cast<unsigned int>(mat.internal_ellnnz())
      /* , ... d_mat and result arguments ... */);
  }
  else if (d_mat.row_major() && !result.row_major())
  {
    hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::column_major>, NumericT><<<256, 128>>>(
      viennacl::cuda_arg<unsigned int>(mat.handle2()),
      viennacl::cuda_arg<NumericT>(mat.handle()),
      viennacl::cuda_arg<unsigned int>(mat.handle3()),
      viennacl::cuda_arg<unsigned int>(mat.handle4()),
      viennacl::cuda_arg<NumericT>(mat.handle5()),
      static_cast<unsigned int>(mat.size1()),
      static_cast<unsigned int>(mat.internal_size1()),
      static_cast<unsigned int>(mat.ell_nnz()),
      static_cast<unsigned int>(mat.internal_ellnnz())
      /* , ... d_mat and result arguments ... */);
  }
  else if (!d_mat.row_major() && result.row_major())
  {
    hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::row_major>, NumericT><<<256, 128>>>(
      viennacl::cuda_arg<unsigned int>(mat.handle2()),
      viennacl::cuda_arg<NumericT>(mat.handle()),
      viennacl::cuda_arg<unsigned int>(mat.handle3()),
      viennacl::cuda_arg<unsigned int>(mat.handle4()),
      viennacl::cuda_arg<NumericT>(mat.handle5()),
      static_cast<unsigned int>(mat.size1()),
      static_cast<unsigned int>(mat.internal_size1()),
      static_cast<unsigned int>(mat.ell_nnz()),
      static_cast<unsigned int>(mat.internal_ellnnz())
      /* , ... d_mat and result arguments ... */);
  }
  else
  {
    hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::column_major>, NumericT><<<256, 128>>>(
      viennacl::cuda_arg<unsigned int>(mat.handle2()),
      viennacl::cuda_arg<NumericT>(mat.handle()),
      viennacl::cuda_arg<unsigned int>(mat.handle3()),
      viennacl::cuda_arg<unsigned int>(mat.handle4()),
      viennacl::cuda_arg<NumericT>(mat.handle5()),
      static_cast<unsigned int>(mat.size1()),
      static_cast<unsigned int>(mat.internal_size1()),
      static_cast<unsigned int>(mat.ell_nnz()),
      static_cast<unsigned int>(mat.internal_ellnnz())
      /* , ... d_mat and result arguments ... */);
  }
  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
}
template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void hyb_matrix_d_tr_mat_mul_kernel(const unsigned int * ell_coords,
                                               const NumericT * ell_elements,
                                               const unsigned int * csr_rows,
                                               const unsigned int * csr_cols,
                                               const NumericT * csr_elements,
                                               unsigned int row_num,
                                               unsigned int internal_row_num,
                                               unsigned int items_per_row,
                                               unsigned int aligned_items_per_row,
                                               const NumericT * d_mat,
                                               unsigned int d_mat_row_start,
                                               unsigned int d_mat_col_start,
                                               unsigned int d_mat_row_inc,
                                               unsigned int d_mat_col_inc,
                                               unsigned int d_mat_row_size,
                                               unsigned int d_mat_col_size,
                                               unsigned int d_mat_internal_rows,
                                               unsigned int d_mat_internal_cols,
                                               NumericT * result,
                                               unsigned int result_row_start,
                                               unsigned int result_col_start,
                                               unsigned int result_row_inc,
                                               unsigned int result_col_inc,
                                               unsigned int result_row_size,
                                               unsigned int result_col_size,
                                               unsigned int result_internal_rows,
                                               unsigned int result_internal_cols)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
  {
    for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
    {
      NumericT sum = 0;

      // ELL part of the row (d_mat is transposed, hence the swapped index pair):
      unsigned int offset = row_id;
      for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
      {
        NumericT val = ell_elements[offset];

        if (val != NumericT(0))
          sum += d_mat[DMatIndexT::apply(result_col, ell_coords[offset],
                                         d_mat_row_start, d_mat_row_inc,
                                         d_mat_col_start, d_mat_col_inc,
                                         d_mat_internal_rows, d_mat_internal_cols)] * val;
      }

      // CSR overflow part of the row:
      unsigned int col_begin = csr_rows[row_id];
      unsigned int col_end   = csr_rows[row_id + 1];

      for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
        sum += d_mat[DMatIndexT::apply(result_col, csr_cols[item_id],
                                       d_mat_row_start, d_mat_row_inc,
                                       d_mat_col_start, d_mat_col_inc,
                                       d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];

      result[ResultIndexT::apply(row_id, result_col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols)] = sum;
    }
  }
}
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
               viennacl::matrix_expression<const viennacl::matrix_base<NumericT>,
                                           const viennacl::matrix_base<NumericT>,
                                           viennacl::op_trans> const & d_mat,
               viennacl::matrix_base<NumericT> & result)
{
  // Layout dispatch; d_mat and result range/stride arguments elided:
  if (d_mat.lhs().row_major() && result.row_major())
  {
    hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::row_major>, NumericT><<<256, 128>>>(
      viennacl::cuda_arg<unsigned int>(mat.handle2()),
      viennacl::cuda_arg<NumericT>(mat.handle()),
      viennacl::cuda_arg<unsigned int>(mat.handle3()),
      viennacl::cuda_arg<unsigned int>(mat.handle4()),
      viennacl::cuda_arg<NumericT>(mat.handle5()),
      static_cast<unsigned int>(mat.size1()),
      static_cast<unsigned int>(mat.internal_size1()),
      static_cast<unsigned int>(mat.ell_nnz()),
      static_cast<unsigned int>(mat.internal_ellnnz())
      /* , ... d_mat and result arguments ... */);
  }
  else if (d_mat.lhs().row_major() && !result.row_major())
  {
    hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::row_major>, mat_mult_matrix_index<viennacl::column_major>, NumericT><<<256, 128>>>(
      viennacl::cuda_arg<unsigned int>(mat.handle2()),
      viennacl::cuda_arg<NumericT>(mat.handle()),
      viennacl::cuda_arg<unsigned int>(mat.handle3()),
      viennacl::cuda_arg<unsigned int>(mat.handle4()),
      viennacl::cuda_arg<NumericT>(mat.handle5()),
      static_cast<unsigned int>(mat.size1()),
      static_cast<unsigned int>(mat.internal_size1()),
      static_cast<unsigned int>(mat.ell_nnz()),
      static_cast<unsigned int>(mat.internal_ellnnz())
      /* , ... d_mat and result arguments ... */);
  }
  else if (!d_mat.lhs().row_major() && result.row_major())
  {
    hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::row_major>, NumericT><<<256, 128>>>(
      viennacl::cuda_arg<unsigned int>(mat.handle2()),
      viennacl::cuda_arg<NumericT>(mat.handle()),
      viennacl::cuda_arg<unsigned int>(mat.handle3()),
      viennacl::cuda_arg<unsigned int>(mat.handle4()),
      viennacl::cuda_arg<NumericT>(mat.handle5()),
      static_cast<unsigned int>(mat.size1()),
      static_cast<unsigned int>(mat.internal_size1()),
      static_cast<unsigned int>(mat.ell_nnz()),
      static_cast<unsigned int>(mat.internal_ellnnz())
      /* , ... d_mat and result arguments ... */);
  }
  else
  {
    hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<viennacl::column_major>, mat_mult_matrix_index<viennacl::column_major>, NumericT><<<256, 128>>>(
      viennacl::cuda_arg<unsigned int>(mat.handle2()),
      viennacl::cuda_arg<NumericT>(mat.handle()),
      viennacl::cuda_arg<unsigned int>(mat.handle3()),
      viennacl::cuda_arg<unsigned int>(mat.handle4()),
      viennacl::cuda_arg<NumericT>(mat.handle5()),
      static_cast<unsigned int>(mat.size1()),
      static_cast<unsigned int>(mat.internal_size1()),
      static_cast<unsigned int>(mat.ell_nnz()),
      static_cast<unsigned int>(mat.internal_ellnnz())
      /* , ... d_mat and result arguments ... */);
  }
  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
}

} // namespace cuda
} // namespace linalg
} // namespace viennacl

#endif