#ifndef VIENNACL_LINALG_HOST_BASED_SPGEMM_VECTOR_HPP_
#define VIENNACL_LINALG_HOST_BASED_SPGEMM_VECTOR_HPP_

#include <algorithm>   // std::min, std::swap

#include "viennacl/forwards.h"
#include "viennacl/linalg/host_based/common.hpp"

#ifdef VIENNACL_WITH_AVX2
#include "immintrin.h"
#endif

namespace viennacl
{
namespace linalg
{
namespace host_based
{
#ifdef VIENNACL_WITH_AVX2
inline
unsigned int row_C_scan_symbolic_vector_AVX2(int const *row_indices_B_begin, int const *row_indices_B_end,
                                             int const *B_row_buffer, int const *B_col_buffer, int B_size2,
                                             int *row_C_vector_output)
{
  __m256i avx_all_ones    = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
  __m256i avx_all_bsize2  = _mm256_set_epi32(B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2);

  __m256i avx_row_indices_offsets = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  __m256i avx_load_mask           = _mm256_sub_epi32(avx_row_indices_offsets, _mm256_set1_epi32(row_indices_B_end - row_indices_B_begin));
  __m256i avx_load_mask2          = avx_load_mask;

  __m256i avx_row_indices = _mm256_set1_epi32(0);
          avx_row_indices = _mm256_mask_i32gather_epi32(avx_row_indices, row_indices_B_begin, avx_row_indices_offsets, avx_load_mask, 4);
          avx_load_mask   = avx_load_mask2; // reload mask (destroyed by gather)
  __m256i avx_row_start   = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer,   avx_row_indices, avx_load_mask, 4);
          avx_load_mask   = avx_load_mask2; // reload mask (destroyed by gather)
  __m256i avx_row_end     = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer+1, avx_row_indices, avx_load_mask, 4);

          avx_load_mask   = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
  __m256i avx_index_front = avx_all_bsize2;
          avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);

  int *output_ptr = row_C_vector_output;

  while (1)
  {
    // get minimum index in the current front:
    __m256i avx_index_min1 = avx_index_front;
    __m256i avx_temp       = _mm256_permutevar8x32_epi32(avx_index_min1, _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4));
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first four elements compared against last four elements

    avx_temp       = _mm256_shuffle_epi32(avx_index_min1, int(78));  // 0b01001110 == 78: swap the 64-bit halves within each 128-bit lane
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp);

    avx_temp       = _mm256_shuffle_epi32(avx_index_min1, int(177)); // 0b10110001 == 177: swap neighboring 32-bit entries
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp);     // now all entries hold the minimum

    int min_index_in_front = ((int*)&avx_index_min1)[0];
    // check for termination:
    if (min_index_in_front == B_size2) // all rows exhausted
      break;

    // write current entry:
    *output_ptr = min_index_in_front;
    ++output_ptr;

    // advance the index front wherever it equals the minimum index:
    avx_load_mask   = _mm256_cmpeq_epi32(avx_index_front, avx_index_min1);
    // first part: set index to B_size2 where the minimum was found:
    avx_temp        = _mm256_and_si256(avx_all_bsize2, avx_load_mask);
    avx_index_front = _mm256_max_epi32(avx_index_front, avx_temp);
    // second part: increment row_start entries where the minimum was found:
    avx_temp        = _mm256_and_si256(avx_all_ones, avx_load_mask);
    avx_row_start   = _mm256_add_epi32(avx_row_start, avx_temp);
    // third part: load new column indices where more entries are available:
    avx_load_mask   = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
    avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);
  }

  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
}
#endif
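
/** @brief Merges up to IndexNum rows from B into the result buffer.
*
* Because the input buffer also needs to be considered, this routine actually works on an index front of length (IndexNum+1).
**/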
template<unsigned int IndexNum>
unsigned int row_C_scan_symbolic_vector_N(unsigned int const *row_indices_B,
                                          unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, unsigned int B_size2,
                                          unsigned int const *row_C_vector_input, unsigned int const *row_C_vector_input_end,
                                          unsigned int *row_C_vector_output)
{
  unsigned int index_front[IndexNum+1];
  unsigned int const *index_front_start[IndexNum+1];
  unsigned int const *index_front_end[IndexNum+1];

  // set up pointers for loading the indices:
  for (unsigned int i=0; i<IndexNum; ++i, ++row_indices_B)
  {
    index_front_start[i] = B_col_buffer + B_row_buffer[*row_indices_B];
    index_front_end[i]   = B_col_buffer + B_row_buffer[*row_indices_B + 1];
  }
  index_front_start[IndexNum] = row_C_vector_input;
  index_front_end[IndexNum]   = row_C_vector_input_end;

  // load the first index of each row (or the termination value B_size2 for empty rows):
  for (unsigned int i=0; i<=IndexNum; ++i)
    index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;

  unsigned int *output_ptr = row_C_vector_output;

  while (1)
  {
    // get minimum index in the current front:
    unsigned int min_index_in_front = B_size2;
    for (unsigned int i=0; i<=IndexNum; ++i)
      min_index_in_front = std::min(min_index_in_front, index_front[i]);

    if (min_index_in_front == B_size2) // all input rows exhausted
      break;

    // advance the index front wherever it equals the minimum index:
    for (unsigned int i=0; i<=IndexNum; ++i)
    {
      if (index_front[i] == min_index_in_front)
      {
        index_front_start[i] += 1;
        index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;
      }
    }

    // write current entry:
    *output_ptr = min_index_in_front;
    ++output_ptr;
  }

  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
}
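
// Output policies for row_C_scan_symbolic_vector_1() below: the symbolic pass sometimes only needs
// the row length, so writes can be compiled away. Reconstructed here from the documented
// apply() signatures; assumed to match the original definitions.
struct spgemm_output_write_enabled  { static void apply(unsigned int *ptr, unsigned int value) { *ptr = value; } };
struct spgemm_output_write_disabled { static void apply(unsigned int *,    unsigned int)       {               } };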
template<typename OutputWriterT>
unsigned int row_C_scan_symbolic_vector_1(unsigned int const *input1_begin, unsigned int const *input1_end,
                                          unsigned int const *input2_begin, unsigned int const *input2_end,
                                          unsigned int termination_index,
                                          unsigned int *output_begin)
{
  unsigned int *output_ptr = output_begin;

  unsigned int val_1 = (input1_begin < input1_end) ? *input1_begin : termination_index;
  unsigned int val_2 = (input2_begin < input2_end) ? *input2_begin : termination_index;
  while (1)
  {
    unsigned int min_index = std::min(val_1, val_2);

    if (min_index == termination_index) // both inputs exhausted
      break;

    if (min_index == val_1)
    {
      ++input1_begin;
      val_1 = (input1_begin < input1_end) ? *input1_begin : termination_index;
    }

    if (min_index == val_2)
    {
      ++input2_begin;
      val_2 = (input2_begin < input2_end) ? *input2_begin : termination_index;
    }

    // write current entry (or discard it, depending on OutputWriterT):
    OutputWriterT::apply(output_ptr, min_index);
    ++output_ptr;
  }

  return static_cast<unsigned int>(output_ptr - output_begin);
}
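
// Symbolic pass for a single row of C = A * B: computes the number of nonzeros in the C row by
// iteratively merging the rows of B selected by the column indices of A in [row_start_A, row_end_A).
// row_C_vector_1/2 serve as ping-pong buffers for intermediate results; row_C_vector_3 holds the
// output of a partial merge before it is combined with the running result.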
inline
unsigned int row_C_scan_symbolic_vector(unsigned int row_start_A, unsigned int row_end_A, unsigned int const *A_col_buffer,
                                        unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, unsigned int B_size2,
                                        unsigned int *row_C_vector_1, unsigned int *row_C_vector_2, unsigned int *row_C_vector_3)
{
  // trivial case: empty row in A:
  if (row_start_A == row_end_A)
    return 0;

  // trivial case: row of length 1 in A, so the C row is a copy of the respective B row:
  if (row_end_A - row_start_A == 1)
  {
    unsigned int A_col = A_col_buffer[row_start_A];
    return B_row_buffer[A_col + 1] - B_row_buffer[A_col];
  }

  unsigned int row_C_len = 0;
  if (row_end_A - row_start_A == 2) // merge the two B rows directly, no write needed:
  {
    unsigned int A_col_1 = A_col_buffer[row_start_A];
    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
    return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
                                                                      B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
                                                                      B_size2,
                                                                      row_C_vector_1);
  }
  else // more than two rows of B to merge: merge the first few into row_C_vector_1 to get started:
  {
#ifdef VIENNACL_WITH_AVX2
    row_C_len = row_C_scan_symbolic_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A),
                                                (const int*)B_row_buffer, (const int*)B_col_buffer, int(B_size2),
                                                (int*)row_C_vector_1);
    row_start_A += 8; // up to eight rows of B are merged per AVX2 pass
#else
    unsigned int A_col_1 = A_col_buffer[row_start_A];
    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
    row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
                                                                          B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
                                                                          B_size2,
                                                                          row_C_vector_1);
    row_start_A += 2;
#endif
  }

  // merge the remaining rows of B into the result:
  while (row_end_A > row_start_A)
  {
#ifdef VIENNACL_WITH_AVX2
    if (row_end_A - row_start_A > 2) // merge eight rows of B at once and combine with the current result:
    {
      unsigned int merged_len = row_C_scan_symbolic_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A),
                                                                (const int*)B_row_buffer, (const int*)B_col_buffer, int(B_size2),
                                                                (int*)row_C_vector_3);
      if (row_start_A + 8 >= row_end_A) // last merge operation, no need to write the indices:
        return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(row_C_vector_3, row_C_vector_3 + merged_len,
                                                                          row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                          B_size2,
                                                                          row_C_vector_2);
      else
        row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(row_C_vector_3, row_C_vector_3 + merged_len,
                                                                              row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                              B_size2,
                                                                              row_C_vector_2);
      row_start_A += 8;
    }
    else
#endif
    if (row_start_A == row_end_A - 1) // last row of B to merge, no need to write the indices:
    {
      unsigned int row_index_B = A_col_buffer[row_start_A];
      return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(B_col_buffer + B_row_buffer[row_index_B], B_col_buffer + B_row_buffer[row_index_B + 1],
                                                                        row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                        B_size2,
                                                                        row_C_vector_2);
    }
    else if (row_start_A + 1 < row_end_A) // at least two more rows of B left, so merge them first:
    {
      unsigned int A_col_1 = A_col_buffer[row_start_A];
      unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
      unsigned int merged_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
                                                                                          B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
                                                                                          B_size2,
                                                                                          row_C_vector_3);
      if (row_start_A + 2 == row_end_A) // last merge operation, no need to write the indices:
        return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(row_C_vector_3, row_C_vector_3 + merged_len,
                                                                          row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                          B_size2,
                                                                          row_C_vector_2);
      else
        row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(row_C_vector_3, row_C_vector_3 + merged_len,
                                                                              row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                              B_size2,
                                                                              row_C_vector_2);
      row_start_A += 2;
    }
    else // merge a single row of B with the current result:
    {
      unsigned int row_index_B = A_col_buffer[row_start_A];
      row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[row_index_B], B_col_buffer + B_row_buffer[row_index_B + 1],
                                                                            row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                            B_size2,
                                                                            row_C_vector_2);
      ++row_start_A;
    }
    std::swap(row_C_vector_1, row_C_vector_2);
  }

  return row_C_len;
}
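
//////////////////////////////

/** @brief Merges up to IndexNum rows from B into the result buffer.
*
* Because the input buffer also needs to be considered, this routine actually works on an index front of length (IndexNum+1).
**/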
template<unsigned int IndexNum, typename NumericT>
unsigned int row_C_scan_numeric_vector_N(unsigned int const *row_indices_B, NumericT const *val_A,
                                         unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, NumericT const *B_elements, unsigned int B_size2,
                                         unsigned int const *row_C_vector_input, unsigned int const *row_C_vector_input_end, NumericT *row_C_vector_input_values,
                                         unsigned int *row_C_vector_output, NumericT *row_C_vector_output_values)
{
  unsigned int index_front[IndexNum+1];
  unsigned int const *index_front_start[IndexNum+1];
  unsigned int const *index_front_end[IndexNum+1];
  NumericT const *value_front_start[IndexNum+1];
  NumericT values_A[IndexNum+1];

  // set up pointers for loading the indices and values:
  for (unsigned int i=0; i<IndexNum; ++i, ++row_indices_B)
  {
    unsigned int row_B = *row_indices_B;

    index_front_start[i] = B_col_buffer + B_row_buffer[row_B];
    index_front_end[i]   = B_col_buffer + B_row_buffer[row_B + 1];
    value_front_start[i] = B_elements   + B_row_buffer[row_B];
    values_A[i]          = val_A[i];
  }
  index_front_start[IndexNum] = row_C_vector_input;
  index_front_end[IndexNum]   = row_C_vector_input_end;
  value_front_start[IndexNum] = row_C_vector_input_values;
  values_A[IndexNum]          = NumericT(1); // the input buffer contributes with factor one

  // load the first index of each row (or the termination value B_size2 for empty rows):
  for (unsigned int i=0; i<=IndexNum; ++i)
    index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;

  unsigned int *output_ptr = row_C_vector_output;

  while (1)
  {
    // get minimum index in the current front:
    unsigned int min_index_in_front = B_size2;
    for (unsigned int i=0; i<=IndexNum; ++i)
      min_index_in_front = std::min(min_index_in_front, index_front[i]);

    if (min_index_in_front == B_size2) // all input rows exhausted
      break;

    // advance the index front wherever it equals the minimum index and accumulate the value:
    NumericT row_C_value = 0;
    for (unsigned int i=0; i<=IndexNum; ++i)
    {
      if (index_front[i] == min_index_in_front)
      {
        index_front_start[i] += 1;
        index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;

        row_C_value += values_A[i] * *value_front_start[i];
        value_front_start[i] += 1;
      }
    }

    // write current entry:
    *output_ptr = min_index_in_front;
    ++output_ptr;
    *row_C_vector_output_values = row_C_value;
    ++row_C_vector_output_values;
  }

  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
}

#ifdef VIENNACL_WITH_AVX2
inline
unsigned int row_C_scan_numeric_vector_AVX2(int const *row_indices_B_begin, int const *row_indices_B_end, double const *values_A,
                                            int const *B_row_buffer, int const *B_col_buffer, double const *B_elements,
                                            int B_size2,
                                            int *row_C_vector_output, double *row_C_vector_output_values)
{
  __m256i avx_all_ones    = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
  __m256i avx_all_bsize2  = _mm256_set_epi32(B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2);

  __m256i avx_row_indices_offsets = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  __m256i avx_load_mask           = _mm256_sub_epi32(avx_row_indices_offsets, _mm256_set1_epi32(row_indices_B_end - row_indices_B_begin));
  __m256i avx_load_mask2          = avx_load_mask;

  __m256i avx_row_indices = _mm256_set1_epi32(0);
          avx_row_indices = _mm256_mask_i32gather_epi32(avx_row_indices, row_indices_B_begin, avx_row_indices_offsets, avx_load_mask, 4);

  // load the values of A for the (up to) eight rows of B to merge.
  // Note: the integer masks are reinterpreted as __m256d for the pd-gathers, which is assumed here via _mm256_castsi256_pd():
          avx_load_mask    = avx_load_mask2; // reload mask (destroyed by gather)
  __m256d avx_value_A_low  = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0),
                                                      values_A,
                                                      _mm256_extractf128_si256(avx_row_indices_offsets, 0),
                                                      _mm256_castsi256_pd(_mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4))), 8);
          avx_load_mask    = avx_load_mask2; // reload mask (destroyed by gather)
  __m256d avx_value_A_high = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0),
                                                      values_A,
                                                      _mm256_extractf128_si256(avx_row_indices_offsets, 1),
                                                      _mm256_castsi256_pd(_mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0))), 8);

          avx_load_mask   = avx_load_mask2; // reload mask (destroyed by gather)
  __m256i avx_row_start   = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer,   avx_row_indices, avx_load_mask, 4);
          avx_load_mask   = avx_load_mask2; // reload mask (destroyed by gather)
  __m256i avx_row_end     = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer+1, avx_row_indices, avx_load_mask, 4);

          avx_load_mask   = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
          avx_load_mask2  = avx_load_mask;
  __m256i avx_index_front = avx_all_bsize2;
          avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);

  // load the first value of each row front from B:
          avx_load_mask        = avx_load_mask2; // reload mask (destroyed by gather)
  __m256d avx_value_front_low  = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0),
                                                          B_elements,
                                                          _mm256_extractf128_si256(avx_row_start, 0),
                                                          _mm256_castsi256_pd(_mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4))), 8);
          avx_load_mask        = avx_load_mask2; // reload mask (destroyed by gather)
  __m256d avx_value_front_high = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0),
                                                          B_elements,
                                                          _mm256_extractf128_si256(avx_row_start, 1),
                                                          _mm256_castsi256_pd(_mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0))), 8);

  int *output_ptr = row_C_vector_output;

  while (1)
  {
    // get minimum index in the current front:
    __m256i avx_index_min1 = avx_index_front;
    __m256i avx_temp       = _mm256_permutevar8x32_epi32(avx_index_min1, _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4));
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first four elements compared against last four elements

    avx_temp       = _mm256_shuffle_epi32(avx_index_min1, int(78));  // 0b01001110 == 78: swap the 64-bit halves within each 128-bit lane
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp);

    avx_temp       = _mm256_shuffle_epi32(avx_index_min1, int(177)); // 0b10110001 == 177: swap neighboring 32-bit entries
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp);     // now all entries hold the minimum

    int min_index_in_front = ((int*)&avx_index_min1)[0];
    // check for termination:
    if (min_index_in_front == B_size2) // all rows exhausted
      break;

    // accumulate the numeric contributions of all fronts currently at the minimum index:
    double value = 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[0]) ? ((double*)&avx_value_front_low)[0]  * ((double*)&avx_value_A_low)[0]  : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[1]) ? ((double*)&avx_value_front_low)[1]  * ((double*)&avx_value_A_low)[1]  : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[2]) ? ((double*)&avx_value_front_low)[2]  * ((double*)&avx_value_A_low)[2]  : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[3]) ? ((double*)&avx_value_front_low)[3]  * ((double*)&avx_value_A_low)[3]  : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[4]) ? ((double*)&avx_value_front_high)[0] * ((double*)&avx_value_A_high)[0] : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[5]) ? ((double*)&avx_value_front_high)[1] * ((double*)&avx_value_A_high)[1] : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[6]) ? ((double*)&avx_value_front_high)[2] * ((double*)&avx_value_A_high)[2] : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[7]) ? ((double*)&avx_value_front_high)[3] * ((double*)&avx_value_A_high)[3] : 0;
    *row_C_vector_output_values = value;
    ++row_C_vector_output_values;

    // write current entry:
    *output_ptr = min_index_in_front;
    ++output_ptr;
    // advance the index front wherever it equals the minimum index:
    avx_load_mask   = _mm256_cmpeq_epi32(avx_index_front, avx_index_min1);
    // first part: set index to B_size2 where the minimum was found:
    avx_temp        = _mm256_and_si256(avx_all_bsize2, avx_load_mask);
    avx_index_front = _mm256_max_epi32(avx_index_front, avx_temp);
    // second part: increment row_start entries where the minimum was found:
    avx_temp        = _mm256_and_si256(avx_all_ones, avx_load_mask);
    avx_row_start   = _mm256_add_epi32(avx_row_start, avx_temp);
    // third part: load new column indices and values where more entries are available:
    avx_load_mask   = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
    avx_load_mask2  = avx_load_mask;
    avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);

    avx_load_mask        = avx_load_mask2; // reload mask (destroyed by gather)
    avx_value_front_low  = _mm256_mask_i32gather_pd(avx_value_front_low,
                                                    B_elements,
                                                    _mm256_extractf128_si256(avx_row_start, 0),
                                                    _mm256_castsi256_pd(_mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4))), 8);
    avx_load_mask        = avx_load_mask2; // reload mask (destroyed by gather)
    avx_value_front_high = _mm256_mask_i32gather_pd(avx_value_front_high,
                                                    B_elements,
                                                    _mm256_extractf128_si256(avx_row_start, 1),
                                                    _mm256_castsi256_pd(_mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0))), 8);
  }

  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
}
#endif
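
// Scalar merge of two sorted (index, value) streams: contributions from the first stream are scaled
// by factor1, those from the second by factor2. This is the numeric counterpart of
// row_C_scan_symbolic_vector_1() above.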
template<typename NumericT>
unsigned int row_C_scan_numeric_vector_1(unsigned int const *input1_index_begin, unsigned int const *input1_index_end, NumericT const *input1_values_begin, NumericT factor1,
                                         unsigned int const *input2_index_begin, unsigned int const *input2_index_end, NumericT const *input2_values_begin, NumericT factor2,
                                         unsigned int termination_index,
                                         unsigned int *output_index_begin, NumericT *output_values_begin)
{
  unsigned int *output_ptr = output_index_begin;

  unsigned int index1 = (input1_index_begin < input1_index_end) ? *input1_index_begin : termination_index;
  unsigned int index2 = (input2_index_begin < input2_index_end) ? *input2_index_begin : termination_index;

  while (1)
  {
    unsigned int min_index = std::min(index1, index2);
    NumericT value = 0;

    if (min_index == termination_index) // both inputs exhausted
      break;

    if (min_index == index1)
    {
      ++input1_index_begin;
      index1 = (input1_index_begin < input1_index_end) ? *input1_index_begin : termination_index;

      value += factor1 * *input1_values_begin;
      ++input1_values_begin;
    }

    if (min_index == index2)
    {
      ++input2_index_begin;
      index2 = (input2_index_begin < input2_index_end) ? *input2_index_begin : termination_index;

      value += factor2 * *input2_values_begin;
      ++input2_values_begin;
    }

    // write current entry:
    *output_ptr = min_index;
    ++output_ptr;
    *output_values_begin = value;
    ++output_values_begin;
  }

  return static_cast<unsigned int>(output_ptr - output_index_begin);
}
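
// Numeric pass for a single row of C = A * B: merges the rows of B selected by A's column indices,
// accumulating the values, and writes the final result to C_col_buffer/C_elements at offset row_start_C.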
template<typename NumericT>
void row_C_scan_numeric_vector(unsigned int row_start_A, unsigned int row_end_A, unsigned int const *A_col_buffer, NumericT const *A_elements,
                               unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, NumericT const *B_elements, unsigned int B_size2,
                               unsigned int row_start_C, unsigned int row_end_C, unsigned int *C_col_buffer, NumericT *C_elements,
                               unsigned int *row_C_vector_1, NumericT *row_C_vector_1_values,
                               unsigned int *row_C_vector_2, NumericT *row_C_vector_2_values,
                               unsigned int *row_C_vector_3, NumericT *row_C_vector_3_values)
{
  (void)row_end_C; // unused; kept for interface symmetry with the symbolic pass
  // trivial case: empty row in A:
  if (row_start_A == row_end_A)
    return;

  // trivial case: row of length 1 in A, so the C row is a scaled copy of the respective B row:
  if (row_end_A - row_start_A == 1)
  {
    unsigned int A_col = A_col_buffer[row_start_A];
    unsigned int B_end = B_row_buffer[A_col + 1];
    NumericT A_value   = A_elements[row_start_A];
    C_col_buffer += row_start_C;
    C_elements   += row_start_C;
    for (unsigned int j = B_row_buffer[A_col]; j < B_end; ++j, ++C_col_buffer, ++C_elements)
    {
      *C_col_buffer = B_col_buffer[j];
      *C_elements   = A_value * B_elements[j];
    }
    return;
  }
  unsigned int row_C_len = 0;
  if (row_end_A - row_start_A == 2) // merge the two B rows directly into C:
  {
    unsigned int A_col_1 = A_col_buffer[row_start_A];
    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];

    unsigned int B_offset_1 = B_row_buffer[A_col_1];
    unsigned int B_offset_2 = B_row_buffer[A_col_2];

    row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
                                B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
                                B_size2,
                                C_col_buffer + row_start_C, C_elements + row_start_C);
    return;
  }
#ifdef VIENNACL_WITH_AVX2
  else if (row_end_A - row_start_A > 10) // merge eight rows of B at once into a temporary buffer:
  {
    row_C_len = row_C_scan_numeric_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A), A_elements + row_start_A,
                                               (const int*)B_row_buffer, (const int*)B_col_buffer, B_elements,
                                               int(B_size2),
                                               (int*)row_C_vector_1, row_C_vector_1_values);
    row_start_A += 8;
  }
#endif
  else // merge the first two rows of B into a temporary buffer:
  {
    unsigned int A_col_1 = A_col_buffer[row_start_A];
    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];

    unsigned int B_offset_1 = B_row_buffer[A_col_1];
    unsigned int B_offset_2 = B_row_buffer[A_col_2];

    row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
                                            B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
                                            B_size2,
                                            row_C_vector_1, row_C_vector_1_values);
    row_start_A += 2;
  }
  // merge the remaining rows of B into the result, ping-ponging between the temporary buffers:
  while (row_end_A > row_start_A)
  {
#ifdef VIENNACL_WITH_AVX2
    if (row_end_A - row_start_A > 9) // merge eight more rows of B at once and combine with the current result:
    {
      unsigned int merged_len = row_C_scan_numeric_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A), A_elements + row_start_A,
                                                               (const int*)B_row_buffer, (const int*)B_col_buffer, B_elements,
                                                               int(B_size2),
                                                               (int*)row_C_vector_3, row_C_vector_3_values);
      row_C_len = row_C_scan_numeric_vector_1(row_C_vector_3, row_C_vector_3 + merged_len, row_C_vector_3_values, NumericT(1.0),
                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
                                              B_size2,
                                              row_C_vector_2, row_C_vector_2_values);
      row_start_A += 8;
    }
    else
#endif
    if (row_start_A + 1 == row_end_A) // last row of B to merge, write directly to C:
    {
      unsigned int A_col    = A_col_buffer[row_start_A];
      unsigned int B_offset = B_row_buffer[A_col];

      row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset, B_col_buffer + B_row_buffer[A_col+1], B_elements + B_offset, A_elements[row_start_A],
                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
                                              B_size2,
                                              C_col_buffer + row_start_C, C_elements + row_start_C);
      return;
    }
    else if (row_start_A + 2 < row_end_A) // at least three more rows of B left, so merge two of them first:
    {
      unsigned int A_col_1 = A_col_buffer[row_start_A];
      unsigned int A_col_2 = A_col_buffer[row_start_A + 1];

      unsigned int B_offset_1 = B_row_buffer[A_col_1];
      unsigned int B_offset_2 = B_row_buffer[A_col_2];

      unsigned int merged_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
                                                            B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
                                                            B_size2,
                                                            row_C_vector_3, row_C_vector_3_values);
      row_C_len = row_C_scan_numeric_vector_1(row_C_vector_3, row_C_vector_3 + merged_len, row_C_vector_3_values, NumericT(1.0),
                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
                                              B_size2,
                                              row_C_vector_2, row_C_vector_2_values);
      row_start_A += 2;
    }
    else // merge a single row of B with the current result:
    {
      unsigned int A_col    = A_col_buffer[row_start_A];
      unsigned int B_offset = B_row_buffer[A_col];

      row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset, B_col_buffer + B_row_buffer[A_col+1], B_elements + B_offset, A_elements[row_start_A],
                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
                                              B_size2,
                                              row_C_vector_2, row_C_vector_2_values);
      ++row_start_A;
    }
    std::swap(row_C_vector_1,        row_C_vector_2);
    std::swap(row_C_vector_1_values, row_C_vector_2_values);
  }
}


} // namespace host_based
} // namespace linalg
} // namespace viennacl

#endif