ViennaCL - The Vienna Computing Library  1.7.0
Free open-source GPU-accelerated linear algebra and solver library.
spgemm_vector.hpp
#ifndef VIENNACL_LINALG_HOST_BASED_SPGEMM_VECTOR_HPP_
#define VIENNACL_LINALG_HOST_BASED_SPGEMM_VECTOR_HPP_

/* =========================================================================
   Copyright (c) 2010-2015, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

#include "viennacl/forwards.h"
#include <algorithm> // std::min and std::swap are used below


#ifdef VIENNACL_WITH_AVX2
#include "immintrin.h"
#endif


namespace viennacl
{
namespace linalg
{
namespace host_based
{


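/** @brief Merges up to eight rows of B into the output buffer using AVX2 gather instructions (symbolic stage, column indices only).
  *
  * An eight-wide 'index front' holds the current column index of each selected row of B; in each step the minimum
  * of the front is written once and all lanes carrying that minimum are advanced. Exhausted lanes are parked at
  * B_size2, which doubles as the termination sentinel.
  */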
#ifdef VIENNACL_WITH_AVX2
inline
unsigned int row_C_scan_symbolic_vector_AVX2(int const *row_indices_B_begin, int const *row_indices_B_end,
                                             int const *B_row_buffer, int const *B_col_buffer, int B_size2,
                                             int *row_C_vector_output)
{
  __m256i avx_all_ones   = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
  __m256i avx_all_bsize2 = _mm256_set_epi32(B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2);

  __m256i avx_row_indices_offsets = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  __m256i avx_load_mask  = _mm256_sub_epi32(avx_row_indices_offsets, _mm256_set1_epi32(row_indices_B_end - row_indices_B_begin));
  __m256i avx_load_mask2 = avx_load_mask;

  __m256i avx_row_indices = _mm256_set1_epi32(0);
  avx_row_indices = _mm256_mask_i32gather_epi32(avx_row_indices, row_indices_B_begin, avx_row_indices_offsets, avx_load_mask, 4);
  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256i avx_row_start = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer, avx_row_indices, avx_load_mask, 4);
  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256i avx_row_end   = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer+1, avx_row_indices, avx_load_mask, 4);

  avx_load_mask = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
  __m256i avx_index_front = avx_all_bsize2;
  avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);

  int *output_ptr = row_C_vector_output;

  while (1)
  {
    // get minimum index in current front:
    __m256i avx_index_min1 = avx_index_front;
    __m256i avx_temp = _mm256_permutevar8x32_epi32(avx_index_min1, _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4));
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first four elements compared against last four elements

    avx_temp = _mm256_shuffle_epi32(avx_index_min1, int(78)); // 0b01001110 = 78, using shuffle instead of permutevar here because of lower latency
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first two elements compared against elements three and four (same for upper half of register)

    avx_temp = _mm256_shuffle_epi32(avx_index_min1, int(177)); // 0b10110001 = 177, using shuffle instead of permutevar here because of lower latency
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // now all entries of avx_index_min1 hold the minimum

    int min_index_in_front = ((int*)&avx_index_min1)[0];
    // check for end of merge operation:
    if (min_index_in_front == B_size2)
      break;

    // write current entry:
    *output_ptr = min_index_in_front;
    ++output_ptr;

    // advance index front where equal to minimum index:
    avx_load_mask = _mm256_cmpeq_epi32(avx_index_front, avx_index_min1);
    // first part: set index to B_size2 if equal to minimum index:
    avx_temp = _mm256_and_si256(avx_all_bsize2, avx_load_mask);
    avx_index_front = _mm256_max_epi32(avx_index_front, avx_temp);
    // second part: increment row_start registers where minimum found:
    avx_temp = _mm256_and_si256(avx_all_ones, avx_load_mask); // ones only where the minimum was found
    avx_row_start = _mm256_add_epi32(avx_row_start, avx_temp);
    // third part: load new data where more entries available:
    avx_load_mask = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
    avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);
  }

  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
}
#endif

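/** @brief Merges up to IndexNum rows from B into the result buffer.
  *
  * The current intermediate result in row_C_vector_input enters the merge as one additional input list;
  * B_size2 serves as the termination sentinel for exhausted lists.
  */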
template<unsigned int IndexNum>
unsigned int row_C_scan_symbolic_vector_N(unsigned int const *row_indices_B,
                                          unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, unsigned int B_size2,
                                          unsigned int const *row_C_vector_input, unsigned int const *row_C_vector_input_end,
                                          unsigned int *row_C_vector_output)
{
  unsigned int index_front[IndexNum+1];
  unsigned int const *index_front_start[IndexNum+1];
  unsigned int const *index_front_end[IndexNum+1];

  // Set up pointers for loading the indices:
  for (unsigned int i=0; i<IndexNum; ++i, ++row_indices_B)
  {
    index_front_start[i] = B_col_buffer + B_row_buffer[*row_indices_B];
    index_front_end[i]   = B_col_buffer + B_row_buffer[*row_indices_B + 1];
  }
  index_front_start[IndexNum] = row_C_vector_input;
  index_front_end[IndexNum]   = row_C_vector_input_end;

  // load indices:
  for (unsigned int i=0; i<=IndexNum; ++i)
    index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;

  unsigned int *output_ptr = row_C_vector_output;

  while (1)
  {
    // get minimum index in current front:
    unsigned int min_index_in_front = B_size2;
    for (unsigned int i=0; i<=IndexNum; ++i)
      min_index_in_front = std::min(min_index_in_front, index_front[i]);

    if (min_index_in_front == B_size2) // we're done
      break;

    // advance index front where equal to minimum index:
    for (unsigned int i=0; i<=IndexNum; ++i)
    {
      if (index_front[i] == min_index_in_front)
      {
        index_front_start[i] += 1;
        index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;
      }
    }

    // write current entry:
    *output_ptr = min_index_in_front;
    ++output_ptr;
  }

  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
}

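/** @brief Output policies for the two-way merge: the symbolic (counting) passes use the 'disabled' writer whenever only the length of the merged row is needed. */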
struct spgemm_output_write_enabled  { static void apply(unsigned int *ptr, unsigned int value) { *ptr = value; } };
struct spgemm_output_write_disabled { static void apply(unsigned int *, unsigned int) { } };

template<typename OutputWriterT>
unsigned int row_C_scan_symbolic_vector_1(unsigned int const *input1_begin, unsigned int const *input1_end,
                                          unsigned int const *input2_begin, unsigned int const *input2_end,
                                          unsigned int termination_index,
                                          unsigned int *output_begin)
{
  unsigned int *output_ptr = output_begin;

  unsigned int val_1 = (input1_begin < input1_end) ? *input1_begin : termination_index;
  unsigned int val_2 = (input2_begin < input2_end) ? *input2_begin : termination_index;
  while (1)
  {
    unsigned int min_index = std::min(val_1, val_2);

    if (min_index == termination_index)
      break;

    if (min_index == val_1)
    {
      ++input1_begin;
      val_1 = (input1_begin < input1_end) ? *input1_begin : termination_index;
    }

    if (min_index == val_2)
    {
      ++input2_begin;
      val_2 = (input2_begin < input2_end) ? *input2_begin : termination_index;
    }

    // write current entry:
    OutputWriterT::apply(output_ptr, min_index); // *output_ptr = min_index; if necessary
    ++output_ptr;
  }

  return static_cast<unsigned int>(output_ptr - output_begin);
}

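// Illustrative example (added annotation, not part of the original code): merging the sorted column-index
// lists {0, 2, 5} and {2, 3} with termination_index = 6 visits the minima 0, 2, 3, 5 once each, so
// row_C_scan_symbolic_vector_1() returns 4; with spgemm_output_write_enabled the output buffer receives {0, 2, 3, 5}.

/** @brief Computes the length (number of nonzeros) of one row of C = A * B by merging the column index patterns
  *        of the rows of B selected by the respective row of A, ping-ponging between the temporary buffers
  *        row_C_vector_1 and row_C_vector_2.
  */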
inline
unsigned int row_C_scan_symbolic_vector(unsigned int row_start_A, unsigned int row_end_A, unsigned int const *A_col_buffer,
                                        unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, unsigned int B_size2,
                                        unsigned int *row_C_vector_1, unsigned int *row_C_vector_2, unsigned int *row_C_vector_3)
{
  // Trivial case: row length 0:
  if (row_start_A == row_end_A)
    return 0;

  // Trivial case: row length 1:
  if (row_end_A - row_start_A == 1)
  {
    unsigned int A_col = A_col_buffer[row_start_A];
    return B_row_buffer[A_col + 1] - B_row_buffer[A_col];
  }

  // Optimizations for row length 2:
  unsigned int row_C_len = 0;
  if (row_end_A - row_start_A == 2)
  {
    unsigned int A_col_1 = A_col_buffer[row_start_A];
    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
    return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
                                                                      B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
                                                                      B_size2,
                                                                      row_C_vector_1);
  }
  else // for more than two rows we can safely merge the first two:
  {
#ifdef VIENNACL_WITH_AVX2
    row_C_len = row_C_scan_symbolic_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A),
                                                (const int*)B_row_buffer, (const int*)B_col_buffer, int(B_size2),
                                                (int*)row_C_vector_1);
    row_start_A += 8;
#else
    unsigned int A_col_1 = A_col_buffer[row_start_A];
    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
    row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
                                                                          B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
                                                                          B_size2,
                                                                          row_C_vector_1);
    row_start_A += 2;
#endif
  }

  // all other row lengths:
  while (row_end_A > row_start_A)
  {
#ifdef VIENNACL_WITH_AVX2
    if (row_end_A - row_start_A > 2) // we deal with one or two remaining rows more efficiently below:
    {
      unsigned int merged_len = row_C_scan_symbolic_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A),
                                                                (const int*)B_row_buffer, (const int*)B_col_buffer, int(B_size2),
                                                                (int*)row_C_vector_3);
      if (row_start_A + 8 >= row_end_A)
        row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(row_C_vector_3, row_C_vector_3 + merged_len,
                                                                               row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                               B_size2,
                                                                               row_C_vector_2);
      else
        row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(row_C_vector_3, row_C_vector_3 + merged_len,
                                                                              row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                              B_size2,
                                                                              row_C_vector_2);
      row_start_A += 8;
    }
    else
#endif
    if (row_start_A == row_end_A - 1) // last merge operation. No need to write output
    {
      // process last row
      unsigned int row_index_B = A_col_buffer[row_start_A];
      return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(B_col_buffer + B_row_buffer[row_index_B], B_col_buffer + B_row_buffer[row_index_B + 1],
                                                                        row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                        B_size2,
                                                                        row_C_vector_2);
    }
    else if (row_start_A + 1 < row_end_A) // at least two more rows left, so merge them
    {
      // merge the next two rows of B:
      unsigned int A_col_1 = A_col_buffer[row_start_A];
      unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
      unsigned int merged_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
                                                                                          B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
                                                                                          B_size2,
                                                                                          row_C_vector_3);
      if (row_start_A + 2 == row_end_A) // last merge does not need a write:
        return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(row_C_vector_3, row_C_vector_3 + merged_len,
                                                                          row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                          B_size2,
                                                                          row_C_vector_2);
      else
        row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(row_C_vector_3, row_C_vector_3 + merged_len,
                                                                              row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                              B_size2,
                                                                              row_C_vector_2);
      row_start_A += 2;
    }
    else // merge a single row of B into the intermediate result
    {
      // process single row:
      unsigned int row_index_B = A_col_buffer[row_start_A];
      row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[row_index_B], B_col_buffer + B_row_buffer[row_index_B + 1],
                                                                            row_C_vector_1, row_C_vector_1 + row_C_len,
                                                                            B_size2,
                                                                            row_C_vector_2);
      ++row_start_A;
    }

    std::swap(row_C_vector_1, row_C_vector_2);
  }

  return row_C_len;
}

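/** @brief Merges up to IndexNum rows from B into the result buffer (numeric stage).
  *
  * In addition to the column indices, values are accumulated: each merged row of B is scaled by its
  * corresponding entry of A (val_A), while the current intermediate result enters with weight 1.
  */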
template<unsigned int IndexNum, typename NumericT>
unsigned int row_C_scan_numeric_vector_N(unsigned int const *row_indices_B, NumericT const *val_A,
                                         unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, NumericT const *B_elements, unsigned int B_size2,
                                         unsigned int const *row_C_vector_input, unsigned int const *row_C_vector_input_end, NumericT *row_C_vector_input_values,
                                         unsigned int *row_C_vector_output, NumericT *row_C_vector_output_values)
{
  unsigned int index_front[IndexNum+1];
  unsigned int const *index_front_start[IndexNum+1];
  unsigned int const *index_front_end[IndexNum+1];
  NumericT const *value_front_start[IndexNum+1];
  NumericT values_A[IndexNum+1];

  // Set up pointers for loading the indices:
  for (unsigned int i=0; i<IndexNum; ++i, ++row_indices_B)
  {
    unsigned int row_B = *row_indices_B;

    index_front_start[i] = B_col_buffer + B_row_buffer[row_B];
    index_front_end[i]   = B_col_buffer + B_row_buffer[row_B + 1];
    value_front_start[i] = B_elements + B_row_buffer[row_B];
    values_A[i] = val_A[i];
  }
  index_front_start[IndexNum] = row_C_vector_input;
  index_front_end[IndexNum]   = row_C_vector_input_end;
  value_front_start[IndexNum] = row_C_vector_input_values;
  values_A[IndexNum] = NumericT(1);

  // load indices:
  for (unsigned int i=0; i<=IndexNum; ++i)
    index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;

  unsigned int *output_ptr = row_C_vector_output;

  while (1)
  {
    // get minimum index in current front:
    unsigned int min_index_in_front = B_size2;
    for (unsigned int i=0; i<=IndexNum; ++i)
      min_index_in_front = std::min(min_index_in_front, index_front[i]);

    if (min_index_in_front == B_size2) // we're done
      break;

    // advance index front where equal to minimum index:
    NumericT row_C_value = 0;
    for (unsigned int i=0; i<=IndexNum; ++i)
    {
      if (index_front[i] == min_index_in_front)
      {
        index_front_start[i] += 1;
        index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;

        row_C_value += values_A[i] * *value_front_start[i];
        value_front_start[i] += 1;
      }
    }

    // write current entry:
    *output_ptr = min_index_in_front;
    ++output_ptr;
    *row_C_vector_output_values = row_C_value;
    ++row_C_vector_output_values;
  }

  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
}


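/** @brief Merges up to eight rows of B into the output buffers using AVX2 gather instructions (numeric stage, double precision).
  *
  * Follows the same front-based merge as the symbolic AVX2 routine, but additionally gathers the values of A and B
  * and accumulates the products for all lanes that currently hold the minimum column index.
  */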
#ifdef VIENNACL_WITH_AVX2
inline
unsigned int row_C_scan_numeric_vector_AVX2(int const *row_indices_B_begin, int const *row_indices_B_end, double const *values_A,
                                            int const *B_row_buffer, int const *B_col_buffer, double const *B_elements,
                                            int B_size2,
                                            int *row_C_vector_output, double *row_C_vector_output_values)
{
  __m256i avx_all_ones   = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
  __m256i avx_all_bsize2 = _mm256_set_epi32(B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2);

  __m256i avx_row_indices_offsets = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  __m256i avx_load_mask  = _mm256_sub_epi32(avx_row_indices_offsets, _mm256_set1_epi32(row_indices_B_end - row_indices_B_begin));
  __m256i avx_load_mask2 = avx_load_mask;

  __m256i avx_row_indices = _mm256_set1_epi32(0);
  avx_row_indices = _mm256_mask_i32gather_epi32(avx_row_indices, row_indices_B_begin, avx_row_indices_offsets, avx_load_mask, 4);

  // load values from A:
  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256d avx_value_A_low  = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), // src
                                                      values_A, // base ptr
                                                      _mm256_extractf128_si256(avx_row_indices_offsets, 0), // indices
                                                      _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4)), 8); // mask
  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256d avx_value_A_high = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), // src
                                                      values_A, // base ptr
                                                      _mm256_extractf128_si256(avx_row_indices_offsets, 1), // indices
                                                      _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)), 8); // mask


  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256i avx_row_start = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer, avx_row_indices, avx_load_mask, 4);
  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256i avx_row_end   = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer+1, avx_row_indices, avx_load_mask, 4);

  avx_load_mask  = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
  avx_load_mask2 = avx_load_mask;
  __m256i avx_index_front = avx_all_bsize2;
  avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);

  // load front values from B:
  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256d avx_value_front_low  = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), // src
                                                          B_elements, // base ptr
                                                          _mm256_extractf128_si256(avx_row_start, 0), // indices
                                                          _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4)), 8); // mask
  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
  __m256d avx_value_front_high = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), // src
                                                          B_elements, // base ptr
                                                          _mm256_extractf128_si256(avx_row_start, 1), // indices
                                                          _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)), 8); // mask

  int *output_ptr = row_C_vector_output;

  while (1)
  {
    // get minimum index in current front:
    __m256i avx_index_min1 = avx_index_front;
    __m256i avx_temp = _mm256_permutevar8x32_epi32(avx_index_min1, _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4));
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first four elements compared against last four elements

    avx_temp = _mm256_shuffle_epi32(avx_index_min1, int(78)); // 0b01001110 = 78, using shuffle instead of permutevar here because of lower latency
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first two elements compared against elements three and four (same for upper half of register)

    avx_temp = _mm256_shuffle_epi32(avx_index_min1, int(177)); // 0b10110001 = 177, using shuffle instead of permutevar here because of lower latency
    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // now all entries of avx_index_min1 hold the minimum

    int min_index_in_front = ((int*)&avx_index_min1)[0];
    // check for end of merge operation:
    if (min_index_in_front == B_size2)
      break;

    // accumulate value (can certainly be done more elegantly...)
    double value = 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[0]) ? ((double*)&avx_value_front_low)[0]  * ((double*)&avx_value_A_low)[0]  : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[1]) ? ((double*)&avx_value_front_low)[1]  * ((double*)&avx_value_A_low)[1]  : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[2]) ? ((double*)&avx_value_front_low)[2]  * ((double*)&avx_value_A_low)[2]  : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[3]) ? ((double*)&avx_value_front_low)[3]  * ((double*)&avx_value_A_low)[3]  : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[4]) ? ((double*)&avx_value_front_high)[0] * ((double*)&avx_value_A_high)[0] : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[5]) ? ((double*)&avx_value_front_high)[1] * ((double*)&avx_value_A_high)[1] : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[6]) ? ((double*)&avx_value_front_high)[2] * ((double*)&avx_value_A_high)[2] : 0;
    value += (min_index_in_front == ((int*)&avx_index_front)[7]) ? ((double*)&avx_value_front_high)[3] * ((double*)&avx_value_A_high)[3] : 0;
    *row_C_vector_output_values = value;
    ++row_C_vector_output_values;

    // write current entry:
    *output_ptr = min_index_in_front;
    ++output_ptr;

    // advance index front where equal to minimum index:
    avx_load_mask = _mm256_cmpeq_epi32(avx_index_front, avx_index_min1);
    // first part: set index to B_size2 if equal to minimum index:
    avx_temp = _mm256_and_si256(avx_all_bsize2, avx_load_mask);
    avx_index_front = _mm256_max_epi32(avx_index_front, avx_temp);
    // second part: increment row_start registers where minimum found:
    avx_temp = _mm256_and_si256(avx_all_ones, avx_load_mask); // ones only where the minimum was found
    avx_row_start = _mm256_add_epi32(avx_row_start, avx_temp);
    // third part: load new data where more entries available:
    avx_load_mask  = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
    avx_load_mask2 = avx_load_mask;
    avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);

    // load new values where necessary:
    avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
    avx_value_front_low  = _mm256_mask_i32gather_pd(avx_value_front_low, // src
                                                    B_elements, // base ptr
                                                    _mm256_extractf128_si256(avx_row_start, 0), // indices
                                                    _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4)), 8); // mask

    avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
    avx_value_front_high = _mm256_mask_i32gather_pd(avx_value_front_high, // src
                                                    B_elements, // base ptr
                                                    _mm256_extractf128_si256(avx_row_start, 1), // indices
                                                    _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)), 8); // mask

    // (multiplication by the corresponding values of A happens in the accumulation step at the top of the loop)

  }

  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
}
#endif


template<typename NumericT>
unsigned int row_C_scan_numeric_vector_1(unsigned int const *input1_index_begin, unsigned int const *input1_index_end, NumericT const *input1_values_begin, NumericT factor1,
                                         unsigned int const *input2_index_begin, unsigned int const *input2_index_end, NumericT const *input2_values_begin, NumericT factor2,
                                         unsigned int termination_index,
                                         unsigned int *output_index_begin, NumericT *output_values_begin)
{
  unsigned int *output_ptr = output_index_begin;

  unsigned int index1 = (input1_index_begin < input1_index_end) ? *input1_index_begin : termination_index;
  unsigned int index2 = (input2_index_begin < input2_index_end) ? *input2_index_begin : termination_index;

  while (1)
  {
    unsigned int min_index = std::min(index1, index2);
    NumericT value = 0;

    if (min_index == termination_index)
      break;

    if (min_index == index1)
    {
      ++input1_index_begin;
      index1 = (input1_index_begin < input1_index_end) ? *input1_index_begin : termination_index;

      value += factor1 * *input1_values_begin;
      ++input1_values_begin;
    }

    if (min_index == index2)
    {
      ++input2_index_begin;
      index2 = (input2_index_begin < input2_index_end) ? *input2_index_begin : termination_index;

      value += factor2 * *input2_values_begin;
      ++input2_values_begin;
    }

    // write current entry:
    *output_ptr = min_index;
    ++output_ptr;
    *output_values_begin = value;
    ++output_values_begin;
  }

  return static_cast<unsigned int>(output_ptr - output_index_begin);
}

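// Illustrative example (added annotation, not part of the original code): merging index lists {0, 2} with values
// {1.0, 2.0} and factor1 = 3 against {2, 5} with values {4.0, 6.0} and factor2 = 1 (termination_index = 6) yields
// indices {0, 2, 5} with values {3.0, 10.0, 6.0} and returns 3; the entry at column 2 accumulates 3*2.0 + 1*4.0.

/** @brief Computes one row of C = A * B (column indices and values) by merging the rows of B selected by the
  *        respective row of A, each scaled by the corresponding entry of A. Intermediate results ping-pong between
  *        the temporary buffers; the final merge is written directly to the C arrays starting at row_start_C.
  */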
template<typename NumericT>
void row_C_scan_numeric_vector(unsigned int row_start_A, unsigned int row_end_A, unsigned int const *A_col_buffer, NumericT const *A_elements,
                               unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, NumericT const *B_elements, unsigned int B_size2,
                               unsigned int row_start_C, unsigned int row_end_C, unsigned int *C_col_buffer, NumericT *C_elements,
                               unsigned int *row_C_vector_1, NumericT *row_C_vector_1_values,
                               unsigned int *row_C_vector_2, NumericT *row_C_vector_2_values,
                               unsigned int *row_C_vector_3, NumericT *row_C_vector_3_values)
{
  (void)row_end_C;

  // Trivial case: row length 0:
  if (row_start_A == row_end_A)
    return;

  // Trivial case: row length 1:
  if (row_end_A - row_start_A == 1)
  {
    unsigned int A_col = A_col_buffer[row_start_A];
    unsigned int B_end = B_row_buffer[A_col + 1];
    NumericT A_value = A_elements[row_start_A];
    C_col_buffer += row_start_C;
    C_elements   += row_start_C;
    for (unsigned int j = B_row_buffer[A_col]; j < B_end; ++j, ++C_col_buffer, ++C_elements)
    {
      *C_col_buffer = B_col_buffer[j];
      *C_elements   = A_value * B_elements[j];
    }
    return;
  }

  unsigned int row_C_len = 0;
  if (row_end_A - row_start_A == 2) // directly merge to C:
  {
    unsigned int A_col_1 = A_col_buffer[row_start_A];
    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];

    unsigned int B_offset_1 = B_row_buffer[A_col_1];
    unsigned int B_offset_2 = B_row_buffer[A_col_2];

    row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
                                B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
                                B_size2,
                                C_col_buffer + row_start_C, C_elements + row_start_C);
    return;
  }
#ifdef VIENNACL_WITH_AVX2
  else if (row_end_A - row_start_A > 10) // safely merge eight rows into temporary buffer:
  {
    row_C_len = row_C_scan_numeric_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A), A_elements + row_start_A,
                                               (const int*)B_row_buffer, (const int*)B_col_buffer, B_elements, int(B_size2),
                                               (int*)row_C_vector_1, row_C_vector_1_values);
    row_start_A += 8;
  }
#endif
  else // safely merge two rows into temporary buffer:
  {
    unsigned int A_col_1 = A_col_buffer[row_start_A];
    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];

    unsigned int B_offset_1 = B_row_buffer[A_col_1];
    unsigned int B_offset_2 = B_row_buffer[A_col_2];

    row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
                                            B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
                                            B_size2,
                                            row_C_vector_1, row_C_vector_1_values);
    row_start_A += 2;
  }

  // process remaining rows:
  while (row_end_A > row_start_A)
  {
#ifdef VIENNACL_WITH_AVX2
    if (row_end_A - row_start_A > 9) // code in other if-conditionals ensures that values get written to C
    {
      unsigned int merged_len = row_C_scan_numeric_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A), A_elements + row_start_A,
                                                               (const int*)B_row_buffer, (const int*)B_col_buffer, B_elements, int(B_size2),
                                                               (int*)row_C_vector_3, row_C_vector_3_values);
      row_C_len = row_C_scan_numeric_vector_1(row_C_vector_3, row_C_vector_3 + merged_len, row_C_vector_3_values, NumericT(1.0),
                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
                                              B_size2,
                                              row_C_vector_2, row_C_vector_2_values);
      row_start_A += 8;
    }
    else
#endif
    if (row_start_A + 1 == row_end_A) // last row to merge, write directly to C:
    {
      unsigned int A_col = A_col_buffer[row_start_A];
      unsigned int B_offset = B_row_buffer[A_col];

      row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset, B_col_buffer + B_row_buffer[A_col+1], B_elements + B_offset, A_elements[row_start_A],
                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
                                              B_size2,
                                              C_col_buffer + row_start_C, C_elements + row_start_C);
      return;
    }
    else if (row_start_A + 2 < row_end_A) // at least three more rows left, so merge two
    {
      // merge the next two rows of B:
      unsigned int A_col_1 = A_col_buffer[row_start_A];
      unsigned int A_col_2 = A_col_buffer[row_start_A + 1];

      unsigned int B_offset_1 = B_row_buffer[A_col_1];
      unsigned int B_offset_2 = B_row_buffer[A_col_2];

      unsigned int merged_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
                                                            B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
                                                            B_size2,
                                                            row_C_vector_3, row_C_vector_3_values);
      row_C_len = row_C_scan_numeric_vector_1(row_C_vector_3, row_C_vector_3 + merged_len, row_C_vector_3_values, NumericT(1.0),
                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
                                              B_size2,
                                              row_C_vector_2, row_C_vector_2_values);
      row_start_A += 2;
    }
    else
    {
      unsigned int A_col = A_col_buffer[row_start_A];
      unsigned int B_offset = B_row_buffer[A_col];

      row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset, B_col_buffer + B_row_buffer[A_col+1], B_elements + B_offset, A_elements[row_start_A],
                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
                                              B_size2,
                                              row_C_vector_2, row_C_vector_2_values);
      ++row_start_A;
    }

    std::swap(row_C_vector_1, row_C_vector_2);
    std::swap(row_C_vector_1_values, row_C_vector_2_values);
  }
}


} // namespace host_based
} // namespace linalg
} // namespace viennacl


#endif