zanmato1984 commented on code in PR #43832: URL: https://github.com/apache/arrow/pull/43832#discussion_r1743012373
########## cpp/src/arrow/acero/swiss_join_avx2.cc: ########## @@ -233,27 +243,288 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id, const uint8_t* null_masks = rows.null_masks(); __m256i null_bits_per_row = _mm256_set1_epi32(8 * rows.metadata().null_masks_bytes_per_row); + __m256i pos_after_encoding = + _mm256_set1_epi32(rows.metadata().pos_after_encoding(column_id)); for (int i = 0; i < num_rows / unroll; ++i) { __m256i row_id = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ids) + i); __m256i bit_id = _mm256_mullo_epi32(row_id, null_bits_per_row); - bit_id = _mm256_add_epi32(bit_id, _mm256_set1_epi32(column_id)); + bit_id = _mm256_add_epi32(bit_id, pos_after_encoding); __m256i bytes = _mm256_i32gather_epi32(reinterpret_cast<const int*>(null_masks), _mm256_srli_epi32(bit_id, 3), 1); __m256i bit_in_word = _mm256_sllv_epi32( _mm256_set1_epi32(1), _mm256_and_si256(bit_id, _mm256_set1_epi32(7))); __m256i result = _mm256_cmpeq_epi32(_mm256_and_si256(bytes, bit_in_word), bit_in_word); - uint64_t null_bytes = static_cast<uint64_t>( + // NB: Be careful about sign-extension when casting the return value of + // _mm256_movemask_epi8 (signed 32-bit) to unsigned 64-bit, which will pollute the + // higher bits of the following OR. 
+ uint32_t null_bytes_lo = static_cast<uint32_t>( _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(result)))); - null_bytes |= static_cast<uint64_t>(_mm256_movemask_epi8( - _mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1)))) - << 32; + uint64_t null_bytes_hi = + _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1))); + uint64_t null_bytes = null_bytes_lo | (null_bytes_hi << 32); process_8_values_fn(i * unroll, null_bytes); } return num_rows - (num_rows % unroll); } +namespace { + +inline void Decode8FixedLength0_avx2(uint8_t* output, const uint8_t* row_ptr_base, + __m256i offset_lo, __m256i offset_hi) { + // Gather the lower/higher 4 32-bit (only lower 8 bits interesting) rows based on the + // lower/higher 4 64-bit row offsets. + __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1); + __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1); + // Extend to 64-bit. + __m256i row_lo_64 = _mm256_cvtepi32_epi64(row_lo); + __m256i row_hi_64 = _mm256_cvtepi32_epi64(row_hi); + // Keep the first 8 bits in each 64-bit row. + row_lo_64 = _mm256_and_si256(row_lo_64, _mm256_set1_epi64x(0xFF)); + row_hi_64 = _mm256_and_si256(row_hi_64, _mm256_set1_epi64x(0xFF)); + // If the 64-bit is zero, then we get 64 set bits. + __m256i is_zero_lo_64 = _mm256_cmpeq_epi64(row_lo_64, _mm256_setzero_si256()); + __m256i is_zero_hi_64 = _mm256_cmpeq_epi64(row_hi_64, _mm256_setzero_si256()); + // 64 set bits to 8 set bits. + int is_zero_lo_8 = _mm256_movemask_epi8(is_zero_lo_64); + int is_zero_hi_8 = _mm256_movemask_epi8(is_zero_hi_64); + // 8 set bits to 1 set bit. 
+ uint8_t is_zero = static_cast<uint8_t>( + _mm_movemask_epi8(_mm_set_epi32(0, 0, is_zero_hi_8, is_zero_lo_8))); + *output = static_cast<uint8_t>(~is_zero); +} + +inline void Decode8FixedLength1_avx2(uint8_t* output, const uint8_t* row_ptr_base, + __m256i offset_lo, __m256i offset_hi) { + // Gather the lower/higher 4 32-bit (only lower 8 bits interesting) rows based on the + // lower/higher 4 64-bit row offsets. + __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1); + __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1); + __m256i row = _mm256_set_m128i(row_hi, row_lo); + // Shuffle the lower 8 bits of each 32-bit rows to the lower 32 bits of each 128-bit + // lane. + constexpr uint64_t kByteSequence_0_4_8_12 = 0x0c080400ULL; + const __m256i shuffle_const = + _mm256_setr_epi64x(kByteSequence_0_4_8_12, -1, kByteSequence_0_4_8_12, -1); + row = _mm256_shuffle_epi8(row, shuffle_const); + // Get the lower 32-bits (4 8-bit rows) from each 128-bit lane. + // NB: Be careful about sign-extension when casting the return value of + // _mm256_extract_epi32 (signed 32-bit) to unsigned 64-bit, which will pollute the + // higher bits of the following OR. + uint32_t compact_row_lo = static_cast<uint32_t>(_mm256_extract_epi32(row, 0)); + uint64_t compact_row_hi = static_cast<uint64_t>(_mm256_extract_epi32(row, 4)) << 32; + *reinterpret_cast<uint64_t*>(output) = compact_row_lo | compact_row_hi; +} + +inline void Decode8FixedLength2_avx2(uint16_t* output, const uint8_t* row_ptr_base, + __m256i offset_lo, __m256i offset_hi) { + // Gather the lower/higher 4 32-bit (only lower 16 bits interesting) rows based on the + // lower/higher 4 64-bit row offsets. 
+ __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1); + __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1); + __m256i row = _mm256_set_m128i(row_hi, row_lo); + // Shuffle the lower 16 bits of each 32-bit rows to the lower 64 bits of each 128-bit + // lane. + constexpr uint64_t kByteSequence_0_1_4_5_8_9_12_13 = 0x0d0c090805040100ULL; + const __m256i shuffle_const = _mm256_setr_epi64x(kByteSequence_0_1_4_5_8_9_12_13, -1, + kByteSequence_0_1_4_5_8_9_12_13, -1); + row = _mm256_shuffle_epi8(row, shuffle_const); + // Swap the second and the third 64-bit lane. + row = _mm256_permute4x64_epi64(row, 0xd8); + _mm_storeu_si128(reinterpret_cast<__m128i*>(output), _mm256_castsi256_si128(row)); +} + +inline void Decode8FixedLength4_avx2(uint32_t* output, const uint8_t* row_ptr_base, + __m256i offset_lo, __m256i offset_hi) { + // Gather the lower/higher 4 32-bit rows based on the lower/higher 4 64-bit row offsets. + __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1); + __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1); + __m256i row = _mm256_set_m128i(row_hi, row_lo); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), row); +} + +inline void Decode8FixedLength8_avx2(uint64_t* output, const uint8_t* row_ptr_base, + __m256i offset_lo, __m256i offset_hi) { + auto row_ptr_base_i64 = + reinterpret_cast<const arrow::util::int64_for_gather_t*>(row_ptr_base); + // Gather the lower/higher 4 64-bit rows based on the lower/higher 4 64-bit row offsets. + __m256i row_lo = _mm256_i64gather_epi64(row_ptr_base_i64, offset_lo, 1); + __m256i row_hi = _mm256_i64gather_epi64(row_ptr_base_i64, offset_hi, 1); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), row_lo); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + 4), row_hi); +} + +inline void Decode1_avx2(uint8_t* output, const uint8_t* row_ptr, uint32_t num_bytes) { + // Copy 32 bytes at a time. 
+ __m256i* output_i256 = reinterpret_cast<__m256i*>(output); + const __m256i* row_ptr_i256 = reinterpret_cast<const __m256i*>(row_ptr); + for (int istripe = 0; istripe < bit_util::CeilDiv(num_bytes, 32); ++istripe) { + _mm256_storeu_si256(output_i256 + istripe, + _mm256_loadu_si256(row_ptr_i256 + istripe)); + } +} + +inline uint32_t Decode8Offset_avx2(uint32_t* output, uint32_t current_length, + __m256i num_bytes) { + uint32_t num_bytes_last = static_cast<uint32_t>(_mm256_extract_epi32(num_bytes, 7)); + // Init every offset with the current length. + __m256i offsets = _mm256_set1_epi32(current_length); + // We keep right-shifting the length and accumulate the offset by adding the length. + __m256i length = + _mm256_permutevar8x32_epi32(num_bytes, _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6)); + length = _mm256_insert_epi32(length, 0, 0); + for (int i = 0; i < 7; ++i) { + offsets = _mm256_add_epi32(offsets, length); + length = + _mm256_permutevar8x32_epi32(length, _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6)); + length = _mm256_insert_epi32(length, 0, 0); + } + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), offsets); + return _mm256_extract_epi32(offsets, 7) + num_bytes_last; +} + +inline void Decode8Null_avx2(uint8_t* output, uint64_t null_bytes) { + uint8_t null_bits = + static_cast<uint8_t>(_mm256_movemask_epi8(_mm256_set1_epi64x(null_bytes))); + *output = ~null_bits; +} + +} // namespace + +int RowArray::DecodeFixedLength_avx2(ResizableArrayData* output, int output_start_row, + int column_id, uint32_t fixed_length, + int num_rows_to_append, + const uint32_t* row_ids) const { + // Benchmarking shows that when the data element width is <= 8, the scalar code almost + // always outperforms the vectorized code - about 2X~3X faster when the whole data set + // falls into L1~LLC, and the ratio goes down to about 1:1 as the data size increases + // when most of the accesses hit the main memory. 
This is possibly because decoding + // is mostly copying scattered pieces of data and there is not enough computation to pay + // off the cost of the heavy gather instructions. + // For fixed length 0 (boolean column), the vectorized code wins by batching 8 bits into + // a single byte instead of modifying the same byte 8 times in the scalar code. Review Comment: Nice reminder! @raulcd I think I can put something like "the performance improvement on specific CPU models (blablabla) may not be as significant as expected due to blablabla" in the PR description. Is there something we should do to ensure that will appear in the coming release notes? Thanks. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org