zanmato1984 commented on code in PR #43832: URL: https://github.com/apache/arrow/pull/43832#discussion_r1743012373
########## cpp/src/arrow/acero/swiss_join_avx2.cc: ########## @@ -233,27 +243,288 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id, const uint8_t* null_masks = rows.null_masks(); __m256i null_bits_per_row = _mm256_set1_epi32(8 * rows.metadata().null_masks_bytes_per_row); + __m256i pos_after_encoding = + _mm256_set1_epi32(rows.metadata().pos_after_encoding(column_id)); for (int i = 0; i < num_rows / unroll; ++i) { __m256i row_id = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(row_ids) + i); __m256i bit_id = _mm256_mullo_epi32(row_id, null_bits_per_row); - bit_id = _mm256_add_epi32(bit_id, _mm256_set1_epi32(column_id)); + bit_id = _mm256_add_epi32(bit_id, pos_after_encoding); __m256i bytes = _mm256_i32gather_epi32(reinterpret_cast<const int*>(null_masks), _mm256_srli_epi32(bit_id, 3), 1); __m256i bit_in_word = _mm256_sllv_epi32( _mm256_set1_epi32(1), _mm256_and_si256(bit_id, _mm256_set1_epi32(7))); __m256i result = _mm256_cmpeq_epi32(_mm256_and_si256(bytes, bit_in_word), bit_in_word); - uint64_t null_bytes = static_cast<uint64_t>( + // NB: Be careful about sign-extension when casting the return value of + // _mm256_movemask_epi8 (signed 32-bit) to unsigned 64-bit, which will pollute the + // higher bits of the following OR. 
+ uint32_t null_bytes_lo = static_cast<uint32_t>( _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_castsi256_si128(result)))); - null_bytes |= static_cast<uint64_t>(_mm256_movemask_epi8( - _mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1)))) - << 32; + uint64_t null_bytes_hi = + _mm256_movemask_epi8(_mm256_cvtepi32_epi64(_mm256_extracti128_si256(result, 1))); + uint64_t null_bytes = null_bytes_lo | (null_bytes_hi << 32); process_8_values_fn(i * unroll, null_bytes); } return num_rows - (num_rows % unroll); } +namespace { + +inline void Decode8FixedLength0_avx2(uint8_t* output, const uint8_t* row_ptr_base, + __m256i offset_lo, __m256i offset_hi) { + // Gather the lower/higher 4 32-bit (only lower 8 bits interesting) rows based on the + // lower/higher 4 64-bit row offsets. + __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1); + __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1); + // Extend to 64-bit. + __m256i row_lo_64 = _mm256_cvtepi32_epi64(row_lo); + __m256i row_hi_64 = _mm256_cvtepi32_epi64(row_hi); + // Keep the first 8 bits in each 64-bit row. + row_lo_64 = _mm256_and_si256(row_lo_64, _mm256_set1_epi64x(0xFF)); + row_hi_64 = _mm256_and_si256(row_hi_64, _mm256_set1_epi64x(0xFF)); + // If the 64-bit is zero, then we get 64 set bits. + __m256i is_zero_lo_64 = _mm256_cmpeq_epi64(row_lo_64, _mm256_setzero_si256()); + __m256i is_zero_hi_64 = _mm256_cmpeq_epi64(row_hi_64, _mm256_setzero_si256()); + // 64 set bits to 8 set bits. + int is_zero_lo_8 = _mm256_movemask_epi8(is_zero_lo_64); + int is_zero_hi_8 = _mm256_movemask_epi8(is_zero_hi_64); + // 8 set bits to 1 set bit. 
+ uint8_t is_zero = static_cast<uint8_t>( + _mm_movemask_epi8(_mm_set_epi32(0, 0, is_zero_hi_8, is_zero_lo_8))); + *output = static_cast<uint8_t>(~is_zero); +} + +inline void Decode8FixedLength1_avx2(uint8_t* output, const uint8_t* row_ptr_base, + __m256i offset_lo, __m256i offset_hi) { + // Gather the lower/higher 4 32-bit (only lower 8 bits interesting) rows based on the + // lower/higher 4 64-bit row offsets. + __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1); + __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1); + __m256i row = _mm256_set_m128i(row_hi, row_lo); + // Shuffle the lower 8 bits of each 32-bit rows to the lower 32 bits of each 128-bit + // lane. + constexpr uint64_t kByteSequence_0_4_8_12 = 0x0c080400ULL; + const __m256i shuffle_const = + _mm256_setr_epi64x(kByteSequence_0_4_8_12, -1, kByteSequence_0_4_8_12, -1); + row = _mm256_shuffle_epi8(row, shuffle_const); + // Get the lower 32-bits (4 8-bit rows) from each 128-bit lane. + // NB: Be careful about sign-extension when casting the return value of + // _mm256_extract_epi32 (signed 32-bit) to unsigned 64-bit, which will pollute the + // higher bits of the following OR. + uint32_t compact_row_lo = static_cast<uint32_t>(_mm256_extract_epi32(row, 0)); + uint64_t compact_row_hi = static_cast<uint64_t>(_mm256_extract_epi32(row, 4)) << 32; + *reinterpret_cast<uint64_t*>(output) = compact_row_lo | compact_row_hi; +} + +inline void Decode8FixedLength2_avx2(uint16_t* output, const uint8_t* row_ptr_base, + __m256i offset_lo, __m256i offset_hi) { + // Gather the lower/higher 4 32-bit (only lower 16 bits interesting) rows based on the + // lower/higher 4 64-bit row offsets. 
+ __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1); + __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1); + __m256i row = _mm256_set_m128i(row_hi, row_lo); + // Shuffle the lower 16 bits of each 32-bit rows to the lower 64 bits of each 128-bit + // lane. + constexpr uint64_t kByteSequence_0_1_4_5_8_9_12_13 = 0x0d0c090805040100ULL; + const __m256i shuffle_const = _mm256_setr_epi64x(kByteSequence_0_1_4_5_8_9_12_13, -1, + kByteSequence_0_1_4_5_8_9_12_13, -1); + row = _mm256_shuffle_epi8(row, shuffle_const); + // Swap the second and the third 64-bit lane. + row = _mm256_permute4x64_epi64(row, 0xd8); + _mm_storeu_si128(reinterpret_cast<__m128i*>(output), _mm256_castsi256_si128(row)); +} + +inline void Decode8FixedLength4_avx2(uint32_t* output, const uint8_t* row_ptr_base, + __m256i offset_lo, __m256i offset_hi) { + // Gather the lower/higher 4 32-bit rows based on the lower/higher 4 64-bit row offsets. + __m128i row_lo = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_lo, 1); + __m128i row_hi = _mm256_i64gather_epi32((const int*)row_ptr_base, offset_hi, 1); + __m256i row = _mm256_set_m128i(row_hi, row_lo); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), row); +} + +inline void Decode8FixedLength8_avx2(uint64_t* output, const uint8_t* row_ptr_base, + __m256i offset_lo, __m256i offset_hi) { + auto row_ptr_base_i64 = + reinterpret_cast<const arrow::util::int64_for_gather_t*>(row_ptr_base); + // Gather the lower/higher 4 64-bit rows based on the lower/higher 4 64-bit row offsets. + __m256i row_lo = _mm256_i64gather_epi64(row_ptr_base_i64, offset_lo, 1); + __m256i row_hi = _mm256_i64gather_epi64(row_ptr_base_i64, offset_hi, 1); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), row_lo); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + 4), row_hi); +} + +inline void Decode1_avx2(uint8_t* output, const uint8_t* row_ptr, uint32_t num_bytes) { + // Copy 32 bytes at a time. 
+ __m256i* output_i256 = reinterpret_cast<__m256i*>(output); + const __m256i* row_ptr_i256 = reinterpret_cast<const __m256i*>(row_ptr); + for (int istripe = 0; istripe < bit_util::CeilDiv(num_bytes, 32); ++istripe) { + _mm256_storeu_si256(output_i256 + istripe, + _mm256_loadu_si256(row_ptr_i256 + istripe)); + } +} + +inline uint32_t Decode8Offset_avx2(uint32_t* output, uint32_t current_length, + __m256i num_bytes) { + uint32_t num_bytes_last = static_cast<uint32_t>(_mm256_extract_epi32(num_bytes, 7)); + // Init every offset with the current length. + __m256i offsets = _mm256_set1_epi32(current_length); + // We keep right-shifting the length and accumulate the offset by adding the length. + __m256i length = + _mm256_permutevar8x32_epi32(num_bytes, _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6)); + length = _mm256_insert_epi32(length, 0, 0); + for (int i = 0; i < 7; ++i) { + offsets = _mm256_add_epi32(offsets, length); + length = + _mm256_permutevar8x32_epi32(length, _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6)); + length = _mm256_insert_epi32(length, 0, 0); + } + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output), offsets); + return _mm256_extract_epi32(offsets, 7) + num_bytes_last; +} + +inline void Decode8Null_avx2(uint8_t* output, uint64_t null_bytes) { + uint8_t null_bits = + static_cast<uint8_t>(_mm256_movemask_epi8(_mm256_set1_epi64x(null_bytes))); + *output = ~null_bits; +} + +} // namespace + +int RowArray::DecodeFixedLength_avx2(ResizableArrayData* output, int output_start_row, + int column_id, uint32_t fixed_length, + int num_rows_to_append, + const uint32_t* row_ids) const { + // Benchmarking shows that when the data element width is <= 8, the scalar code almost + // always outperforms the vectorized code - about 2X~3X faster when the whole data set + // falls into L1~LLC, and the ratio goes down to about 1:1 as the data size increases + // when most of the accesses hit the main memory. 
This is possibly because decoding + // is mostly copying scattered pieces of data and there is not enough computation to pay + // off the cost of the heavy gather instructions. + // For fixed length 0 (boolean column), the vectorized code wins by batching 8 bits into + // a single byte instead of modifying the same byte 8 times in the scalar code. Review Comment: Nice reminder! @raulcd I think I can put something like "the performance improvement on specific CPU models (blablabla) may not be as significant as expected due to blablabla" in the PR description. Is there something we should do to ensure that will appear in the coming release notes? Thanks. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org