AntoinePrv commented on code in PR #46789: URL: https://github.com/apache/arrow/pull/46789#discussion_r2161887714
########## cpp/src/arrow/util/byte_stream_split_internal.h: ########## @@ -89,23 +102,72 @@ void ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_va } for (int j = 0; j < kNumStreams; ++j) { xsimd::store_unaligned( - reinterpret_cast<int8_t*>(out + (i * kNumStreams + j) * sizeof(simd_batch)), + reinterpret_cast<int8_t*>(out + (i * kNumStreams + j) * simd_batch::size), stage[kNumStreamsLog2][j]); } } } +template <int kNumBytes> +struct grouped_bytes_impl; + +template <> +struct grouped_bytes_impl<1> { + using type = int8_t; +}; + +template <> +struct grouped_bytes_impl<2> { + using type = int16_t; +}; + +template <> +struct grouped_bytes_impl<4> { + using type = int32_t; +}; + +template <> +struct grouped_bytes_impl<8> { + using type = int64_t; +}; + +// Map a number of bytes to a type +template <int kNumBytes> +using grouped_bytes_t = typename grouped_bytes_impl<kNumBytes>::type; + +// Like xsimd::zlip_lo, but zip groups of NBytes at once +template <int kNumBytes, int kBatchSize = 16, + typename Batch = xsimd::make_sized_batch_t<int8_t, kBatchSize>> +auto zip_lo_n(Batch const& a, Batch const& b) -> Batch { + if constexpr (kNumBytes == kBatchSize) { + return a; + } else { + return xsimd::bitwise_cast<int8_t>( + xsimd::zip_lo(xsimd::bitwise_cast<grouped_bytes_t<kNumBytes>>(a), + xsimd::bitwise_cast<grouped_bytes_t<kNumBytes>>(b))); + } +} + +// Like xsimd::zlip_hi, but zip groups of NBytes at once +template <int kNumBytes, int kBatchSize = 16, + typename Batch = xsimd::make_sized_batch_t<int8_t, kBatchSize>> +auto zip_hi_n(Batch const& a, Batch const& b) -> Batch { + if constexpr (kNumBytes == kBatchSize) { + return b; + } else { + return xsimd::bitwise_cast<int8_t>( + xsimd::zip_hi(xsimd::bitwise_cast<grouped_bytes_t<kNumBytes>>(a), + xsimd::bitwise_cast<grouped_bytes_t<kNumBytes>>(b))); + } +} + template <int kNumStreams> void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, const int64_t num_values, uint8_t* output_buffer_raw) { using simd_batch = xsimd::make_sized_batch_t<int8_t, 16>; assert(width == kNumStreams); - static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams."); - constexpr int kBlockSize = sizeof(simd_batch) * kNumStreams; - - simd_batch stage[3][kNumStreams]; - simd_batch final_result[kNumStreams]; + constexpr int kBlockSize = simd_batch::size * kNumStreams; Review Comment: This specific constant was already here. Here `sizeof` was replaced with `::size`. As for the function in general: - it is independent of the *number of streams* (under condition that kNumStreams <= simd_batch::size and kNumStream a power of two, for which I'm adding a check.) - it is not far from being independent of the SIMD batch size. I had plan the final changes for another PR but I guess I could include them here if you prefer. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org