pitrou commented on PR #47573:
URL: https://github.com/apache/arrow/pull/47573#issuecomment-3405559075
Also some statistics on the generated assembly code:
```console
# Number of scalar shifts and ORs
$ objdump -w --demangle --no-addresses --no-show-raw-insn \
  --disassemble='int arrow::internal::unpack_avx2<unsigned int>(unsigned char const*, unsigned int*, int, int)' \
  /build/build-release/relwithdebinfo/libarrow.so | rg -e '\b(sh[lr]|or)' -c
430
# Number of instructions moving data around
$ objdump -w --demangle --no-addresses --no-show-raw-insn \
  --disassemble='int arrow::internal::unpack_avx2<unsigned int>(unsigned char const*, unsigned int*, int, int)' \
  /build/build-release/relwithdebinfo/libarrow.so | rg -e '\b(mov|vmov|vinsert|vpinsr|vpunpck|vpbroadcast)' -c
2290
```
Based on these numbers, it's not obvious that eliminating the remaining
scalar shifts and ORs (when a bit-packed value straddles an input word
boundary) would really improve performance, as they would be replaced by
additional register moves to populate a second vector register.
<details>
Basically the idea would be to replace (example of unpacking 11-bit values
into 32-bit output):
```c++
// extract 11-bit bundles 0 to 7
words = simd_batch{
SafeLoadAs<uint32_t>(in + 4 * 0),
SafeLoadAs<uint32_t>(in + 4 * 0),
SafeLoadAs<uint32_t>(in + 4 * 0) >> 22 | SafeLoadAs<uint32_t>(in + 4 * 1) << 10,
SafeLoadAs<uint32_t>(in + 4 * 1),
SafeLoadAs<uint32_t>(in + 4 * 1),
SafeLoadAs<uint32_t>(in + 4 * 1) >> 23 | SafeLoadAs<uint32_t>(in + 4 * 2) << 9,
SafeLoadAs<uint32_t>(in + 4 * 2),
SafeLoadAs<uint32_t>(in + 4 * 2),
};
shifts = simd_batch{ 0, 11, 0, 1, 12, 0, 2, 13 };
results = (words >> shifts) & masks;
results.store_unaligned(out);
```
with:
```c++
// extract 11-bit bundles 0 to 7
words1 = simd_batch{
SafeLoadAs<uint32_t>(in + 4 * 0),
SafeLoadAs<uint32_t>(in + 4 * 0),
SafeLoadAs<uint32_t>(in + 4 * 0),
SafeLoadAs<uint32_t>(in + 4 * 1),
SafeLoadAs<uint32_t>(in + 4 * 1),
SafeLoadAs<uint32_t>(in + 4 * 1),
SafeLoadAs<uint32_t>(in + 4 * 2),
SafeLoadAs<uint32_t>(in + 4 * 2),
} >> simd_batch{ 0, 0, 22, 0, 0, 23, 0, 0};
words2 = simd_batch{
SafeLoadAs<uint32_t>(in + 4 * 0),
SafeLoadAs<uint32_t>(in + 4 * 0),
SafeLoadAs<uint32_t>(in + 4 * 1),
SafeLoadAs<uint32_t>(in + 4 * 1),
SafeLoadAs<uint32_t>(in + 4 * 1),
SafeLoadAs<uint32_t>(in + 4 * 2),
SafeLoadAs<uint32_t>(in + 4 * 2),
SafeLoadAs<uint32_t>(in + 4 * 2),
} << simd_batch{ 0, 0, 10, 0, 0, 9, 0, 0};
shifts = simd_batch{ 0, 11, 0, 1, 12, 0, 2, 13 };
results = ((words1 | words2) >> shifts) & masks;
results.store_unaligned(out);
```
</details>
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]