kiszk commented on PR #48217:
URL: https://github.com/apache/arrow/pull/48217#issuecomment-4733298668
@Vishwanatha-HD I think that the direction of this fix is correct.
My main concern is test coverage. The existing `TestRoundtrip` tests only
assert that encode → decode is self-consistent. On big-endian, a missing byte
reversal cancels out across that round-trip, so the bug stays hidden. Note also
that `ReferenceByteStreamSplitEncode` splits the in-memory bytes as-is, which
is not the Parquet (LE-based) layout on a BE host — so it can't be used as the
source of truth here.
Could you add platform-independent fixed-value tests that pin both the
decoded float values and the encoded byte layout to constants? These fail on BE
if the reversal is missing and pass once it's added, which is exactly what
locks in this fix. Something like:
```
namespace {
// Build the BYTE_STREAM_SPLIT encoded buffer for `values` from each value's
// little-endian byte representation, as mandated by the Parquet spec.
// `le_bytes[i]` must hold value i in little-endian order (LSB first).
template <int kWidth>
std::vector<uint8_t> MakeEncodedFromLittleEndian(
const std::vector<std::array<uint8_t, kWidth>>& le_bytes) {
const int64_t num_values = static_cast<int64_t>(le_bytes.size());
std::vector<uint8_t> encoded(num_values * kWidth);
for (int64_t i = 0; i < num_values; ++i) {
for (int stream = 0; stream < kWidth; ++stream) {
// stream `stream` holds byte `stream` of the LE representation.
encoded[stream * num_values + i] = le_bytes[i][stream];
}
}
return encoded;
}
} // namespace
TEST(TestByteStreamSplitFixed, DecodeFloat32) {
// 1.0f = 0x3F800000 -> LE bytes 00 00 80 3F
// -2.0f = 0xC0000000 -> LE bytes 00 00 00 C0
// 0.5f = 0x3F000000 -> LE bytes 00 00 00 3F
const std::vector<float> values = {1.0f, -2.0f, 0.5f};
const std::vector<std::array<uint8_t, 4>> le_bytes = {
{0x00, 0x00, 0x80, 0x3F},
{0x00, 0x00, 0x00, 0xC0},
{0x00, 0x00, 0x00, 0x3F},
};
const auto encoded = MakeEncodedFromLittleEndian<4>(le_bytes);
const int64_t num_values = static_cast<int64_t>(values.size());
// Decode must produce native-endian float values equal to the constants on
// both little- and big-endian hosts.
std::vector<float> decoded(num_values);
ByteStreamSplitDecode(encoded.data(), /*width=*/4, num_values,
/*stride=*/num_values,
reinterpret_cast<uint8_t*>(decoded.data()));
ASSERT_EQ(decoded, values);
// Encode must reproduce the spec-defined (LE-based) byte layout.
std::vector<uint8_t> re_encoded(num_values * 4);
ByteStreamSplitEncode(reinterpret_cast<const uint8_t*>(values.data()),
/*width=*/4, num_values, re_encoded.data());
ASSERT_EQ(re_encoded, encoded);
}
TEST(TestByteStreamSplitFixed, DecodeFloat64) {
// 1.0 = 0x3FF0000000000000 -> LE bytes 00 00 00 00 00 00 F0 3F
// -2.0 = 0xC000000000000000 -> LE bytes 00 00 00 00 00 00 00 C0
const std::vector<double> values = {1.0, -2.0};
const std::vector<std::array<uint8_t, 8>> le_bytes = {
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x3F},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC0},
};
const auto encoded = MakeEncodedFromLittleEndian<8>(le_bytes);
const int64_t num_values = static_cast<int64_t>(values.size());
std::vector<double> decoded(num_values);
ByteStreamSplitDecode(encoded.data(), /*width=*/8, num_values,
/*stride=*/num_values,
reinterpret_cast<uint8_t*>(decoded.data()));
ASSERT_EQ(decoded, values);
std::vector<uint8_t> re_encoded(num_values * 8);
ByteStreamSplitEncode(reinterpret_cast<const uint8_t*>(values.data()),
/*width=*/8, num_values, re_encoded.data());
ASSERT_EQ(re_encoded, encoded);
}
```
In addition, it would be good to add the `@kBlockSize`boundary checks like
```
TEST(TestByteStreamSplitFixed, DecodeFloat32CrossesBlockBoundary) {
constexpr int64_t kNumValues = 200; // > 128 forces the merge main loop
std::vector<float> values(kNumValues);
std::vector<std::array<uint8_t, 4>> le_bytes(kNumValues);
for (int64_t i = 0; i < kNumValues; ++i) {
// 4 distinct LE bytes per value, so any byte misordering is observable.
const uint32_t bits = 0x04030201u + static_cast<uint32_t>(i);
values[i] = SafeCopy<float>(bits);
le_bytes[i] = {static_cast<uint8_t>(bits), static_cast<uint8_t>(bits >>
8),
static_cast<uint8_t>(bits >> 16),
static_cast<uint8_t>(bits >> 24)};
}
const auto encoded = MakeEncodedFromLittleEndian<4>(le_bytes);
std::vector<float> decoded(kNumValues);
ByteStreamSplitDecode(encoded.data(), /*width=*/4, kNumValues,
/*stride=*/kNumValues,
reinterpret_cast<uint8_t*>(decoded.data()));
for (int64_t i = 0; i < kNumValues; ++i) {
ASSERT_EQ(SafeCopy<uint32_t>(decoded[i]), SafeCopy<uint32_t>(values[i]));
}
std::vector<uint8_t> re_encoded(kNumValues * 4);
ByteStreamSplitEncode(reinterpret_cast<const uint8_t*>(values.data()),
/*width=*/4, kNumValues, re_encoded.data());
ASSERT_EQ(re_encoded, encoded);
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]