kiszk commented on PR #48217:
URL: https://github.com/apache/arrow/pull/48217#issuecomment-4733298668

   @Vishwanatha-HD I think that the direction of this fix is correct.
   
   My main concern is test coverage. The existing `TestRoundtrip` tests only 
assert that encode → decode is self-consistent. On big-endian, a missing byte 
reversal cancels out across that round-trip, so the bug stays hidden. Note also 
that `ReferenceByteStreamSplitEncode` splits the in-memory bytes as-is, which 
is not the Parquet (LE-based) layout on a BE host — so it can't be used as the 
source of truth here.
   
   Could you add platform-independent fixed-value tests that pin both the 
decoded float values and the encoded byte layout to constants? These fail on BE 
if the reversal is missing and pass once it's added, which is exactly what 
locks in this fix. Something like:
   
   ```
   namespace {
   
   // Build the BYTE_STREAM_SPLIT encoded buffer for `values` from each value's
   // little-endian byte representation, as mandated by the Parquet spec.
   // `le_bytes[i]` must hold value i in little-endian order (LSB first).
   template <int kWidth>
   std::vector<uint8_t> MakeEncodedFromLittleEndian(
       const std::vector<std::array<uint8_t, kWidth>>& le_bytes) {
     const int64_t num_values = static_cast<int64_t>(le_bytes.size());
     std::vector<uint8_t> encoded(num_values * kWidth);
     for (int64_t i = 0; i < num_values; ++i) {
       for (int stream = 0; stream < kWidth; ++stream) {
         // stream `stream` holds byte `stream` of the LE representation.
         encoded[stream * num_values + i] = le_bytes[i][stream];
       }
     }
     return encoded;
   }
   
   }  // namespace
   
   TEST(TestByteStreamSplitFixed, DecodeFloat32) {
     // 1.0f  = 0x3F800000 -> LE bytes 00 00 80 3F
     // -2.0f = 0xC0000000 -> LE bytes 00 00 00 C0
     // 0.5f  = 0x3F000000 -> LE bytes 00 00 00 3F
     const std::vector<float> values = {1.0f, -2.0f, 0.5f};
     const std::vector<std::array<uint8_t, 4>> le_bytes = {
         {0x00, 0x00, 0x80, 0x3F},
         {0x00, 0x00, 0x00, 0xC0},
         {0x00, 0x00, 0x00, 0x3F},
     };
     const auto encoded = MakeEncodedFromLittleEndian<4>(le_bytes);
     const int64_t num_values = static_cast<int64_t>(values.size());
   
     // Decode must produce native-endian float values equal to the constants on
     // both little- and big-endian hosts.
     std::vector<float> decoded(num_values);
     ByteStreamSplitDecode(encoded.data(), /*width=*/4, num_values,
                           /*stride=*/num_values,
                           reinterpret_cast<uint8_t*>(decoded.data()));
     ASSERT_EQ(decoded, values);
   
     // Encode must reproduce the spec-defined (LE-based) byte layout.
     std::vector<uint8_t> re_encoded(num_values * 4);
     ByteStreamSplitEncode(reinterpret_cast<const uint8_t*>(values.data()),
                           /*width=*/4, num_values, re_encoded.data());
     ASSERT_EQ(re_encoded, encoded);
   }
   
   TEST(TestByteStreamSplitFixed, DecodeFloat64) {
     // 1.0  = 0x3FF0000000000000 -> LE bytes 00 00 00 00 00 00 F0 3F
     // -2.0 = 0xC000000000000000 -> LE bytes 00 00 00 00 00 00 00 C0
     const std::vector<double> values = {1.0, -2.0};
     const std::vector<std::array<uint8_t, 8>> le_bytes = {
         {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x3F},
         {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xC0},
     };
     const auto encoded = MakeEncodedFromLittleEndian<8>(le_bytes);
     const int64_t num_values = static_cast<int64_t>(values.size());
   
     std::vector<double> decoded(num_values);
     ByteStreamSplitDecode(encoded.data(), /*width=*/8, num_values,
                           /*stride=*/num_values,
                           reinterpret_cast<uint8_t*>(decoded.data()));
     ASSERT_EQ(decoded, values);
   
     std::vector<uint8_t> re_encoded(num_values * 8);
     ByteStreamSplitEncode(reinterpret_cast<const uint8_t*>(values.data()),
                           /*width=*/8, num_values, re_encoded.data());
     ASSERT_EQ(re_encoded, encoded);
   }
   ```
   
   In addition, it would be good to add the `@kBlockSize`boundary checks like
   
   ```
   TEST(TestByteStreamSplitFixed, DecodeFloat32CrossesBlockBoundary) {
     constexpr int64_t kNumValues = 200;  // > 128 forces the merge main loop
     std::vector<float> values(kNumValues);
     std::vector<std::array<uint8_t, 4>> le_bytes(kNumValues);
     for (int64_t i = 0; i < kNumValues; ++i) {
       // 4 distinct LE bytes per value, so any byte misordering is observable.
       const uint32_t bits = 0x04030201u + static_cast<uint32_t>(i);
       values[i] = SafeCopy<float>(bits);
       le_bytes[i] = {static_cast<uint8_t>(bits), static_cast<uint8_t>(bits >> 
8),
                      static_cast<uint8_t>(bits >> 16), 
static_cast<uint8_t>(bits >> 24)};
     }
     const auto encoded = MakeEncodedFromLittleEndian<4>(le_bytes);
   
     std::vector<float> decoded(kNumValues);
     ByteStreamSplitDecode(encoded.data(), /*width=*/4, kNumValues,
                           /*stride=*/kNumValues,
                           reinterpret_cast<uint8_t*>(decoded.data()));
     for (int64_t i = 0; i < kNumValues; ++i) {
       ASSERT_EQ(SafeCopy<uint32_t>(decoded[i]), SafeCopy<uint32_t>(values[i]));
     }
   
     std::vector<uint8_t> re_encoded(kNumValues * 4);
     ByteStreamSplitEncode(reinterpret_cast<const uint8_t*>(values.data()),
                           /*width=*/4, kNumValues, re_encoded.data());
     ASSERT_EQ(re_encoded, encoded);
   }
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to