Re: [PR] GH-47112: [Parquet][C++] Rle BitPacked parser [arrow]

via GitHub Thu, 28 Aug 2025 01:57:21 -0700


HuaHuaY commented on code in PR #47294:
URL: https://github.com/apache/arrow/pull/47294#discussion_r2306492567



##########
cpp/src/arrow/util/bit_util_test.cc:
##########
@@ -1997,11 +1997,119 @@ TEST(BitUtil, RoundUpToPowerOf2) {
 #undef U64
 #undef S64
 
+/// Test the maximum number of bytes needed to write a LEB128 of a give size.
+TEST(BitStreamUtil, MaxLEB128ByteLenFor) {

Review Comment:
   Will it be better to ensure correctness through `static_assert`?



##########
cpp/src/arrow/util/bit_util.h:
##########
@@ -365,5 +365,118 @@ void PackBits(const uint32_t* values, uint8_t* out) {
   }
 }
 
+constexpr int64_t MaxLEB128ByteLen(int64_t n_bits) { return CeilDiv(n_bits, 
7); }
+
+template <typename Int>
+constexpr int64_t MaxLEB128ByteLenFor = MaxLEB128ByteLen(sizeof(Int) * 8);
+
+/// Write a integer as LEB128
+///
+/// Write the input value as LEB128 into the outptu buffer and return the 
number of bytes
+/// written.
+/// If the output buffer size is insufficient, return 0 but the output may 
have been
+/// written to.
+///
+/// \see https://en.wikipedia.org/wiki/LEB128
+/// \see MaxLEB128ByteLenFor
+template <typename Int>
+constexpr int32_t WriteLEB128(Int value, uint8_t* out, int32_t max_out_size) {
+  constexpr Int kLow7Mask = Int(0x7F);
+  constexpr Int kHigh7Mask = ~kLow7Mask;
+  constexpr uint8_t kContinuationBit = 0x80;
+
+  auto const out_first = out;
+
+  // Write as many bytes as the could be for the given input
+  while ((value & kHigh7Mask) != Int(0)) {
+    // We do not have enough room to write the LEB128
+    if (out - out_first >= max_out_size) {
+      return 0;
+    }
+
+    // Write the encoded byte with continuation bit
+    *out = static_cast<uint8_t>(value & kLow7Mask) | kContinuationBit;
+    ++out;
+    // Shift remaining data
+    value >>= 7;
+  }
+
+  // We do not have enough room to write the LEB128
+  if (out - out_first >= max_out_size) {
+    return 0;
+  }
+
+  // Write last non-continuing byte
+  *out = static_cast<uint8_t>(value & kLow7Mask);
+  ++out;
+
+  return static_cast<int32_t>(out - out_first);
+}
+
+/// Parse a leading LEB128
+///
+/// Take as input a data pointer and the maximum number of bytes that can be 
read from it
+/// (typically the array size).
+/// When a valid LEB128 is found at the start of the data, the function writes 
it to the
+/// out pointer and return the number of bytes read.
+/// Otherwise, the out pointer is unmodified and zero is returned.
+///
+/// \see https://en.wikipedia.org/wiki/LEB128
+/// \see MaxLEB128ByteLenFor
+template <typename Int>
+constexpr int32_t ParseLeadingLEB128(uint8_t const* data, int32_t 
max_data_size,
+                                     Int* out) {
+  constexpr auto kMaxBytes = static_cast<int32_t>(MaxLEB128ByteLenFor<Int>);
+  static_assert(kMaxBytes >= 1);
+  constexpr uint8_t kLow7Mask = 0x7F;
+  constexpr uint8_t kContinuationBit = 0x80;
+  constexpr int32_t kSignBitCount = std::is_signed_v<Int> ? 1 : 0;
+  // Number of bits allowed for encoding data on the last byte to avoid 
overflow
+  constexpr uint8_t kHighBitCount = (8 * sizeof(Int) - kSignBitCount) % 7;
+  // kHighBitCount least significant `0` bits and the rest with `1`
+  constexpr uint8_t kHighForbiddenMask = ~((1 << kHighBitCount) - 1);
+
+  // Iteratively building the value
+  std::make_unsigned_t<Int> value = 0;
+
+  // Read as many bytes as the could be for the give output
+  for (int32_t i = 0; i < kMaxBytes - 1; i++) {
+    // We have not finished reading a valid LEB128, yet we run out of data
+    if (ARROW_PREDICT_FALSE(i >= max_data_size)) {
+      return 0;
+    }
+
+    // Read the byte and set its 7 LSB to in the final value
+    uint8_t const byte = data[i];
+    value |= static_cast<Int>(byte & kLow7Mask) << (7 * i);
+
+    // Check for lack of continuation flag in MSB
+    if ((byte & kContinuationBit) == 0) {
+      *out = value;
+      return i + 1;
+    }
+  }
+
+  // Process the last index avoiding overflowing
+  constexpr int32_t last = kMaxBytes - 1;
+
+  // We have not finished reading a valid LEB128, yet we run out of data
+  if (ARROW_PREDICT_FALSE(last >= max_data_size)) {
+    return 0;
+  }
+
+  uint8_t const byte = data[last];
+
+  // Need to check if there are bits that would overflow the output.
+  // Also checks that there is no continuation.
+  if (ARROW_PREDICT_FALSE((byte & kHighForbiddenMask) != 0)) {

Review Comment:
   In my opinion, high bit remains bit0 when original 
`BitWriter::PutVlqInt(uint64_t v)` writes the last byte due to right shift of 
`uint64_t`. In which case should we treat the last byte specially in 
`WriteLEB128` and `ParseLeadingLEB128 `? 



##########
cpp/src/arrow/util/bit_stream_utils_internal.h:
##########
@@ -483,13 +496,17 @@ inline bool BitReader::GetZigZagVlqInt(int32_t* v) {
 }
 
 inline bool BitWriter::PutVlqInt(uint64_t v) {
-  bool result = true;
-  while ((v & 0xFFFFFFFFFFFFFF80ULL) != 0ULL) {
-    result &= PutAligned<uint8_t>(static_cast<uint8_t>((v & 0x7F) | 0x80), 1);
-    v >>= 7;
+  constexpr auto kMaxBytes = bit_util::MaxLEB128ByteLenFor<decltype(v)>;

Review Comment:
   ```suggestion
     constexpr auto kMaxBytes = kMaxVlqByteLengthForInt64;
   ```



##########
cpp/src/arrow/util/bit_util.h:
##########
@@ -365,5 +365,118 @@ void PackBits(const uint32_t* values, uint8_t* out) {
   }
 }
 
+constexpr int64_t MaxLEB128ByteLen(int64_t n_bits) { return CeilDiv(n_bits, 
7); }
+
+template <typename Int>
+constexpr int64_t MaxLEB128ByteLenFor = MaxLEB128ByteLen(sizeof(Int) * 8);
+
+/// Write a integer as LEB128
+///
+/// Write the input value as LEB128 into the outptu buffer and return the 
number of bytes

Review Comment:
   ```suggestion
   /// Write the input value as LEB128 into the output buffer and return the 
number of bytes
   ```



##########
cpp/src/arrow/util/bit_util.h:
##########
@@ -365,5 +365,118 @@ void PackBits(const uint32_t* values, uint8_t* out) {
   }
 }
 
+constexpr int64_t MaxLEB128ByteLen(int64_t n_bits) { return CeilDiv(n_bits, 
7); }
+
+template <typename Int>
+constexpr int64_t MaxLEB128ByteLenFor = MaxLEB128ByteLen(sizeof(Int) * 8);
+
+/// Write a integer as LEB128
+///
+/// Write the input value as LEB128 into the outptu buffer and return the 
number of bytes
+/// written.
+/// If the output buffer size is insufficient, return 0 but the output may 
have been
+/// written to.
+///
+/// \see https://en.wikipedia.org/wiki/LEB128
+/// \see MaxLEB128ByteLenFor
+template <typename Int>
+constexpr int32_t WriteLEB128(Int value, uint8_t* out, int32_t max_out_size) {

Review Comment:
   Why do we write two new functions instead of moving the previous function 
here? If we want a generic function through template, why is 
`BitWriter::PutVlqInt(uint32_t v)` not replaced?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] GH-47112: [Parquet][C++] Rle BitPacked parser [arrow]

Reply via email to