pitrou commented on code in PR #47294:
URL: https://github.com/apache/arrow/pull/47294#discussion_r2359203120
##########
cpp/src/arrow/util/rle_encoding_internal.h:
##########
@@ -82,86 +81,400 @@ namespace util {
/// 200 ints = 25 groups of 8
/// <varint((25 << 1) | 1)> <25 bytes of values, bitpacked>
/// (total 26 bytes, 1 byte overhead)
-//
-/// Decoder class for RLE encoded data.
-class RleDecoder {
+/// The type for an encoded Rle of BitPacked run size, between 1 and 2^31-1 as
per Parquet
+/// spec.
+/// This is also pragmatically used for other integer used in the Rle and
BitPacked runs
+/// and decoder to avoid conversions.
+/// It can therefore be referred to as a "typical" size for Rle and BitPacked
logic.
+using rle_size_t = int32_t;
+
+template <typename T>
+class RleRunDecoder;
+
+class RleRun {
public:
- /// Create a decoder object. buffer/buffer_len is the decoded data.
- /// bit_width is the width of each value (before encoding).
- RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width)
- : bit_reader_(buffer, buffer_len),
- bit_width_(bit_width),
- current_value_(0),
- repeat_count_(0),
- literal_count_(0) {
- ARROW_DCHECK_GE(bit_width_, 0);
- ARROW_DCHECK_LE(bit_width_, 64);
+ /// The decoder class used to decode a single run in the given type.
+ template <typename T>
+ using DecoderType = RleRunDecoder<T>;
+
+ constexpr RleRun() noexcept = default;
+
+ explicit RleRun(const uint8_t* data, rle_size_t values_count,
+ rle_size_t value_bit_width) noexcept
+ : values_count_(values_count), value_bit_width_(value_bit_width) {
+ ARROW_DCHECK_GE(value_bit_width, 0);
+ ARROW_DCHECK_GE(values_count, 0);
+ std::copy(data, data + raw_data_size(), data_.begin());
}
- RleDecoder() : bit_width_(-1) {}
+ /// The number of repeated values in this run.
+ constexpr rle_size_t values_count() const noexcept { return values_count_; }
- void Reset(const uint8_t* buffer, int buffer_len, int bit_width) {
- ARROW_DCHECK_GE(bit_width, 0);
- ARROW_DCHECK_LE(bit_width, 64);
- bit_reader_.Reset(buffer, buffer_len);
- bit_width_ = bit_width;
- current_value_ = 0;
- repeat_count_ = 0;
- literal_count_ = 0;
+ /// The size in bits of each encoded value.
+ constexpr rle_size_t values_bit_width() const noexcept { return
value_bit_width_; }
+
+ /// A pointer to the repeated value raw bytes.
+ constexpr const uint8_t* raw_data_ptr() const noexcept { return
data_.data(); }
+
+ /// The number of bytes used for the raw repeated value.
+ constexpr rle_size_t raw_data_size() const noexcept {
+ auto out = bit_util::BytesForBits(value_bit_width_);
+ ARROW_DCHECK_LE(out, std::numeric_limits<rle_size_t>::max());
+ return static_cast<rle_size_t>(out);
}
- /// Gets the next value. Returns false if there are no more.
+ private:
+ /// The repeated value raw bytes stored inside the class with enough space
to store
+ /// up to a 64 bit value.
+ std::array<uint8_t, 8> data_ = {};
+ /// The number of time the value is repeated.
+ rle_size_t values_count_ = 0;
+ /// The size in bit of a packed value in the run.
+ rle_size_t value_bit_width_ = 0;
Review Comment:
I'm curious, why does this have to be a member of `RleRun`? The value bit
width is constant for the entire RleBitPacked-encoded data.
##########
cpp/src/arrow/util/rle_encoding_internal.h:
##########
@@ -82,86 +81,400 @@ namespace util {
/// 200 ints = 25 groups of 8
/// <varint((25 << 1) | 1)> <25 bytes of values, bitpacked>
/// (total 26 bytes, 1 byte overhead)
-//
-/// Decoder class for RLE encoded data.
-class RleDecoder {
+/// The type for an encoded Rle of BitPacked run size, between 1 and 2^31-1 as
per Parquet
+/// spec.
+/// This is also pragmatically used for other integer used in the Rle and
BitPacked runs
+/// and decoder to avoid conversions.
+/// It can therefore be referred to as a "typical" size for Rle and BitPacked
logic.
+using rle_size_t = int32_t;
+
+template <typename T>
+class RleRunDecoder;
+
+class RleRun {
public:
- /// Create a decoder object. buffer/buffer_len is the decoded data.
- /// bit_width is the width of each value (before encoding).
- RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width)
- : bit_reader_(buffer, buffer_len),
- bit_width_(bit_width),
- current_value_(0),
- repeat_count_(0),
- literal_count_(0) {
- ARROW_DCHECK_GE(bit_width_, 0);
- ARROW_DCHECK_LE(bit_width_, 64);
+ /// The decoder class used to decode a single run in the given type.
+ template <typename T>
+ using DecoderType = RleRunDecoder<T>;
+
+ constexpr RleRun() noexcept = default;
+
+ explicit RleRun(const uint8_t* data, rle_size_t values_count,
+ rle_size_t value_bit_width) noexcept
+ : values_count_(values_count), value_bit_width_(value_bit_width) {
+ ARROW_DCHECK_GE(value_bit_width, 0);
+ ARROW_DCHECK_GE(values_count, 0);
+ std::copy(data, data + raw_data_size(), data_.begin());
}
- RleDecoder() : bit_width_(-1) {}
+ /// The number of repeated values in this run.
+ constexpr rle_size_t values_count() const noexcept { return values_count_; }
- void Reset(const uint8_t* buffer, int buffer_len, int bit_width) {
- ARROW_DCHECK_GE(bit_width, 0);
- ARROW_DCHECK_LE(bit_width, 64);
- bit_reader_.Reset(buffer, buffer_len);
- bit_width_ = bit_width;
- current_value_ = 0;
- repeat_count_ = 0;
- literal_count_ = 0;
+ /// The size in bits of each encoded value.
+ constexpr rle_size_t values_bit_width() const noexcept { return
value_bit_width_; }
+
+ /// A pointer to the repeated value raw bytes.
+ constexpr const uint8_t* raw_data_ptr() const noexcept { return
data_.data(); }
+
+ /// The number of bytes used for the raw repeated value.
+ constexpr rle_size_t raw_data_size() const noexcept {
+ auto out = bit_util::BytesForBits(value_bit_width_);
+ ARROW_DCHECK_LE(out, std::numeric_limits<rle_size_t>::max());
+ return static_cast<rle_size_t>(out);
}
- /// Gets the next value. Returns false if there are no more.
+ private:
+ /// The repeated value raw bytes stored inside the class with enough space
to store
+ /// up to a 64 bit value.
+ std::array<uint8_t, 8> data_ = {};
+ /// The number of time the value is repeated.
+ rle_size_t values_count_ = 0;
+ /// The size in bit of a packed value in the run.
+ rle_size_t value_bit_width_ = 0;
+};
+
+template <typename T>
+class BitPackedRunDecoder;
+
+class BitPackedRun {
+ public:
+ /// The decoder class used to decode a single run in the given type.
+ template <typename T>
+ using DecoderType = BitPackedRunDecoder<T>;
+
+ constexpr BitPackedRun() noexcept = default;
+
+ constexpr BitPackedRun(const uint8_t* data, rle_size_t values_count,
+ rle_size_t value_bit_width) noexcept
+ : data_(data), values_count_(values_count),
value_bit_width_(value_bit_width) {
+ ARROW_CHECK_GE(value_bit_width_, 0);
+ ARROW_CHECK_GE(values_count_, 0);
+ }
+
+ constexpr rle_size_t values_count() const noexcept { return values_count_; }
+
+ /// The size in bits of each encoded value.
+ constexpr rle_size_t values_bit_width() const noexcept { return
value_bit_width_; }
+
+ constexpr const uint8_t* raw_data_ptr() const noexcept { return data_; }
+
+ constexpr rle_size_t raw_data_size() const noexcept {
+ auto out = bit_util::BytesForBits(static_cast<int64_t>(value_bit_width_) *
+ static_cast<int64_t>(values_count_));
+ ARROW_CHECK_LE(out, std::numeric_limits<rle_size_t>::max());
+ return static_cast<rle_size_t>(out);
+ }
+
+ private:
+ /// The pointer to the beginning of the run
+ const uint8_t* data_ = nullptr;
+ /// Number of values in this run.
+ rle_size_t values_count_ = 0;
+ /// The size in bit of a packed value in the run
+ rle_size_t value_bit_width_ = 0;
+};
+
+/// A parser that emits either a ``BitPackedRun`` or a ``RleRun``.
+class RleBitPackedParser {
+ public:
+ /// The different types of runs emitted by the parser
+ using dynamic_run_type = std::variant<RleRun, BitPackedRun>;
+
+ constexpr RleBitPackedParser() noexcept = default;
+
+ constexpr RleBitPackedParser(const uint8_t* data, rle_size_t data_size,
+ rle_size_t value_bit_width) noexcept
+ : data_(data), data_size_(data_size), value_bit_width_(value_bit_width)
{}
+
+ constexpr void Reset(const uint8_t* data, rle_size_t data_size,
+ rle_size_t value_bit_width) noexcept {
+ *this = {data, data_size, value_bit_width};
+ }
+
+ /// Whether there is still runs to iterate over.
+ ///
+ /// WARN: Due to simplistic error handling, iteration with Next and Peek
could
+ /// fail to return data while the parser is not exhausted.
+ /// This is how one can check for errors.
+ bool exhausted() const { return data_size_ == 0; }
+
+ /// Enum to return from an ``Parse`` handler.
+ ///
+ /// Since a callback has no way to know when to stop, the handler must return
+ /// a value indicating to the ``Parse`` function whether to stop or continue.
+ enum class ControlFlow {
+ Continue,
+ Break,
+ };
+
+ /// A callback approach to parsing.
+ ///
+ /// This approach is used to reduce the number of dynamic lookups involved
with using a
+ /// variant.
+ ///
+ /// The handler must be of the form
+ /// ```cpp
+ /// struct Handler {
+ /// ControlFlow OnBitPackedRun(BitPackedRun run);
+ ///
+ /// ControlFlow OnRleRun(RleRun run);
+ /// };
+ /// ```
+ template <typename Handler>
+ void Parse(Handler&& handler);
+
+ private:
+ /// The pointer to the beginning of the run
+ const uint8_t* data_ = nullptr;
+ /// Size in bytes of the run.
+ rle_size_t data_size_ = 0;
+ /// The size in bit of a packed value in the run
+ rle_size_t value_bit_width_ = 0;
Review Comment:
This could probably be `const`, if we don't care about this type being
movable. Not sure it would make a difference performance-wise...
##########
cpp/src/arrow/util/rle_encoding_internal.h:
##########
@@ -299,385 +612,663 @@ class RleEncoder {
uint8_t* literal_indicator_byte_;
};
+/************************
+ * RleBitPackedParser *
+ ************************/
+
+template <typename Handler>
+void RleBitPackedParser::Parse(Handler&& handler) {
+ while (!exhausted()) {
+ auto [read, control] = PeekImpl(handler);
+ data_ += read;
+ data_size_ -= read;
+ if (ARROW_PREDICT_FALSE(control == ControlFlow::Break)) {
+ break;
+ }
+ }
+}
+
+namespace internal {
+/// The maximal unsigned size that a variable can fit.
+template <typename T>
+constexpr auto max_size_for_v =
+ static_cast<std::make_unsigned_t<T>>(std::numeric_limits<T>::max());
+
+} // namespace internal
+
+template <typename Handler>
+auto RleBitPackedParser::PeekImpl(Handler&& handler) const
+ -> std::pair<rle_size_t, ControlFlow> {
+ ARROW_DCHECK(!exhausted());
+
+ constexpr auto kMaxSize = bit_util::kMaxLEB128ByteLenFor<uint32_t>;
+ uint32_t run_len_type = 0;
+ const auto header_bytes = bit_util::ParseLeadingLEB128(data_, kMaxSize,
&run_len_type);
+
+ if (ARROW_PREDICT_FALSE(header_bytes == 0)) {
+ // Malfomrmed LEB128 data
+ return {};
+ }
+
+ const bool is_bit_packed = run_len_type & 1;
+ const uint32_t count = run_len_type >> 1;
+ if (is_bit_packed) {
+ constexpr auto kMaxCount =
bit_util::CeilDiv(internal::max_size_for_v<rle_size_t>, 8);
+ if (ARROW_PREDICT_FALSE(count == 0 || count > kMaxCount)) {
+ // Illegal number of encoded values
+ return {0, ControlFlow::Break};
+ }
+
+ const auto values_count = static_cast<rle_size_t>(count * 8);
+ ARROW_DCHECK_LT(count, internal::max_size_for_v<rle_size_t>);
+ // Count Already divided by 8
+ const auto bytes_read =
+ header_bytes + static_cast<rle_size_t>(count) * value_bit_width_;
+
+ auto control = handler.OnBitPackedRun(
+ BitPackedRun(data_ + header_bytes, values_count, value_bit_width_));
+
+ return {bytes_read, control};
+ }
+
+ if (ARROW_PREDICT_FALSE(
+ count == 0 ||
+ count >
static_cast<uint32_t>(std::numeric_limits<rle_size_t>::max()))) {
+ // Illegal number of encoded values
+ return {0, ControlFlow::Break};
+ }
+
+ const auto values_count = static_cast<rle_size_t>(count);
+ const auto value_bytes = bit_util::BytesForBits(value_bit_width_);
+ ARROW_DCHECK_LT(value_bytes, internal::max_size_for_v<rle_size_t>);
+ const auto bytes_read = header_bytes + static_cast<rle_size_t>(value_bytes);
+
+ auto control =
+ handler.OnRleRun(RleRun(data_ + header_bytes, values_count,
value_bit_width_));
+
+ return {bytes_read, control};
+}
+
+/*************************
+ * RleBitPackedDecoder *
+ *************************/
+
template <typename T>
-inline bool RleDecoder::Get(T* val) {
+template <typename Callable>
+void RleBitPackedDecoder<T>::ParseWithCallable(Callable&& func) {
+ struct {
+ Callable func;
+ auto OnBitPackedRun(BitPackedRun run) { return func(std::move(run)); }
+ auto OnRleRun(RleRun run) { return func(std::move(run)); }
+ } handler{std::move(func)};
+
+ parser_.Parse(std::move(handler));
+}
+
+template <typename T>
+bool RleBitPackedDecoder<T>::Get(value_type* val) {
return GetBatch(val, 1) == 1;
}
template <typename T>
-inline int RleDecoder::GetBatch(T* values, int batch_size) {
- ARROW_DCHECK_GE(bit_width_, 0);
- int values_read = 0;
-
- auto* out = values;
-
- while (values_read < batch_size) {
- int remaining = batch_size - values_read;
-
- if (repeat_count_ > 0) { // Repeated value case.
- int repeat_batch = std::min(remaining, repeat_count_);
- std::fill(out, out + repeat_batch, static_cast<T>(current_value_));
-
- repeat_count_ -= repeat_batch;
- values_read += repeat_batch;
- out += repeat_batch;
- } else if (literal_count_ > 0) {
- int literal_batch = std::min(remaining, literal_count_);
- int actual_read = bit_reader_.GetBatch(bit_width_, out, literal_batch);
- if (actual_read != literal_batch) {
- return values_read;
- }
+auto RleBitPackedDecoder<T>::GetBatch(value_type* out, rle_size_t batch_size)
+ -> rle_size_t {
+ using ControlFlow = RleBitPackedParser::ControlFlow;
- literal_count_ -= literal_batch;
- values_read += literal_batch;
- out += literal_batch;
- } else {
- if (!NextCounts<T>()) return values_read;
+ rle_size_t values_read = 0;
+
+ // Remaining from a previous call that would have left some unread data from
a run.
+ if (ARROW_PREDICT_FALSE(run_remaining() > 0)) {
+ const auto read = RunGetBatch(out, batch_size);
+ values_read += read;
+ out += read;
+
+ // Either we fulfilled all the batch to be read or we finished remaining
run.
+ if (ARROW_PREDICT_FALSE(values_read == batch_size)) {
+ return values_read;
}
+ ARROW_DCHECK(run_remaining() == 0);
}
+ ParseWithCallable([&](auto run) {
+ using RunDecoder = typename decltype(run)::template
DecoderType<value_type>;
+
+ ARROW_DCHECK_LT(values_read, batch_size);
+ RunDecoder decoder(run);
+ const auto read = decoder.GetBatch(out, batch_size - values_read);
+ ARROW_DCHECK_LE(read, batch_size - values_read);
+ values_read += read;
+ out += read;
+
+ // Stop reading and store remaining decoder
+ if (ARROW_PREDICT_FALSE(values_read == batch_size || read == 0)) {
Review Comment:
We can just check `read == 0` I think.
##########
cpp/src/arrow/util/rle_encoding_internal.h:
##########
@@ -82,86 +81,400 @@ namespace util {
/// 200 ints = 25 groups of 8
/// <varint((25 << 1) | 1)> <25 bytes of values, bitpacked>
/// (total 26 bytes, 1 byte overhead)
-//
-/// Decoder class for RLE encoded data.
-class RleDecoder {
+/// The type for an encoded Rle of BitPacked run size, between 1 and 2^31-1 as
per Parquet
+/// spec.
+/// This is also pragmatically used for other integer used in the Rle and
BitPacked runs
+/// and decoder to avoid conversions.
+/// It can therefore be referred to as a "typical" size for Rle and BitPacked
logic.
+using rle_size_t = int32_t;
+
+template <typename T>
+class RleRunDecoder;
+
+class RleRun {
public:
- /// Create a decoder object. buffer/buffer_len is the decoded data.
- /// bit_width is the width of each value (before encoding).
- RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width)
- : bit_reader_(buffer, buffer_len),
- bit_width_(bit_width),
- current_value_(0),
- repeat_count_(0),
- literal_count_(0) {
- ARROW_DCHECK_GE(bit_width_, 0);
- ARROW_DCHECK_LE(bit_width_, 64);
+ /// The decoder class used to decode a single run in the given type.
+ template <typename T>
+ using DecoderType = RleRunDecoder<T>;
+
+ constexpr RleRun() noexcept = default;
+
+ explicit RleRun(const uint8_t* data, rle_size_t values_count,
+ rle_size_t value_bit_width) noexcept
+ : values_count_(values_count), value_bit_width_(value_bit_width) {
+ ARROW_DCHECK_GE(value_bit_width, 0);
+ ARROW_DCHECK_GE(values_count, 0);
+ std::copy(data, data + raw_data_size(), data_.begin());
}
- RleDecoder() : bit_width_(-1) {}
+ /// The number of repeated values in this run.
+ constexpr rle_size_t values_count() const noexcept { return values_count_; }
- void Reset(const uint8_t* buffer, int buffer_len, int bit_width) {
- ARROW_DCHECK_GE(bit_width, 0);
- ARROW_DCHECK_LE(bit_width, 64);
- bit_reader_.Reset(buffer, buffer_len);
- bit_width_ = bit_width;
- current_value_ = 0;
- repeat_count_ = 0;
- literal_count_ = 0;
+ /// The size in bits of each encoded value.
+ constexpr rle_size_t values_bit_width() const noexcept { return
value_bit_width_; }
+
+ /// A pointer to the repeated value raw bytes.
+ constexpr const uint8_t* raw_data_ptr() const noexcept { return
data_.data(); }
+
+ /// The number of bytes used for the raw repeated value.
+ constexpr rle_size_t raw_data_size() const noexcept {
+ auto out = bit_util::BytesForBits(value_bit_width_);
+ ARROW_DCHECK_LE(out, std::numeric_limits<rle_size_t>::max());
+ return static_cast<rle_size_t>(out);
}
- /// Gets the next value. Returns false if there are no more.
+ private:
+ /// The repeated value raw bytes stored inside the class with enough space
to store
+ /// up to a 64 bit value.
+ std::array<uint8_t, 8> data_ = {};
+ /// The number of time the value is repeated.
+ rle_size_t values_count_ = 0;
+ /// The size in bit of a packed value in the run.
+ rle_size_t value_bit_width_ = 0;
+};
+
+template <typename T>
+class BitPackedRunDecoder;
+
+class BitPackedRun {
+ public:
+ /// The decoder class used to decode a single run in the given type.
+ template <typename T>
+ using DecoderType = BitPackedRunDecoder<T>;
+
+ constexpr BitPackedRun() noexcept = default;
+
+ constexpr BitPackedRun(const uint8_t* data, rle_size_t values_count,
+ rle_size_t value_bit_width) noexcept
+ : data_(data), values_count_(values_count),
value_bit_width_(value_bit_width) {
+ ARROW_CHECK_GE(value_bit_width_, 0);
+ ARROW_CHECK_GE(values_count_, 0);
+ }
+
+ constexpr rle_size_t values_count() const noexcept { return values_count_; }
+
+ /// The size in bits of each encoded value.
+ constexpr rle_size_t values_bit_width() const noexcept { return
value_bit_width_; }
+
+ constexpr const uint8_t* raw_data_ptr() const noexcept { return data_; }
+
+ constexpr rle_size_t raw_data_size() const noexcept {
+ auto out = bit_util::BytesForBits(static_cast<int64_t>(value_bit_width_) *
+ static_cast<int64_t>(values_count_));
+ ARROW_CHECK_LE(out, std::numeric_limits<rle_size_t>::max());
+ return static_cast<rle_size_t>(out);
+ }
+
+ private:
+ /// The pointer to the beginning of the run
+ const uint8_t* data_ = nullptr;
+ /// Number of values in this run.
+ rle_size_t values_count_ = 0;
+ /// The size in bit of a packed value in the run
+ rle_size_t value_bit_width_ = 0;
Review Comment:
Same question of course.
##########
cpp/src/arrow/util/rle_encoding_internal.h:
##########
@@ -299,385 +612,663 @@ class RleEncoder {
uint8_t* literal_indicator_byte_;
};
+/************************
+ * RleBitPackedParser *
+ ************************/
+
+template <typename Handler>
+void RleBitPackedParser::Parse(Handler&& handler) {
+ while (!exhausted()) {
+ auto [read, control] = PeekImpl(handler);
+ data_ += read;
+ data_size_ -= read;
+ if (ARROW_PREDICT_FALSE(control == ControlFlow::Break)) {
+ break;
+ }
+ }
+}
+
+namespace internal {
+/// The maximal unsigned size that a variable can fit.
+template <typename T>
+constexpr auto max_size_for_v =
+ static_cast<std::make_unsigned_t<T>>(std::numeric_limits<T>::max());
+
+} // namespace internal
+
+template <typename Handler>
+auto RleBitPackedParser::PeekImpl(Handler&& handler) const
+ -> std::pair<rle_size_t, ControlFlow> {
+ ARROW_DCHECK(!exhausted());
+
+ constexpr auto kMaxSize = bit_util::kMaxLEB128ByteLenFor<uint32_t>;
+ uint32_t run_len_type = 0;
+ const auto header_bytes = bit_util::ParseLeadingLEB128(data_, kMaxSize,
&run_len_type);
+
+ if (ARROW_PREDICT_FALSE(header_bytes == 0)) {
+ // Malfomrmed LEB128 data
+ return {};
Review Comment:
Let's spell `{0, ControlFlow::Break}` explicitly?
##########
cpp/src/arrow/util/rle_encoding_internal.h:
##########
@@ -299,385 +612,663 @@ class RleEncoder {
uint8_t* literal_indicator_byte_;
};
+/************************
+ * RleBitPackedParser *
+ ************************/
+
+template <typename Handler>
+void RleBitPackedParser::Parse(Handler&& handler) {
+ while (!exhausted()) {
+ auto [read, control] = PeekImpl(handler);
+ data_ += read;
+ data_size_ -= read;
+ if (ARROW_PREDICT_FALSE(control == ControlFlow::Break)) {
+ break;
+ }
+ }
+}
+
+namespace internal {
+/// The maximal unsigned size that a variable can fit.
+template <typename T>
+constexpr auto max_size_for_v =
+ static_cast<std::make_unsigned_t<T>>(std::numeric_limits<T>::max());
+
+} // namespace internal
+
+template <typename Handler>
+auto RleBitPackedParser::PeekImpl(Handler&& handler) const
+ -> std::pair<rle_size_t, ControlFlow> {
+ ARROW_DCHECK(!exhausted());
+
+ constexpr auto kMaxSize = bit_util::kMaxLEB128ByteLenFor<uint32_t>;
+ uint32_t run_len_type = 0;
+ const auto header_bytes = bit_util::ParseLeadingLEB128(data_, kMaxSize,
&run_len_type);
+
+ if (ARROW_PREDICT_FALSE(header_bytes == 0)) {
+ // Malfomrmed LEB128 data
+ return {};
+ }
+
+ const bool is_bit_packed = run_len_type & 1;
+ const uint32_t count = run_len_type >> 1;
+ if (is_bit_packed) {
+ constexpr auto kMaxCount =
bit_util::CeilDiv(internal::max_size_for_v<rle_size_t>, 8);
+ if (ARROW_PREDICT_FALSE(count == 0 || count > kMaxCount)) {
+ // Illegal number of encoded values
+ return {0, ControlFlow::Break};
+ }
+
+ const auto values_count = static_cast<rle_size_t>(count * 8);
+ ARROW_DCHECK_LT(count, internal::max_size_for_v<rle_size_t>);
Review Comment:
Do you mean `values_count` here?
##########
cpp/src/arrow/util/rle_encoding_internal.h:
##########
@@ -299,385 +612,663 @@ class RleEncoder {
uint8_t* literal_indicator_byte_;
};
+/************************
+ * RleBitPackedParser *
+ ************************/
+
+template <typename Handler>
+void RleBitPackedParser::Parse(Handler&& handler) {
+ while (!exhausted()) {
+ auto [read, control] = PeekImpl(handler);
+ data_ += read;
+ data_size_ -= read;
+ if (ARROW_PREDICT_FALSE(control == ControlFlow::Break)) {
+ break;
+ }
+ }
+}
+
+namespace internal {
+/// The maximal unsigned size that a variable can fit.
+template <typename T>
+constexpr auto max_size_for_v =
+ static_cast<std::make_unsigned_t<T>>(std::numeric_limits<T>::max());
+
+} // namespace internal
+
+template <typename Handler>
+auto RleBitPackedParser::PeekImpl(Handler&& handler) const
+ -> std::pair<rle_size_t, ControlFlow> {
+ ARROW_DCHECK(!exhausted());
+
+ constexpr auto kMaxSize = bit_util::kMaxLEB128ByteLenFor<uint32_t>;
+ uint32_t run_len_type = 0;
+ const auto header_bytes = bit_util::ParseLeadingLEB128(data_, kMaxSize,
&run_len_type);
+
+ if (ARROW_PREDICT_FALSE(header_bytes == 0)) {
+ // Malfomrmed LEB128 data
+ return {};
+ }
+
+ const bool is_bit_packed = run_len_type & 1;
+ const uint32_t count = run_len_type >> 1;
+ if (is_bit_packed) {
+ constexpr auto kMaxCount =
bit_util::CeilDiv(internal::max_size_for_v<rle_size_t>, 8);
+ if (ARROW_PREDICT_FALSE(count == 0 || count > kMaxCount)) {
+ // Illegal number of encoded values
+ return {0, ControlFlow::Break};
+ }
+
+ const auto values_count = static_cast<rle_size_t>(count * 8);
+ ARROW_DCHECK_LT(count, internal::max_size_for_v<rle_size_t>);
+ // Count Already divided by 8
+ const auto bytes_read =
+ header_bytes + static_cast<rle_size_t>(count) * value_bit_width_;
+
+ auto control = handler.OnBitPackedRun(
+ BitPackedRun(data_ + header_bytes, values_count, value_bit_width_));
+
+ return {bytes_read, control};
+ }
+
+ if (ARROW_PREDICT_FALSE(
+ count == 0 ||
+ count >
static_cast<uint32_t>(std::numeric_limits<rle_size_t>::max()))) {
Review Comment:
This is not possible, right? `count` is initially in [0, 2^32 - 1], but then
it's shifted to the right by one bit.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]