pitrou commented on code in PR #47294:
URL: https://github.com/apache/arrow/pull/47294#discussion_r2367097752
##########
cpp/src/arrow/util/rle_encoding_internal.h:
##########
@@ -82,86 +81,400 @@ namespace util {
/// 200 ints = 25 groups of 8
/// <varint((25 << 1) | 1)> <25 bytes of values, bitpacked>
/// (total 26 bytes, 1 byte overhead)
-//
-/// Decoder class for RLE encoded data.
-class RleDecoder {
+/// The type for an encoded Rle of BitPacked run size, between 1 and 2^31-1 as
per Parquet
+/// spec.
+/// This is also pragmatically used for other integer used in the Rle and
BitPacked runs
+/// and decoder to avoid conversions.
+/// It can therefore be referred to as a "typical" size for Rle and BitPacked
logic.
+using rle_size_t = int32_t;
+
+template <typename T>
+class RleRunDecoder;
+
+class RleRun {
public:
- /// Create a decoder object. buffer/buffer_len is the decoded data.
- /// bit_width is the width of each value (before encoding).
- RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width)
- : bit_reader_(buffer, buffer_len),
- bit_width_(bit_width),
- current_value_(0),
- repeat_count_(0),
- literal_count_(0) {
- ARROW_DCHECK_GE(bit_width_, 0);
- ARROW_DCHECK_LE(bit_width_, 64);
+ /// The decoder class used to decode a single run in the given type.
+ template <typename T>
+ using DecoderType = RleRunDecoder<T>;
+
+ constexpr RleRun() noexcept = default;
+
+ explicit RleRun(const uint8_t* data, rle_size_t values_count,
+ rle_size_t value_bit_width) noexcept
+ : values_count_(values_count), value_bit_width_(value_bit_width) {
+ ARROW_DCHECK_GE(value_bit_width, 0);
+ ARROW_DCHECK_GE(values_count, 0);
+ std::copy(data, data + raw_data_size(), data_.begin());
}
- RleDecoder() : bit_width_(-1) {}
+ /// The number of repeated values in this run.
+ constexpr rle_size_t values_count() const noexcept { return values_count_; }
- void Reset(const uint8_t* buffer, int buffer_len, int bit_width) {
- ARROW_DCHECK_GE(bit_width, 0);
- ARROW_DCHECK_LE(bit_width, 64);
- bit_reader_.Reset(buffer, buffer_len);
- bit_width_ = bit_width;
- current_value_ = 0;
- repeat_count_ = 0;
- literal_count_ = 0;
+ /// The size in bits of each encoded value.
+ constexpr rle_size_t values_bit_width() const noexcept { return
value_bit_width_; }
+
+ /// A pointer to the repeated value raw bytes.
+ constexpr const uint8_t* raw_data_ptr() const noexcept { return
data_.data(); }
+
+ /// The number of bytes used for the raw repeated value.
+ constexpr rle_size_t raw_data_size() const noexcept {
+ auto out = bit_util::BytesForBits(value_bit_width_);
+ ARROW_DCHECK_LE(out, std::numeric_limits<rle_size_t>::max());
+ return static_cast<rle_size_t>(out);
}
- /// Gets the next value. Returns false if there are no more.
+ private:
+ /// The repeated value raw bytes stored inside the class with enough space
to store
+ /// up to a 64 bit value.
+ std::array<uint8_t, 8> data_ = {};
+ /// The number of time the value is repeated.
+ rle_size_t values_count_ = 0;
+ /// The size in bit of a packed value in the run.
+ rle_size_t value_bit_width_ = 0;
Review Comment:
I think the main question for me is whether the optimizer is able to deduce
that the bit width is constant across the entire decoding process. If it is,
then fine.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]