This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 52ff63c220 refactor(parquet): replace magic `8` literals with named
constants (#9751)
52ff63c220 is described below
commit 52ff63c220f6abcd802ac57f329879fb0145009e
Author: Hippolyte Barraud <[email protected]>
AuthorDate: Tue Apr 21 10:20:06 2026 -0400
refactor(parquet): replace magic `8` literals with named constants (#9751)
# Which issue does this PR close?
- Spawn off from #9653
- Contributes to #9731
# Rationale for this change
The literal `8` appeared in two distinct roles throughout `RleEncoder`,
`RleDecoder`, and their tests.
# What changes are included in this PR?
Replacing each with a named constant makes the intent explicit and
prevents the two meanings from being confused.
* `BIT_PACK_GROUP_SIZE = 8` The Parquet RLE/bit-packing hybrid format
always bit-packs values in multiples of this count (spec: "we always
bit-pack a multiple of 8 values at a time"). Every occurrence related to
the staging buffer size, the repeat-count threshold that triggers the
RLE decision, and the group-count arithmetic in bit-packed headers now
uses this name.
* `u8::BITS` (= 8, from std) Used wherever a bit-count is divided by 8
to obtain a byte-count (e.g. `ceil(bit_width, u8::BITS as usize)`). This
is a bits-per-byte conversion, a fundamentally different concept from
the packing-group size.
No behaviour change.
# Are these changes tested?
All tests passing.
# Are there any user-facing changes?
None.
Signed-off-by: Hippolyte Barraud <[email protected]>
Co-authored-by: Andrew Lamb <[email protected]>
---
parquet/src/encodings/rle.rs | 99 +++++++++++++++++++++++++-------------------
1 file changed, 57 insertions(+), 42 deletions(-)
diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs
index 937be1dd2c..806b41a353 100644
--- a/parquet/src/encodings/rle.rs
+++ b/parquet/src/encodings/rle.rs
@@ -41,7 +41,13 @@ use bytes::Bytes;
use crate::errors::{ParquetError, Result};
use crate::util::bit_util::{self, BitReader, BitWriter, FromBitpacked};
-/// Maximum groups of 8 values per bit-packed run. Current value is 64.
+/// Number of values in one bit-packed group. The Parquet RLE/bit-packing
hybrid
+/// format always bit-packs values in multiples of this count (see the
+/// [format
spec](https://github.com/apache/parquet-format/blob/master/Encodings.md#run-length-encoding--bit-packing-hybrid-rle--3):
+/// "we always bit-pack a multiple of 8 values at a time").
+const BIT_PACK_GROUP_SIZE: usize = 8;
+
+/// Maximum groups of `BIT_PACK_GROUP_SIZE` values per bit-packed run. Current
value is 64.
const MAX_GROUPS_PER_BIT_PACKED_RUN: usize = 1 << 6;
/// A RLE/Bit-Packing hybrid encoder.
@@ -54,9 +60,9 @@ pub struct RleEncoder {
bit_writer: BitWriter,
// Buffered values for bit-packed runs.
- buffered_values: [u64; 8],
+ buffered_values: [u64; BIT_PACK_GROUP_SIZE],
- // Number of current buffered values. Must be less than 8.
+ // Number of current buffered values. Must be less than
BIT_PACK_GROUP_SIZE.
num_buffered_values: usize,
// The current (also last) value that was written and the count of how many
@@ -89,7 +95,7 @@ impl RleEncoder {
RleEncoder {
bit_width,
bit_writer,
- buffered_values: [0; 8],
+ buffered_values: [0; BIT_PACK_GROUP_SIZE],
num_buffered_values: 0,
current_value: 0,
repeat_count: 0,
@@ -101,22 +107,23 @@ impl RleEncoder {
/// Returns the maximum buffer size to encode `num_values` values with
/// `bit_width`.
pub fn max_buffer_size(bit_width: u8, num_values: usize) -> usize {
- // The maximum size occurs with the shortest possible runs of 8
- let num_runs = bit_util::ceil(num_values, 8);
+ // The maximum size occurs with the shortest possible runs of
BIT_PACK_GROUP_SIZE
+ let num_runs = bit_util::ceil(num_values, BIT_PACK_GROUP_SIZE);
- // The number of bytes in a run of 8
+ // The number of bytes in a run of BIT_PACK_GROUP_SIZE
let bytes_per_run = bit_width as usize;
- // The maximum size if stored as shortest possible bit packed runs of 8
+ // The maximum size if stored as shortest possible bit packed runs of
BIT_PACK_GROUP_SIZE
let bit_packed_max_size = num_runs + num_runs * bytes_per_run;
- // The length of `8` VLQ encoded
+ // The length of `BIT_PACK_GROUP_SIZE` VLQ encoded
let rle_len_prefix = 1;
- // The length of an RLE run of 8
- let min_rle_run_size = rle_len_prefix + bit_util::ceil(bit_width as
usize, 8);
+ // The length of an RLE run of BIT_PACK_GROUP_SIZE
+ let min_rle_run_size =
+ rle_len_prefix + bit_util::ceil(bit_width as usize, u8::BITS as
usize);
- // The maximum size if stored as shortest possible RLE runs of 8
+ // The maximum size if stored as shortest possible RLE runs of
BIT_PACK_GROUP_SIZE
let rle_max_size = num_runs * min_rle_run_size;
bit_packed_max_size.max(rle_max_size)
@@ -125,16 +132,17 @@ impl RleEncoder {
/// Encodes `value`, which must be representable with `bit_width` bits.
#[inline]
pub fn put(&mut self, value: u64) {
- // This function buffers 8 values at a time. After seeing 8 values, it
- // decides whether the current run should be encoded in bit-packed or
RLE.
+ // This function buffers BIT_PACK_GROUP_SIZE values at a time. After
seeing that
+ // many values, it decides whether the current run should be encoded
in bit-packed
+ // or RLE.
if self.current_value == value {
self.repeat_count += 1;
- if self.repeat_count > 8 {
+ if self.repeat_count > BIT_PACK_GROUP_SIZE {
// A continuation of last value. No need to buffer.
return;
}
} else {
- if self.repeat_count >= 8 {
+ if self.repeat_count >= BIT_PACK_GROUP_SIZE {
// The current RLE run has ended and we've gathered enough.
Flush first.
debug_assert_eq!(self.bit_packed_count, 0);
self.flush_rle_run();
@@ -145,9 +153,9 @@ impl RleEncoder {
self.buffered_values[self.num_buffered_values] = value;
self.num_buffered_values += 1;
- if self.num_buffered_values == 8 {
+ if self.num_buffered_values == BIT_PACK_GROUP_SIZE {
// Buffered values are full. Flush them.
- debug_assert_eq!(self.bit_packed_count % 8, 0);
+ debug_assert_eq!(self.bit_packed_count % BIT_PACK_GROUP_SIZE, 0);
self.flush_buffered_values();
}
}
@@ -219,9 +227,9 @@ impl RleEncoder {
if self.repeat_count > 0 && all_repeat {
self.flush_rle_run();
} else {
- // Buffer the last group of bit-packed values to 8 by padding
with 0s.
+ // Buffer the last group of bit-packed values to
BIT_PACK_GROUP_SIZE by padding with 0s.
if self.num_buffered_values > 0 {
- while self.num_buffered_values < 8 {
+ while self.num_buffered_values < BIT_PACK_GROUP_SIZE {
self.buffered_values[self.num_buffered_values] = 0;
self.num_buffered_values += 1;
}
@@ -239,7 +247,7 @@ impl RleEncoder {
self.bit_writer.put_vlq_int(indicator_value as u64);
self.bit_writer.put_aligned(
self.current_value,
- bit_util::ceil(self.bit_width as usize, 8),
+ bit_util::ceil(self.bit_width as usize, u8::BITS as usize),
);
self.num_buffered_values = 0;
self.repeat_count = 0;
@@ -263,7 +271,7 @@ impl RleEncoder {
// Called when ending a bit-packed run. Writes the indicator byte to the
reserved
// position in `bit_writer`
fn finish_bit_packed_run(&mut self) {
- let num_groups = self.bit_packed_count / 8;
+ let num_groups = self.bit_packed_count / BIT_PACK_GROUP_SIZE;
let indicator_byte = ((num_groups << 1) | 1) as u8;
self.bit_writer
.put_aligned_offset(indicator_byte, 1, self.indicator_byte_pos as
usize);
@@ -272,20 +280,20 @@ impl RleEncoder {
}
fn flush_buffered_values(&mut self) {
- if self.repeat_count >= 8 {
+ if self.repeat_count >= BIT_PACK_GROUP_SIZE {
// Clear buffered values as they are not needed
self.num_buffered_values = 0;
if self.bit_packed_count > 0 {
// In this case we have chosen to switch to RLE encoding.
Close out the
// previous bit-packed run.
- debug_assert_eq!(self.bit_packed_count % 8, 0);
+ debug_assert_eq!(self.bit_packed_count % BIT_PACK_GROUP_SIZE,
0);
self.finish_bit_packed_run();
}
return;
}
self.bit_packed_count += self.num_buffered_values;
- let num_groups = self.bit_packed_count / 8;
+ let num_groups = self.bit_packed_count / BIT_PACK_GROUP_SIZE;
if num_groups + 1 >= MAX_GROUPS_PER_BIT_PACKED_RUN {
// We've reached the maximum value that can be hold in a single
bit-packed
// run.
@@ -359,7 +367,7 @@ impl RleDecoder {
#[inline(never)]
#[allow(unused)]
pub fn get<T: FromBitpacked>(&mut self) -> Result<Option<T>> {
- assert!(size_of::<T>() <= 8);
+ assert!(size_of::<T>() <= size_of::<u64>());
while self.rle_left == 0 && self.bit_packed_left == 0 {
if !self.reload()? {
@@ -395,7 +403,7 @@ impl RleDecoder {
#[inline(never)]
pub fn get_batch<T: FromBitpacked + Clone>(&mut self, buffer: &mut [T]) ->
Result<usize> {
- assert!(size_of::<T>() <= 8);
+ assert!(size_of::<T>() <= size_of::<u64>());
let mut values_read = 0;
while values_read < buffer.len() {
@@ -594,10 +602,10 @@ impl RleDecoder {
return Ok(false);
}
if indicator_value & 1 == 1 {
- self.bit_packed_left = ((indicator_value >> 1) * 8) as u32;
+ self.bit_packed_left = ((indicator_value >> 1) *
BIT_PACK_GROUP_SIZE as i64) as u32;
} else {
self.rle_left = (indicator_value >> 1) as u32;
- let value_width = bit_util::ceil(self.bit_width as usize, 8);
+ let value_width = bit_util::ceil(self.bit_width as usize,
u8::BITS as usize);
self.current_value =
bit_reader.get_aligned::<u64>(value_width);
self.current_value.ok_or_else(|| {
general_err!("parquet_data_error: not enough data for RLE
decoding")
@@ -626,7 +634,7 @@ mod tests {
let data = vec![0x03, 0x88, 0xC6, 0xFA];
let mut decoder: RleDecoder = RleDecoder::new(3);
decoder.set_data(data.into()).unwrap();
- let mut buffer = vec![0; 8];
+ let mut buffer = vec![0; BIT_PACK_GROUP_SIZE];
let expected = vec![0, 1, 2, 3, 4, 5, 6, 7];
let result = decoder.get_batch::<i32>(&mut buffer);
assert!(result.is_ok());
@@ -810,14 +818,18 @@ mod tests {
let data = vec![0x03, 0x63, 0xC7, 0x8E, 0x03, 0x65, 0x0B];
let mut decoder: RleDecoder = RleDecoder::new(3);
decoder.set_data(data.into()).unwrap();
- let mut buffer = vec![""; 8];
+ let mut buffer = vec![""; BIT_PACK_GROUP_SIZE];
let expected = vec!["eee", "fff", "ddd", "eee", "fff", "eee", "fff",
"fff"];
let skipped = decoder.skip(4).expect("skipping four values");
assert_eq!(skipped, 4);
let remainder = decoder
- .get_batch_with_dict::<&str>(dict.as_slice(),
buffer.as_mut_slice(), 8)
+ .get_batch_with_dict::<&str>(
+ dict.as_slice(),
+ buffer.as_mut_slice(),
+ BIT_PACK_GROUP_SIZE,
+ )
.expect("getting remainder");
- assert_eq!(remainder, 8);
+ assert_eq!(remainder, BIT_PACK_GROUP_SIZE);
assert_eq!(buffer, expected);
}
@@ -879,7 +891,7 @@ mod tests {
&values[..],
width as u8,
None,
- 2 * (1 + bit_util::ceil(width as i64, 8) as i32),
+ 2 * (1 + bit_util::ceil(width as i64, u8::BITS as i64) as i32),
);
}
@@ -889,9 +901,12 @@ mod tests {
for i in 0..101 {
values.push(i % 2);
}
- let num_groups = bit_util::ceil(100, 8) as u8;
+ let num_groups = bit_util::ceil(100, BIT_PACK_GROUP_SIZE) as u8;
expected_buffer.push((num_groups << 1) | 1);
- expected_buffer.resize(expected_buffer.len() + 100 / 8, 0b10101010);
+ expected_buffer.resize(
+ expected_buffer.len() + 100 / BIT_PACK_GROUP_SIZE,
+ 0b10101010,
+ );
// For the last 4 0 and 1's, padded with 0.
expected_buffer.push(0b00001010);
@@ -902,12 +917,12 @@ mod tests {
1 + num_groups as i32,
);
for width in 2..MAX_WIDTH + 1 {
- let num_values = bit_util::ceil(100, 8) * 8;
+ let num_values = bit_util::ceil(100, BIT_PACK_GROUP_SIZE) *
BIT_PACK_GROUP_SIZE;
validate_rle(
&values,
width as u8,
None,
- 1 + bit_util::ceil(width as i64 * num_values, 8) as i32,
+ 1 + bit_util::ceil(width as i64 * num_values as i64, u8::BITS
as i64) as i32,
);
}
}
@@ -1001,9 +1016,9 @@ mod tests {
.get_batch(&mut actual_values)
.expect("get_batch() should be OK");
- // Should decode 8 values despite only encoding 6 as length of
- // bit packed run is always multiple of 8
- assert_eq!(r, 8);
+ // Should decode BIT_PACK_GROUP_SIZE values despite only encoding 6 as
length of
+ // bit packed run is always a multiple of BIT_PACK_GROUP_SIZE
+ assert_eq!(r, BIT_PACK_GROUP_SIZE);
assert_eq!(actual_values[..6], values);
assert_eq!(actual_values[6], 0);
assert_eq!(actual_values[7], 0);
@@ -1024,7 +1039,7 @@ mod tests {
let num_values = 2002;
// bit-packed header
- let run_bytes = ceil(num_values * bit_width, 8) as u64;
+ let run_bytes = ceil(num_values * bit_width, u8::BITS as usize) as u64;
writer.put_vlq_int((run_bytes << 1) | 1);
for _ in 0..run_bytes {
writer.put_aligned(0xFF_u8, 1);