This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new ade038153a No longer allow BIT_PACKED level encoding in Parquet writer
(#9656)
ade038153a is described below
commit ade038153a66464c56e21e350085bfcf950be09f
Author: Ed Seidl <[email protected]>
AuthorDate: Fri Apr 3 13:55:38 2026 -0700
No longer allow BIT_PACKED level encoding in Parquet writer (#9656)
# Which issue does this PR close?
- Closes #9635.
# Rationale for this change
The `BIT_PACKED` encoding for repetition and definition levels has long
been deprecated. This change removes the ability to write levels with it.
# What changes are included in this PR?
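- `LevelEncoder::v1_streaming` no longer takes an `Encoding` argument and
always creates an RLE encoder.
- The `LevelEncoder::BitPacked` variant and its match arms are removed, along
with the now-unused `Encoding` and `BitWriter` imports in
`parquet/src/encodings/levels.rs`.
- The callers in `parquet/src/column/writer/mod.rs` and
`parquet/src/util/test_common/page_util.rs` are updated to the new signature.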
# Are these changes tested?
Yes; they should be covered by existing tests.
# Are there any user-facing changes?
No. The only changes are to APIs marked "experimental".
---
parquet/src/column/writer/mod.rs | 2 +-
parquet/src/encodings/levels.rs | 34 +++++++------------------------
parquet/src/util/test_common/page_util.rs | 2 +-
3 files changed, 9 insertions(+), 29 deletions(-)
diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index cdf489f3b6..46f90d3f77 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -650,7 +650,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
/// Creates a new streaming level encoder appropriate for the writer
version.
fn create_level_encoder(max_level: i16, props: &WriterProperties) ->
LevelEncoder {
match props.writer_version() {
- WriterVersion::PARQUET_1_0 =>
LevelEncoder::v1_streaming(Encoding::RLE, max_level),
+ WriterVersion::PARQUET_1_0 =>
LevelEncoder::v1_streaming(max_level),
WriterVersion::PARQUET_2_0 =>
LevelEncoder::v2_streaming(max_level),
}
}
diff --git a/parquet/src/encodings/levels.rs b/parquet/src/encodings/levels.rs
index b761a5ac5d..5d85e240cb 100644
--- a/parquet/src/encodings/levels.rs
+++ b/parquet/src/encodings/levels.rs
@@ -19,43 +19,32 @@ use std::mem;
use super::rle::RleEncoder;
-use crate::basic::Encoding;
use crate::data_type::AsBytes;
-use crate::util::bit_util::{BitWriter, num_required_bits};
+use crate::util::bit_util::num_required_bits;
/// Encoder for definition/repetition levels.
/// Currently only supports Rle and BitPacked (dev/null) encoding, including
v2.
pub enum LevelEncoder {
Rle(RleEncoder),
RleV2(RleEncoder),
- BitPacked(u8, BitWriter),
}
impl LevelEncoder {
/// Creates a new streaming level encoder for Data Page v1.
///
- /// Unlike [`v1`](Self::v1), this does not require knowing the number of
values
+ /// This does not require knowing the number of values
/// upfront, making it suitable for incremental encoding where levels are
fed in
/// as they arrive via [`put`](Self::put).
- pub fn v1_streaming(encoding: Encoding, max_level: i16) -> Self {
+ pub fn v1_streaming(max_level: i16) -> Self {
let bit_width = num_required_bits(max_level as u64);
- match encoding {
- Encoding::RLE => {
- // Reserve space for length header
- let buffer = vec![0u8; 4];
- LevelEncoder::Rle(RleEncoder::new_from_buf(bit_width, buffer))
- }
- #[allow(deprecated)]
- Encoding::BIT_PACKED => {
- LevelEncoder::BitPacked(bit_width,
BitWriter::new_from_buf(Vec::new()))
- }
- _ => panic!("Unsupported encoding type {encoding}"),
- }
+ // Reserve space for length header
+ let buffer = vec![0u8; 4];
+ LevelEncoder::Rle(RleEncoder::new_from_buf(bit_width, buffer))
}
/// Creates a new streaming RLE level encoder for Data Page v2.
///
- /// Unlike [`v2`](Self::v2), this does not require knowing the number of
values
+ /// This does not require knowing the number of values
/// upfront, making it suitable for incremental encoding where levels are
fed in
/// as they arrive via [`put`](Self::put).
pub fn v2_streaming(max_level: i16) -> Self {
@@ -80,12 +69,6 @@ impl LevelEncoder {
num_encoded += 1;
}
}
- LevelEncoder::BitPacked(bit_width, ref mut encoder) => {
- for value in buffer {
- encoder.put_value(*value as u64, bit_width as usize);
- num_encoded += 1;
- }
- }
}
num_encoded
}
@@ -106,7 +89,6 @@ impl LevelEncoder {
encoded_data
}
LevelEncoder::RleV2(encoder) => encoder.consume(),
- LevelEncoder::BitPacked(_, encoder) => encoder.consume(),
}
}
@@ -126,7 +108,6 @@ impl LevelEncoder {
f(data)
}
LevelEncoder::RleV2(encoder) => f(encoder.flush_buffer()),
- LevelEncoder::BitPacked(_, encoder) => f(encoder.flush_buffer()),
};
match self {
LevelEncoder::Rle(encoder) => {
@@ -135,7 +116,6 @@ impl LevelEncoder {
encoder.skip(mem::size_of::<i32>());
}
LevelEncoder::RleV2(encoder) => encoder.clear(),
- LevelEncoder::BitPacked(_, encoder) => encoder.clear(),
}
result
}
diff --git a/parquet/src/util/test_common/page_util.rs
b/parquet/src/util/test_common/page_util.rs
index 6a99beaea1..7797427872 100644
--- a/parquet/src/util/test_common/page_util.rs
+++ b/parquet/src/util/test_common/page_util.rs
@@ -75,7 +75,7 @@ impl DataPageBuilderImpl {
if max_level <= 0 {
return 0;
}
- let mut level_encoder = LevelEncoder::v1_streaming(Encoding::RLE,
max_level);
+ let mut level_encoder = LevelEncoder::v1_streaming(max_level);
level_encoder.put(levels);
let encoded_levels = level_encoder.consume();
// Actual encoded bytes (without length offset)