This is an automated email from the ASF dual-hosted git repository.
liurenjie1024 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-rust.git
The following commit(s) were added to refs/heads/main by this push:
new 2944ccb55 fix: Serialize `split_offsets` as null when empty (#1906)
2944ccb55 is described below
commit 2944ccb551ec7c07ccc802a8edc01d5eb2ab6b87
Author: Andrea Bozzo <[email protected]>
AuthorDate: Wed Dec 10 08:11:33 2025 +0100
fix: Serialize `split_offsets` as null when empty (#1906)
- Change `split_offsets` in `DataFile` from `Vec<i64>` to
`Option<Vec<i64>>`
- Empty values now serialize as `null` instead of `[]`
- Aligns with Iceberg spec (field is optional)
Closes #1897
---------
Co-authored-by: Renjie Liu <[email protected]>
---
bindings/python/src/data_file.rs | 2 +-
.../src/expr/visitors/expression_evaluator.rs | 4 ++--
.../expr/visitors/inclusive_metrics_evaluator.rs | 12 +++++------
.../src/expr/visitors/strict_metrics_evaluator.rs | 8 ++++----
crates/iceberg/src/spec/manifest/_serde.rs | 6 +++---
crates/iceberg/src/spec/manifest/data_file.rs | 10 +++++----
crates/iceberg/src/spec/manifest/mod.rs | 24 +++++++++++-----------
crates/iceberg/src/spec/manifest/writer.rs | 6 +++---
crates/iceberg/src/spec/snapshot_summary.rs | 10 ++++-----
.../writer/base_writer/equality_delete_writer.rs | 16 +++++++--------
.../src/writer/file_writer/parquet_writer.rs | 4 ++--
11 files changed, 52 insertions(+), 50 deletions(-)
diff --git a/bindings/python/src/data_file.rs b/bindings/python/src/data_file.rs
index 900d6c601..b0e42e7d7 100644
--- a/bindings/python/src/data_file.rs
+++ b/bindings/python/src/data_file.rs
@@ -143,7 +143,7 @@ impl PyDataFile {
}
#[getter]
- fn split_offsets(&self) -> &[i64] {
+ fn split_offsets(&self) -> Option<&[i64]> {
self.inner.split_offsets()
}
diff --git a/crates/iceberg/src/expr/visitors/expression_evaluator.rs
b/crates/iceberg/src/expr/visitors/expression_evaluator.rs
index 3675ce355..570c40950 100644
--- a/crates/iceberg/src/expr/visitors/expression_evaluator.rs
+++ b/crates/iceberg/src/expr/visitors/expression_evaluator.rs
@@ -346,7 +346,7 @@ mod tests {
lower_bounds: HashMap::new(),
upper_bounds: HashMap::new(),
key_metadata: None,
- split_offsets: vec![],
+ split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
@@ -374,7 +374,7 @@ mod tests {
lower_bounds: HashMap::new(),
upper_bounds: HashMap::new(),
key_metadata: None,
- split_offsets: vec![],
+ split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
diff --git a/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs
b/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs
index 2b65cf12a..06c92ab3e 100644
--- a/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs
+++ b/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs
@@ -1995,7 +1995,7 @@ mod test {
lower_bounds: Default::default(),
upper_bounds: Default::default(),
key_metadata: None,
- split_offsets: vec![],
+ split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
@@ -2021,7 +2021,7 @@ mod test {
lower_bounds: Default::default(),
upper_bounds: Default::default(),
key_metadata: None,
- split_offsets: vec![],
+ split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
@@ -2083,7 +2083,7 @@ mod test {
column_sizes: Default::default(),
key_metadata: None,
- split_offsets: vec![],
+ split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
@@ -2114,7 +2114,7 @@ mod test {
column_sizes: Default::default(),
key_metadata: None,
- split_offsets: vec![],
+ split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
@@ -2146,7 +2146,7 @@ mod test {
column_sizes: Default::default(),
key_metadata: None,
- split_offsets: vec![],
+ split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
@@ -2178,7 +2178,7 @@ mod test {
column_sizes: Default::default(),
key_metadata: None,
- split_offsets: vec![],
+ split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
diff --git a/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs
b/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs
index 7c652e206..a6af2990c 100644
--- a/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs
+++ b/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs
@@ -578,7 +578,7 @@ mod test {
]),
column_sizes: Default::default(),
key_metadata: None,
- split_offsets: vec![],
+ split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
@@ -604,7 +604,7 @@ mod test {
lower_bounds: Default::default(),
upper_bounds: Default::default(),
key_metadata: None,
- split_offsets: vec![],
+ split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
@@ -630,7 +630,7 @@ mod test {
upper_bounds: HashMap::from([(1, Datum::int(42))]),
column_sizes: Default::default(),
key_metadata: None,
- split_offsets: vec![],
+ split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
@@ -657,7 +657,7 @@ mod test {
upper_bounds: HashMap::from([(3, Datum::string("dC"))]),
column_sizes: Default::default(),
key_metadata: None,
- split_offsets: vec![],
+ split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
diff --git a/crates/iceberg/src/spec/manifest/_serde.rs
b/crates/iceberg/src/spec/manifest/_serde.rs
index 7738af46d..07306be2b 100644
--- a/crates/iceberg/src/spec/manifest/_serde.rs
+++ b/crates/iceberg/src/spec/manifest/_serde.rs
@@ -153,7 +153,7 @@ impl DataFileSerde {
lower_bounds: Some(to_bytes_entry(value.lower_bounds)?),
upper_bounds: Some(to_bytes_entry(value.upper_bounds)?),
key_metadata: value.key_metadata.map(serde_bytes::ByteBuf::from),
- split_offsets: Some(value.split_offsets),
+ split_offsets: value.split_offsets,
equality_ids: value.equality_ids,
sort_order_id: value.sort_order_id,
first_row_id: value.first_row_id,
@@ -222,7 +222,7 @@ impl DataFileSerde {
.transpose()?
.unwrap_or_default(),
key_metadata: self.key_metadata.map(|v| v.to_vec()),
- split_offsets: self.split_offsets.unwrap_or_default(),
+ split_offsets: self.split_offsets,
equality_ids: self.equality_ids,
sort_order_id: self.sort_order_id,
partition_spec_id,
@@ -380,7 +380,7 @@ mod tests {
lower_bounds:
HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]),
upper_bounds:
HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]),
key_metadata: None,
- split_offsets: vec![4],
+ split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: Some(0),
partition_spec_id: 0,
diff --git a/crates/iceberg/src/spec/manifest/data_file.rs
b/crates/iceberg/src/spec/manifest/data_file.rs
index a9c041f54..77bd046f8 100644
--- a/crates/iceberg/src/spec/manifest/data_file.rs
+++ b/crates/iceberg/src/spec/manifest/data_file.rs
@@ -127,9 +127,10 @@ pub struct DataFile {
/// element field id: 133
///
/// Split offsets for the data file. For example, all row group offsets
- /// in a Parquet file. Must be sorted ascending
+ /// in a Parquet file. Must be sorted ascending. Optional field that
+ /// should be serialized as null when not present.
#[builder(default)]
- pub(crate) split_offsets: Vec<i64>,
+ pub(crate) split_offsets: Option<Vec<i64>>,
/// field id: 135
/// element field id: 136
///
@@ -247,8 +248,9 @@ impl DataFile {
}
/// Get the split offsets of the data file.
/// For example, all row group offsets in a Parquet file.
- pub fn split_offsets(&self) -> &[i64] {
- &self.split_offsets
+ /// Returns `None` if no split offsets are present.
+ pub fn split_offsets(&self) -> Option<&[i64]> {
+ self.split_offsets.as_deref()
}
/// Get the equality ids of the data file.
/// Field ids used to determine row equality in equality delete files.
diff --git a/crates/iceberg/src/spec/manifest/mod.rs
b/crates/iceberg/src/spec/manifest/mod.rs
index 51219bfdb..b126396e3 100644
--- a/crates/iceberg/src/spec/manifest/mod.rs
+++ b/crates/iceberg/src/spec/manifest/mod.rs
@@ -257,7 +257,7 @@ mod tests {
snapshot_id: None,
sequence_number: None,
file_sequence_number: None,
- data_file: DataFile
{content:DataContentType::Data,file_path:"s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(),file_format:DataFileFormat::Parquet,partition:Struct::empty(),record_count:1,file_size_in_bytes:5442,column_sizes:HashMap::from([(0,73),(6,34),(2,73),(7,61),(3,61),(5,62),(9,79),(10,73),(1,61),(4,73),(8,73)]),value_counts:HashMap::from([(4,1),(5,1),(2,1),(0,1),(3,1),(6,1),(8,1),(1,1),(10,1),(7,1),(9,1)]
[...]
+ data_file: DataFile
{content:DataContentType::Data,file_path:"s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(),file_format:DataFileFormat::Parquet,partition:Struct::empty(),record_count:1,file_size_in_bytes:5442,column_sizes:HashMap::from([(0,73),(6,34),(2,73),(7,61),(3,61),(5,62),(9,79),(10,73),(1,61),(4,73),(8,73)]),value_counts:HashMap::from([(4,1),(5,1),(2,1),(0,1),(3,1),(6,1),(8,1),(1,1),(10,1),(7,1),(9,1)]
[...]
}
];
@@ -435,7 +435,7 @@ mod tests {
lower_bounds: HashMap::new(),
upper_bounds: HashMap::new(),
key_metadata: None,
- split_offsets: vec![4],
+ split_offsets: Some(vec![4]),
equality_ids: Some(Vec::new()),
sort_order_id: None,
partition_spec_id: 0,
@@ -532,7 +532,7 @@ mod tests {
lower_bounds:
HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]),
upper_bounds:
HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]),
key_metadata: None,
- split_offsets: vec![4],
+ split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: Some(0),
partition_spec_id: 0,
@@ -640,7 +640,7 @@ mod tests {
(3, Datum::string("x"))
]),
key_metadata: None,
- split_offsets: vec![4],
+ split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: Some(0),
partition_spec_id: 0,
@@ -749,7 +749,7 @@ mod tests {
(3, Datum::string("x"))
]),
key_metadata: None,
- split_offsets: vec![4],
+ split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
@@ -840,7 +840,7 @@ mod tests {
(2, Datum::int(2)),
]),
key_metadata: None,
- split_offsets: vec![4],
+ split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
@@ -922,7 +922,7 @@ mod tests {
lower_bounds: HashMap::new(),
upper_bounds: HashMap::new(),
key_metadata: None,
- split_offsets: vec![4],
+ split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
@@ -957,7 +957,7 @@ mod tests {
lower_bounds: HashMap::new(),
upper_bounds: HashMap::new(),
key_metadata: None,
- split_offsets: vec![4],
+ split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
@@ -992,7 +992,7 @@ mod tests {
lower_bounds: HashMap::new(),
upper_bounds: HashMap::new(),
key_metadata: None,
- split_offsets: vec![4],
+ split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
@@ -1027,7 +1027,7 @@ mod tests {
lower_bounds: HashMap::new(),
upper_bounds: HashMap::new(),
key_metadata: None,
- split_offsets: vec![4],
+ split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
@@ -1182,7 +1182,7 @@ mod tests {
"lower_bounds": [],
"upper_bounds": [],
"key_metadata": null,
- "split_offsets": [],
+ "split_offsets": null,
"equality_ids": null,
"sort_order_id": null,
"first_row_id": null,
@@ -1213,7 +1213,7 @@ mod tests {
"lower_bounds": [],
"upper_bounds": [],
"key_metadata": null,
- "split_offsets": [],
+ "split_offsets": null,
"equality_ids": null,
"sort_order_id": null,
"first_row_id": null,
diff --git a/crates/iceberg/src/spec/manifest/writer.rs
b/crates/iceberg/src/spec/manifest/writer.rs
index 389ac7a1f..2fb6a4206 100644
--- a/crates/iceberg/src/spec/manifest/writer.rs
+++ b/crates/iceberg/src/spec/manifest/writer.rs
@@ -608,7 +608,7 @@ mod tests {
lower_bounds: HashMap::new(),
upper_bounds: HashMap::new(),
key_metadata: Some(Vec::new()),
- split_offsets: vec![4],
+ split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
@@ -637,7 +637,7 @@ mod tests {
lower_bounds: HashMap::new(),
upper_bounds: HashMap::new(),
key_metadata: Some(Vec::new()),
- split_offsets: vec![4],
+ split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
@@ -666,7 +666,7 @@ mod tests {
lower_bounds: HashMap::new(),
upper_bounds: HashMap::new(),
key_metadata: Some(Vec::new()),
- split_offsets: vec![4],
+ split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
diff --git a/crates/iceberg/src/spec/snapshot_summary.rs
b/crates/iceberg/src/spec/snapshot_summary.rs
index 4cd3715e0..c67ee37d3 100644
--- a/crates/iceberg/src/spec/snapshot_summary.rs
+++ b/crates/iceberg/src/spec/snapshot_summary.rs
@@ -767,7 +767,7 @@ mod tests {
(3, Datum::string("x")),
]),
key_metadata: None,
- split_offsets: vec![4],
+ split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: Some(0),
partition_spec_id: 0,
@@ -799,7 +799,7 @@ mod tests {
(3, Datum::string("x")),
]),
key_metadata: None,
- split_offsets: vec![4],
+ split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: Some(0),
partition_spec_id: 0,
@@ -910,7 +910,7 @@ mod tests {
lower_bounds: HashMap::new(),
upper_bounds: HashMap::new(),
key_metadata: None,
- split_offsets: vec![],
+ split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
@@ -938,7 +938,7 @@ mod tests {
lower_bounds: HashMap::new(),
upper_bounds: HashMap::new(),
key_metadata: None,
- split_offsets: vec![],
+ split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
@@ -993,7 +993,7 @@ mod tests {
lower_bounds: HashMap::new(),
upper_bounds: HashMap::new(),
key_metadata: None,
- split_offsets: vec![],
+ split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
diff --git a/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs
b/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs
index cd0b19148..dd8487f9c 100644
--- a/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs
+++ b/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs
@@ -293,15 +293,15 @@ mod test {
assert_eq!(*data_file.null_value_counts.get(id).unwrap(), expect);
}
- assert_eq!(data_file.split_offsets.len(), metadata.num_row_groups());
- data_file
+ let split_offsets = data_file
.split_offsets
- .iter()
- .enumerate()
- .for_each(|(i, &v)| {
- let expect = metadata.row_groups()[i].file_offset().unwrap();
- assert_eq!(v, expect);
- });
+ .as_ref()
+ .expect("split_offsets should be set");
+ assert_eq!(split_offsets.len(), metadata.num_row_groups());
+ split_offsets.iter().enumerate().for_each(|(i, &v)| {
+ let expect = metadata.row_groups()[i].file_offset().unwrap();
+ assert_eq!(v, expect);
+ });
}
#[tokio::test]
diff --git a/crates/iceberg/src/writer/file_writer/parquet_writer.rs
b/crates/iceberg/src/writer/file_writer/parquet_writer.rs
index 356c2cb43..8fe40df71 100644
--- a/crates/iceberg/src/writer/file_writer/parquet_writer.rs
+++ b/crates/iceberg/src/writer/file_writer/parquet_writer.rs
@@ -412,13 +412,13 @@ impl ParquetWriter {
// - We can ignore implementing distinct_counts due to this:
https://lists.apache.org/thread/j52tsojv0x4bopxyzsp7m7bqt23n5fnd
.lower_bounds(lower_bounds)
.upper_bounds(upper_bounds)
- .split_offsets(
+ .split_offsets(Some(
metadata
.row_groups()
.iter()
.filter_map(|group| group.file_offset())
.collect(),
- );
+ ));
Ok(builder)
}