This is an automated email from the ASF dual-hosted git repository.
xuanwo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-rust.git
The following commit(s) were added to refs/heads/main by this push:
new 921f3895 refactor: Move equality-ids closer to the spec (#1705)
921f3895 is described below
commit 921f38955443dfecedcc0a94fa4bd9bbed2b2afb
Author: Fokko Driesprong <[email protected]>
AuthorDate: Tue Sep 23 18:56:43 2025 +0200
refactor: Move equality-ids closer to the spec (#1705)
## Which issue does this PR close?
Right now the equality-deletes can't be not-null, while the spec states
that it should be null in the case of a manifest-entry that's not an
equality delete:
<img width="835" height="341" alt="image"
src="https://github.com/user-attachments/assets/60a88f37-7c50-48b7-8878-ecfe4bd70509"
/>
## What changes are included in this PR?
<!--
Provide a summary of the modifications in this PR. List the main changes
such as new features, bug fixes, refactoring, or any other updates.
-->
## Are these changes tested?
<!--
Specify what test covers (unit test, integration test, etc.).
If tests are not included in your PR, please explain why (for example,
are they covered by existing tests)?
-->
---
bindings/python/src/data_file.rs | 2 +-
bindings/python/tests/test_manifest.py | 2 +-
.../src/arrow/caching_delete_file_loader.rs | 2 +-
crates/iceberg/src/arrow/delete_filter.rs | 6 +++---
.../src/expr/visitors/expression_evaluator.rs | 4 ++--
.../expr/visitors/inclusive_metrics_evaluator.rs | 12 +++++------
.../src/expr/visitors/strict_metrics_evaluator.rs | 8 ++++----
crates/iceberg/src/scan/task.rs | 4 ++--
crates/iceberg/src/spec/manifest/_serde.rs | 17 +++++++--------
crates/iceberg/src/spec/manifest/data_file.rs | 6 +++---
crates/iceberg/src/spec/manifest/mod.rs | 24 +++++++++++-----------
crates/iceberg/src/spec/manifest/writer.rs | 6 +++---
crates/iceberg/src/spec/snapshot_summary.rs | 10 ++++-----
.../writer/base_writer/equality_delete_writer.rs | 2 +-
14 files changed, 51 insertions(+), 54 deletions(-)
diff --git a/bindings/python/src/data_file.rs b/bindings/python/src/data_file.rs
index 3339b384..900d6c60 100644
--- a/bindings/python/src/data_file.rs
+++ b/bindings/python/src/data_file.rs
@@ -148,7 +148,7 @@ impl PyDataFile {
}
#[getter]
- fn equality_ids(&self) -> &[i32] {
+ fn equality_ids(&self) -> Option<Vec<i32>> {
self.inner.equality_ids()
}
diff --git a/bindings/python/tests/test_manifest.py
b/bindings/python/tests/test_manifest.py
index 0e838c39..701eac25 100644
--- a/bindings/python/tests/test_manifest.py
+++ b/bindings/python/tests/test_manifest.py
@@ -138,5 +138,5 @@ def test_read_manifest_entry(generated_manifest_entry_file:
str) -> None:
}
assert data_file.key_metadata is None
assert data_file.split_offsets == [4]
- assert data_file.equality_ids == []
+ assert data_file.equality_ids is None
assert data_file.sort_order_id == 0
diff --git a/crates/iceberg/src/arrow/caching_delete_file_loader.rs
b/crates/iceberg/src/arrow/caching_delete_file_loader.rs
index c6a8943d..9cf60568 100644
--- a/crates/iceberg/src/arrow/caching_delete_file_loader.rs
+++ b/crates/iceberg/src/arrow/caching_delete_file_loader.rs
@@ -233,7 +233,7 @@ impl CachingDeleteFileLoader {
)
.await?,
sender,
- equality_ids:
HashSet::from_iter(task.equality_ids.clone()),
+ equality_ids:
HashSet::from_iter(task.equality_ids.clone().unwrap()),
})
}
diff --git a/crates/iceberg/src/arrow/delete_filter.rs
b/crates/iceberg/src/arrow/delete_filter.rs
index 0dd53a34..b853baa9 100644
--- a/crates/iceberg/src/arrow/delete_filter.rs
+++ b/crates/iceberg/src/arrow/delete_filter.rs
@@ -311,21 +311,21 @@ pub(crate) mod tests {
file_path: format!("{}/pos-del-1.parquet",
table_location.to_str().unwrap()),
file_type: DataContentType::PositionDeletes,
partition_spec_id: 0,
- equality_ids: vec![],
+ equality_ids: None,
};
let pos_del_2 = FileScanTaskDeleteFile {
file_path: format!("{}/pos-del-2.parquet",
table_location.to_str().unwrap()),
file_type: DataContentType::PositionDeletes,
partition_spec_id: 0,
- equality_ids: vec![],
+ equality_ids: None,
};
let pos_del_3 = FileScanTaskDeleteFile {
file_path: format!("{}/pos-del-3.parquet",
table_location.to_str().unwrap()),
file_type: DataContentType::PositionDeletes,
partition_spec_id: 0,
- equality_ids: vec![],
+ equality_ids: None,
};
let file_scan_tasks = vec![
diff --git a/crates/iceberg/src/expr/visitors/expression_evaluator.rs
b/crates/iceberg/src/expr/visitors/expression_evaluator.rs
index 4db1ad7d..3675ce35 100644
--- a/crates/iceberg/src/expr/visitors/expression_evaluator.rs
+++ b/crates/iceberg/src/expr/visitors/expression_evaluator.rs
@@ -347,7 +347,7 @@ mod tests {
upper_bounds: HashMap::new(),
key_metadata: None,
split_offsets: vec![],
- equality_ids: vec![],
+ equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
first_row_id: None,
@@ -375,7 +375,7 @@ mod tests {
upper_bounds: HashMap::new(),
key_metadata: None,
split_offsets: vec![],
- equality_ids: vec![],
+ equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
first_row_id: None,
diff --git a/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs
b/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs
index a00376e1..2b65cf12 100644
--- a/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs
+++ b/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs
@@ -1996,7 +1996,7 @@ mod test {
upper_bounds: Default::default(),
key_metadata: None,
split_offsets: vec![],
- equality_ids: vec![],
+ equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
first_row_id: None,
@@ -2022,7 +2022,7 @@ mod test {
upper_bounds: Default::default(),
key_metadata: None,
split_offsets: vec![],
- equality_ids: vec![],
+ equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
first_row_id: None,
@@ -2084,7 +2084,7 @@ mod test {
column_sizes: Default::default(),
key_metadata: None,
split_offsets: vec![],
- equality_ids: vec![],
+ equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
first_row_id: None,
@@ -2115,7 +2115,7 @@ mod test {
column_sizes: Default::default(),
key_metadata: None,
split_offsets: vec![],
- equality_ids: vec![],
+ equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
first_row_id: None,
@@ -2147,7 +2147,7 @@ mod test {
column_sizes: Default::default(),
key_metadata: None,
split_offsets: vec![],
- equality_ids: vec![],
+ equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
first_row_id: None,
@@ -2179,7 +2179,7 @@ mod test {
column_sizes: Default::default(),
key_metadata: None,
split_offsets: vec![],
- equality_ids: vec![],
+ equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
first_row_id: None,
diff --git a/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs
b/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs
index f74ce3a6..e17c44c6 100644
--- a/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs
+++ b/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs
@@ -579,7 +579,7 @@ mod test {
column_sizes: Default::default(),
key_metadata: None,
split_offsets: vec![],
- equality_ids: vec![],
+ equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
first_row_id: None,
@@ -605,7 +605,7 @@ mod test {
upper_bounds: Default::default(),
key_metadata: None,
split_offsets: vec![],
- equality_ids: vec![],
+ equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
first_row_id: None,
@@ -631,7 +631,7 @@ mod test {
column_sizes: Default::default(),
key_metadata: None,
split_offsets: vec![],
- equality_ids: vec![],
+ equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
first_row_id: None,
@@ -658,7 +658,7 @@ mod test {
column_sizes: Default::default(),
key_metadata: None,
split_offsets: vec![],
- equality_ids: vec![],
+ equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
first_row_id: None,
diff --git a/crates/iceberg/src/scan/task.rs b/crates/iceberg/src/scan/task.rs
index 7b111e4f..32fe3ae3 100644
--- a/crates/iceberg/src/scan/task.rs
+++ b/crates/iceberg/src/scan/task.rs
@@ -112,6 +112,6 @@ pub struct FileScanTaskDeleteFile {
/// partition id
pub partition_spec_id: i32,
- /// equality ids for equality deletes (empty for positional deletes)
- pub equality_ids: Vec<i32>,
+ /// equality ids for equality deletes (null for anything other than
equality-deletes)
+ pub equality_ids: Option<Vec<i32>>,
}
diff --git a/crates/iceberg/src/spec/manifest/_serde.rs
b/crates/iceberg/src/spec/manifest/_serde.rs
index 462128cf..7738af46 100644
--- a/crates/iceberg/src/spec/manifest/_serde.rs
+++ b/crates/iceberg/src/spec/manifest/_serde.rs
@@ -116,7 +116,6 @@ pub(super) struct DataFileSerde {
upper_bounds: Option<Vec<BytesEntry>>,
key_metadata: Option<serde_bytes::ByteBuf>,
split_offsets: Option<Vec<i64>>,
- #[serde(default)]
equality_ids: Option<Vec<i32>>,
sort_order_id: Option<i32>,
first_row_id: Option<i64>,
@@ -155,7 +154,7 @@ impl DataFileSerde {
upper_bounds: Some(to_bytes_entry(value.upper_bounds)?),
key_metadata: value.key_metadata.map(serde_bytes::ByteBuf::from),
split_offsets: Some(value.split_offsets),
- equality_ids: Some(value.equality_ids),
+ equality_ids: value.equality_ids,
sort_order_id: value.sort_order_id,
first_row_id: value.first_row_id,
referenced_data_file: value.referenced_data_file,
@@ -224,7 +223,7 @@ impl DataFileSerde {
.unwrap_or_default(),
key_metadata: self.key_metadata.map(|v| v.to_vec()),
split_offsets: self.split_offsets.unwrap_or_default(),
- equality_ids: self.equality_ids.unwrap_or_default(),
+ equality_ids: self.equality_ids,
sort_order_id: self.sort_order_id,
partition_spec_id,
first_row_id: self.first_row_id,
@@ -382,7 +381,7 @@ mod tests {
upper_bounds:
HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]),
key_metadata: None,
split_offsets: vec![4],
- equality_ids: vec![],
+ equality_ids: None,
sort_order_id: Some(0),
partition_spec_id: 0,
first_row_id: None,
@@ -517,9 +516,8 @@ mod tests {
"DataFileSerde should convert content 0 to DataContentType::Data"
);
assert_eq!(
- v2_entry.data_file.equality_ids,
- Vec::<i32>::new(),
- "DataFileSerde should convert None equality_ids to empty vec"
+ v2_entry.data_file.equality_ids, None,
+ "DataFileSerde should preserve None equality_ids as None"
);
// Verify other fields are preserved during conversion
@@ -581,9 +579,8 @@ mod tests {
"content 0 should convert to DataContentType::Data"
);
assert_eq!(
- data_file.equality_ids,
- Vec::<i32>::new(),
- "None equality_ids should convert to empty vec via
unwrap_or_default()"
+ data_file.equality_ids, None,
+ "None equality_ids should remain as None"
);
// Verify other fields are handled correctly during conversion
diff --git a/crates/iceberg/src/spec/manifest/data_file.rs
b/crates/iceberg/src/spec/manifest/data_file.rs
index d191ba2e..931f9441 100644
--- a/crates/iceberg/src/spec/manifest/data_file.rs
+++ b/crates/iceberg/src/spec/manifest/data_file.rs
@@ -135,7 +135,7 @@ pub struct DataFile {
/// otherwise. Fields with ids listed in this column must be present
/// in the delete file
#[builder(default)]
- pub(crate) equality_ids: Vec<i32>,
+ pub(crate) equality_ids: Option<Vec<i32>>,
/// field id: 140
///
/// ID representing sort order for this file.
@@ -249,8 +249,8 @@ impl DataFile {
/// Get the equality ids of the data file.
/// Field ids used to determine row equality in equality delete files.
/// null when content is not EqualityDeletes.
- pub fn equality_ids(&self) -> &[i32] {
- &self.equality_ids
+ pub fn equality_ids(&self) -> Option<Vec<i32>> {
+ self.equality_ids.clone()
}
/// Get the first row id in the data file.
pub fn first_row_id(&self) -> Option<i64> {
diff --git a/crates/iceberg/src/spec/manifest/mod.rs
b/crates/iceberg/src/spec/manifest/mod.rs
index da039773..a1a5612c 100644
--- a/crates/iceberg/src/spec/manifest/mod.rs
+++ b/crates/iceberg/src/spec/manifest/mod.rs
@@ -256,7 +256,7 @@ mod tests {
snapshot_id: None,
sequence_number: None,
file_sequence_number: None,
- data_file: DataFile
{content:DataContentType::Data,file_path:"s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(),file_format:DataFileFormat::Parquet,partition:Struct::empty(),record_count:1,file_size_in_bytes:5442,column_sizes:HashMap::from([(0,73),(6,34),(2,73),(7,61),(3,61),(5,62),(9,79),(10,73),(1,61),(4,73),(8,73)]),value_counts:HashMap::from([(4,1),(5,1),(2,1),(0,1),(3,1),(6,1),(8,1),(1,1),(10,1),(7,1),(9,1)]
[...]
+ data_file: DataFile
{content:DataContentType::Data,file_path:"s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(),file_format:DataFileFormat::Parquet,partition:Struct::empty(),record_count:1,file_size_in_bytes:5442,column_sizes:HashMap::from([(0,73),(6,34),(2,73),(7,61),(3,61),(5,62),(9,79),(10,73),(1,61),(4,73),(8,73)]),value_counts:HashMap::from([(4,1),(5,1),(2,1),(0,1),(3,1),(6,1),(8,1),(1,1),(10,1),(7,1),(9,1)]
[...]
}
];
@@ -435,7 +435,7 @@ mod tests {
upper_bounds: HashMap::new(),
key_metadata: None,
split_offsets: vec![4],
- equality_ids: vec![],
+ equality_ids: Some(Vec::new()),
sort_order_id: None,
partition_spec_id: 0,
first_row_id: None,
@@ -532,7 +532,7 @@ mod tests {
upper_bounds:
HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]),
key_metadata: None,
split_offsets: vec![4],
- equality_ids: vec![],
+ equality_ids: None,
sort_order_id: Some(0),
partition_spec_id: 0,
first_row_id: None,
@@ -640,7 +640,7 @@ mod tests {
]),
key_metadata: None,
split_offsets: vec![4],
- equality_ids: vec![],
+ equality_ids: None,
sort_order_id: Some(0),
partition_spec_id: 0,
first_row_id: None,
@@ -749,7 +749,7 @@ mod tests {
]),
key_metadata: None,
split_offsets: vec![4],
- equality_ids: vec![],
+ equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
first_row_id: None,
@@ -840,7 +840,7 @@ mod tests {
]),
key_metadata: None,
split_offsets: vec![4],
- equality_ids: vec![],
+ equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
first_row_id: None,
@@ -922,7 +922,7 @@ mod tests {
upper_bounds: HashMap::new(),
key_metadata: None,
split_offsets: vec![4],
- equality_ids: Vec::new(),
+ equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
first_row_id: None,
@@ -957,7 +957,7 @@ mod tests {
upper_bounds: HashMap::new(),
key_metadata: None,
split_offsets: vec![4],
- equality_ids: Vec::new(),
+ equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
first_row_id: None,
@@ -992,7 +992,7 @@ mod tests {
upper_bounds: HashMap::new(),
key_metadata: None,
split_offsets: vec![4],
- equality_ids: Vec::new(),
+ equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
first_row_id: None,
@@ -1027,7 +1027,7 @@ mod tests {
upper_bounds: HashMap::new(),
key_metadata: None,
split_offsets: vec![4],
- equality_ids: Vec::new(),
+ equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
first_row_id: None,
@@ -1182,7 +1182,7 @@ mod tests {
"upper_bounds": [],
"key_metadata": null,
"split_offsets": [],
- "equality_ids": [],
+ "equality_ids": null,
"sort_order_id": null,
"first_row_id": null,
"referenced_data_file": null,
@@ -1213,7 +1213,7 @@ mod tests {
"upper_bounds": [],
"key_metadata": null,
"split_offsets": [],
- "equality_ids": [],
+ "equality_ids": null,
"sort_order_id": null,
"first_row_id": null,
"referenced_data_file": null,
diff --git a/crates/iceberg/src/spec/manifest/writer.rs
b/crates/iceberg/src/spec/manifest/writer.rs
index 39945a51..673f8b5d 100644
--- a/crates/iceberg/src/spec/manifest/writer.rs
+++ b/crates/iceberg/src/spec/manifest/writer.rs
@@ -545,7 +545,7 @@ mod tests {
upper_bounds: HashMap::new(),
key_metadata: Some(Vec::new()),
split_offsets: vec![4],
- equality_ids: Vec::new(),
+ equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
first_row_id: None,
@@ -574,7 +574,7 @@ mod tests {
upper_bounds: HashMap::new(),
key_metadata: Some(Vec::new()),
split_offsets: vec![4],
- equality_ids: Vec::new(),
+ equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
first_row_id: None,
@@ -603,7 +603,7 @@ mod tests {
upper_bounds: HashMap::new(),
key_metadata: Some(Vec::new()),
split_offsets: vec![4],
- equality_ids: Vec::new(),
+ equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
first_row_id: None,
diff --git a/crates/iceberg/src/spec/snapshot_summary.rs
b/crates/iceberg/src/spec/snapshot_summary.rs
index e374e567..1b07ce3f 100644
--- a/crates/iceberg/src/spec/snapshot_summary.rs
+++ b/crates/iceberg/src/spec/snapshot_summary.rs
@@ -768,7 +768,7 @@ mod tests {
]),
key_metadata: None,
split_offsets: vec![4],
- equality_ids: vec![],
+ equality_ids: None,
sort_order_id: Some(0),
partition_spec_id: 0,
first_row_id: None,
@@ -800,7 +800,7 @@ mod tests {
]),
key_metadata: None,
split_offsets: vec![4],
- equality_ids: vec![],
+ equality_ids: None,
sort_order_id: Some(0),
partition_spec_id: 0,
first_row_id: None,
@@ -910,7 +910,7 @@ mod tests {
upper_bounds: HashMap::new(),
key_metadata: None,
split_offsets: vec![],
- equality_ids: vec![],
+ equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
first_row_id: None,
@@ -938,7 +938,7 @@ mod tests {
upper_bounds: HashMap::new(),
key_metadata: None,
split_offsets: vec![],
- equality_ids: vec![],
+ equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
first_row_id: None,
@@ -992,7 +992,7 @@ mod tests {
upper_bounds: HashMap::new(),
key_metadata: None,
split_offsets: vec![],
- equality_ids: vec![],
+ equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
first_row_id: None,
diff --git a/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs
b/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs
index f710544d..765ff1ca 100644
--- a/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs
+++ b/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs
@@ -156,7 +156,7 @@ impl<B: FileWriterBuilder> IcebergWriter for
EqualityDeleteFileWriter<B> {
.into_iter()
.map(|mut res| {
res.content(crate::spec::DataContentType::EqualityDeletes);
-
res.equality_ids(self.equality_ids.iter().copied().collect_vec());
+
res.equality_ids(Some(self.equality_ids.iter().copied().collect_vec()));
res.partition(self.partition_value.clone());
res.partition_spec_id(self.partition_spec_id);
res.build().expect("msg")