(iceberg-rust) branch main updated: fix: reuse partition field IDs for equivalent fields in AddSpec (#2011)

liurenjie1024 Sun, 18 Jan 2026 23:48:22 -0800

This is an automated email from the ASF dual-hosted git repository.

liurenjie1024 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-rust.git



The following commit(s) were added to refs/heads/main by this push:
     new 2c690c226 fix: reuse partition field IDs for equivalent fields in 
AddSpec (#2011)
2c690c226 is described below

commit 2c690c226c1faa44d9fb4842299111de27c97a3b
Author: Aditya Subrahmanyan <[email protected]>
AuthorDate: Sun Jan 18 23:46:40 2026 -0800

    fix: reuse partition field IDs for equivalent fields in AddSpec (#2011)
    
    ## Which issue does this PR close?
    
    - Closes #2007.
    
    ## What changes are included in this PR?
    
    Implements Iceberg spec requirement to reuse partition field IDs when
    adding specs with equivalent fields (same source_id + transform).
    
    - Add Hash trait to Transform enum for HashMap keys
    - Add field ID reuse logic in TableMetadataBuilder.add_partition_spec()
    
    ## Are these changes tested?
    
    Added a test to cover partition field id reuse when new specs are added
    
    ---------
    
    Co-authored-by: Renjie Liu <[email protected]>
---
 crates/iceberg/src/spec/table_metadata_builder.rs | 100 ++++++++++++++++++++++
 crates/iceberg/src/spec/transform.rs              |   2 +-
 2 files changed, 101 insertions(+), 1 deletion(-)

diff --git a/crates/iceberg/src/spec/table_metadata_builder.rs 
b/crates/iceberg/src/spec/table_metadata_builder.rs
index 45d1ccefc..3508c1e52 100644
--- a/crates/iceberg/src/spec/table_metadata_builder.rs
+++ b/crates/iceberg/src/spec/table_metadata_builder.rs
@@ -835,6 +835,9 @@ impl TableMetadataBuilder {
         // Check if partition field names conflict with schema field names 
across all schemas
         self.validate_partition_field_names(&unbound_spec)?;
 
+        // Reuse field IDs for equivalent fields from existing partition specs
+        let unbound_spec = self.reuse_partition_field_ids(unbound_spec)?;
+
         let spec = 
PartitionSpecBuilder::new_from_unbound(unbound_spec.clone(), schema)?
             .with_last_assigned_field_id(self.metadata.last_partition_id)
             .build()?;
@@ -877,6 +880,44 @@ impl TableMetadataBuilder {
         Ok(self)
     }
 
+    /// Reuse partition field IDs for equivalent fields from existing 
partition specs.
+    ///
+    /// According to the Iceberg spec, partition field IDs must be reused if 
an existing
+    /// partition spec contains an equivalent field (same source_id and 
transform).
+    fn reuse_partition_field_ids(
+        &self,
+        unbound_spec: UnboundPartitionSpec,
+    ) -> Result<UnboundPartitionSpec> {
+        // Build a map of (source_id, transform) -> field_id from existing 
specs
+        let equivalent_field_ids: HashMap<_, _> = self
+            .metadata
+            .partition_specs
+            .values()
+            .flat_map(|spec| spec.fields())
+            .map(|field| ((field.source_id, &field.transform), field.field_id))
+            .collect();
+
+        // Create new fields with reused field IDs where possible
+        let fields = unbound_spec
+            .fields
+            .into_iter()
+            .map(|mut field| {
+                if field.field_id.is_none()
+                    && let Some(&existing_field_id) =
+                        equivalent_field_ids.get(&(field.source_id, 
&field.transform))
+                {
+                    field.field_id = Some(existing_field_id);
+                }
+                field
+            })
+            .collect();
+
+        Ok(UnboundPartitionSpec {
+            spec_id: unbound_spec.spec_id,
+            fields,
+        })
+    }
+
     /// Set the default partition spec.
     ///
     /// # Errors
@@ -3507,4 +3548,63 @@ mod tests {
         let keys = build_result.metadata.encryption_keys_iter();
         assert_eq!(keys.len(), 0);
     }
+
+    #[test]
+    fn test_partition_field_id_reuse_across_specs() {
+        let schema = Schema::builder()
+            .with_fields(vec![
+                NestedField::required(1, "id", 
Type::Primitive(PrimitiveType::Long)).into(),
+                NestedField::required(2, "data", 
Type::Primitive(PrimitiveType::String)).into(),
+                NestedField::required(3, "timestamp", 
Type::Primitive(PrimitiveType::Timestamp))
+                    .into(),
+            ])
+            .build()
+            .unwrap();
+
+        // Create initial table with spec 0: identity(id) -> field_id = 1000
+        let initial_spec = UnboundPartitionSpec::builder()
+            .add_partition_field(1, "id", Transform::Identity)
+            .unwrap()
+            .build();
+
+        let mut metadata = TableMetadataBuilder::new(
+            schema,
+            initial_spec,
+            SortOrder::unsorted_order(),
+            "s3://bucket/table".to_string(),
+            FormatVersion::V2,
+            HashMap::new(),
+        )
+        .unwrap()
+        .build()
+        .unwrap()
+        .metadata;
+
+        // Add spec 1: bucket(data) -> field_id = 1001
+        let spec1 = UnboundPartitionSpec::builder()
+            .add_partition_field(2, "data_bucket", Transform::Bucket(10))
+            .unwrap()
+            .build();
+        let builder = 
metadata.into_builder(Some("s3://bucket/table/metadata/v1.json".to_string()));
+        let result = 
builder.add_partition_spec(spec1).unwrap().build().unwrap();
+        metadata = result.metadata;
+
+        // Add spec 2: identity(id) + bucket(data) + year(timestamp)
+        // Should reuse field_id 1000 for identity(id) and 1001 for 
bucket(data)
+        let spec2 = UnboundPartitionSpec::builder()
+            .add_partition_field(1, "id", Transform::Identity) // Should reuse 
1000
+            .unwrap()
+            .add_partition_field(2, "data_bucket", Transform::Bucket(10)) // 
Should reuse 1001
+            .unwrap()
+            .add_partition_field(3, "year", Transform::Year) // Should get new 
1002
+            .unwrap()
+            .build();
+        let builder = 
metadata.into_builder(Some("s3://bucket/table/metadata/v2.json".to_string()));
+        let result = 
builder.add_partition_spec(spec2).unwrap().build().unwrap();
+
+        // Verify field ID reuse: spec 2 should reuse IDs from specs 0 and 1, 
assign new ID for new field
+        let spec2 = result.metadata.partition_spec_by_id(2).unwrap();
+        let field_ids: Vec<i32> = spec2.fields().iter().map(|f| 
f.field_id).collect();
+        assert_eq!(field_ids, vec![1000, 1001, 1002]); // Reused 1000, 1001; 
new 1002
+    }
 }
diff --git a/crates/iceberg/src/spec/transform.rs 
b/crates/iceberg/src/spec/transform.rs
index 354dc1889..026b12613 100644
--- a/crates/iceberg/src/spec/transform.rs
+++ b/crates/iceberg/src/spec/transform.rs
@@ -47,7 +47,7 @@ use crate::transform::{BoxedTransformFunction, 
create_transform_function};
 /// predicates and partition predicates.
 ///
 /// All transforms must return `null` for a `null` input value.
-#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
 pub enum Transform {
     /// Source value, unmodified
     ///

(iceberg-rust) branch main updated: fix: reuse partition field IDs for equivalent fields in AddSpec (#2011)

Reply via email to