This is an automated email from the ASF dual-hosted git repository.
liurenjie1024 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-rust.git
The following commit(s) were added to refs/heads/main by this push:
new d76bb5ed1 fix: allow v2 to v3 table upgrades with existing snapshots
(#2010)
d76bb5ed1 is described below
commit d76bb5ed124e65e42bdc488e5028cb9d20366aa5
Author: Aditya Subrahmanyan <[email protected]>
AuthorDate: Fri Jan 16 16:08:16 2026 -0800
fix: allow v2 to v3 table upgrades with existing snapshots (#2010)
---
crates/iceberg/src/spec/snapshot.rs | 33 ++--
crates/iceberg/src/spec/table_metadata.rs | 191 ++++++++++++++++++++++
crates/iceberg/src/spec/table_metadata_builder.rs | 2 +
3 files changed, 211 insertions(+), 15 deletions(-)
diff --git a/crates/iceberg/src/spec/snapshot.rs
b/crates/iceberg/src/spec/snapshot.rs
index 270279988..802cd6546 100644
--- a/crates/iceberg/src/spec/snapshot.rs
+++ b/crates/iceberg/src/spec/snapshot.rs
@@ -272,7 +272,7 @@ pub(super) mod _serde {
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "kebab-case")]
- /// Defines the structure of a v2 snapshot for
serialization/deserialization
+ /// Defines the structure of a v3 snapshot for
serialization/deserialization
pub(crate) struct SnapshotV3 {
pub snapshot_id: i64,
#[serde(skip_serializing_if = "Option::is_none")]
@@ -283,8 +283,10 @@ pub(super) mod _serde {
pub summary: Summary,
#[serde(skip_serializing_if = "Option::is_none")]
pub schema_id: Option<SchemaId>,
- pub first_row_id: u64,
- pub added_rows: u64,
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub first_row_id: Option<u64>,
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub added_rows: Option<u64>,
#[serde(skip_serializing_if = "Option::is_none")]
pub key_id: Option<String>,
}
@@ -333,10 +335,13 @@ pub(super) mod _serde {
summary: s.summary,
schema_id: s.schema_id,
encryption_key_id: s.key_id,
- row_range: Some(SnapshotRowRange {
- first_row_id: s.first_row_id,
- added_rows: s.added_rows,
- }),
+ row_range: match (s.first_row_id, s.added_rows) {
+ (Some(first_row_id), Some(added_rows)) =>
Some(SnapshotRowRange {
+ first_row_id,
+ added_rows,
+ }),
+ _ => None,
+ },
}
}
}
@@ -345,12 +350,10 @@ pub(super) mod _serde {
type Error = Error;
fn try_from(s: Snapshot) -> Result<Self, Self::Error> {
- let row_range = s.row_range.ok_or_else(|| {
- Error::new(
- crate::ErrorKind::DataInvalid,
- "v3 Snapshots must have first-row-id and rows-added fields
set.".to_string(),
- )
- })?;
+ let (first_row_id, added_rows) = match s.row_range {
+ Some(row_range) => (Some(row_range.first_row_id),
Some(row_range.added_rows)),
+ None => (None, None),
+ };
Ok(SnapshotV3 {
snapshot_id: s.snapshot_id,
@@ -360,8 +363,8 @@ pub(super) mod _serde {
manifest_list: s.manifest_list,
summary: s.summary,
schema_id: s.schema_id,
- first_row_id: row_range.first_row_id,
- added_rows: row_range.added_rows,
+ first_row_id,
+ added_rows,
key_id: s.encryption_key_id,
})
}
diff --git a/crates/iceberg/src/spec/table_metadata.rs
b/crates/iceberg/src/spec/table_metadata.rs
index 585cb3e2b..d3836b2f4 100644
--- a/crates/iceberg/src/spec/table_metadata.rs
+++ b/crates/iceberg/src/spec/table_metadata.rs
@@ -3978,4 +3978,195 @@ mod tests {
assert_eq!(err.kind(), ErrorKind::DataInvalid);
assert!(err.message().contains("Invalid table properties"));
}
+
+ #[test]
+ fn
test_v2_to_v3_upgrade_preserves_existing_snapshots_without_row_lineage() {
+ // Create a v2 table metadata
+ let schema = Schema::builder()
+ .with_fields(vec![
+ NestedField::required(1, "id",
Type::Primitive(PrimitiveType::Long)).into(),
+ ])
+ .build()
+ .unwrap();
+
+ let v2_metadata = TableMetadataBuilder::new(
+ schema,
+ PartitionSpec::unpartition_spec().into_unbound(),
+ SortOrder::unsorted_order(),
+ "s3://bucket/test/location".to_string(),
+ FormatVersion::V2,
+ HashMap::new(),
+ )
+ .unwrap()
+ .build()
+ .unwrap()
+ .metadata;
+
+ // Add a v2 snapshot
+ let snapshot = Snapshot::builder()
+ .with_snapshot_id(1)
+ .with_timestamp_ms(v2_metadata.last_updated_ms + 1)
+ .with_sequence_number(1)
+ .with_schema_id(0)
+ .with_manifest_list("s3://bucket/test/metadata/snap-1.avro")
+ .with_summary(Summary {
+ operation: Operation::Append,
+ additional_properties: HashMap::from([(
+ "added-data-files".to_string(),
+ "1".to_string(),
+ )]),
+ })
+ .build();
+
+ let v2_with_snapshot = v2_metadata
+
.into_builder(Some("s3://bucket/test/metadata/v00001.json".to_string()))
+ .add_snapshot(snapshot)
+ .unwrap()
+ .set_ref("main", SnapshotReference {
+ snapshot_id: 1,
+ retention: SnapshotRetention::Branch {
+ min_snapshots_to_keep: None,
+ max_snapshot_age_ms: None,
+ max_ref_age_ms: None,
+ },
+ })
+ .unwrap()
+ .build()
+ .unwrap()
+ .metadata;
+
+ // Verify v2 serialization works fine
+ let v2_json = serde_json::to_string(&v2_with_snapshot);
+ assert!(v2_json.is_ok(), "v2 serialization should work");
+
+ // Upgrade to v3
+ let v3_metadata = v2_with_snapshot
+
.into_builder(Some("s3://bucket/test/metadata/v00002.json".to_string()))
+ .upgrade_format_version(FormatVersion::V3)
+ .unwrap()
+ .build()
+ .unwrap()
+ .metadata;
+
+ assert_eq!(v3_metadata.format_version, FormatVersion::V3);
+ assert_eq!(v3_metadata.next_row_id, INITIAL_ROW_ID);
+ assert_eq!(v3_metadata.snapshots.len(), 1);
+
+ // Verify the snapshot has no row_range
+ let snapshot = v3_metadata.snapshots.values().next().unwrap();
+ assert!(
+ snapshot.row_range().is_none(),
+ "Snapshot should have no row_range after upgrade"
+ );
+
+ // Try to serialize v3 metadata - this should now work
+ let v3_json = serde_json::to_string(&v3_metadata);
+ assert!(
+ v3_json.is_ok(),
+ "v3 serialization should work for upgraded tables"
+ );
+
+ // Verify we can deserialize it back
+ let deserialized: TableMetadata =
serde_json::from_str(&v3_json.unwrap()).unwrap();
+ assert_eq!(deserialized.format_version, FormatVersion::V3);
+ assert_eq!(deserialized.snapshots.len(), 1);
+
+ // Verify the deserialized snapshot still has no row_range
+ let deserialized_snapshot =
deserialized.snapshots.values().next().unwrap();
+ assert!(
+ deserialized_snapshot.row_range().is_none(),
+ "Deserialized snapshot should have no row_range"
+ );
+ }
+
+ #[test]
+ fn test_v3_snapshot_with_row_lineage_serialization() {
+ // Create a v3 table metadata
+ let schema = Schema::builder()
+ .with_fields(vec![
+ NestedField::required(1, "id",
Type::Primitive(PrimitiveType::Long)).into(),
+ ])
+ .build()
+ .unwrap();
+
+ let v3_metadata = TableMetadataBuilder::new(
+ schema,
+ PartitionSpec::unpartition_spec().into_unbound(),
+ SortOrder::unsorted_order(),
+ "s3://bucket/test/location".to_string(),
+ FormatVersion::V3,
+ HashMap::new(),
+ )
+ .unwrap()
+ .build()
+ .unwrap()
+ .metadata;
+
+ // Add a v3 snapshot with row lineage
+ let snapshot = Snapshot::builder()
+ .with_snapshot_id(1)
+ .with_timestamp_ms(v3_metadata.last_updated_ms + 1)
+ .with_sequence_number(1)
+ .with_schema_id(0)
+ .with_manifest_list("s3://bucket/test/metadata/snap-1.avro")
+ .with_summary(Summary {
+ operation: Operation::Append,
+ additional_properties: HashMap::from([(
+ "added-data-files".to_string(),
+ "1".to_string(),
+ )]),
+ })
+ .with_row_range(100, 50) // first_row_id=100, added_rows=50
+ .build();
+
+ let v3_with_snapshot = v3_metadata
+
.into_builder(Some("s3://bucket/test/metadata/v00001.json".to_string()))
+ .add_snapshot(snapshot)
+ .unwrap()
+ .set_ref("main", SnapshotReference {
+ snapshot_id: 1,
+ retention: SnapshotRetention::Branch {
+ min_snapshots_to_keep: None,
+ max_snapshot_age_ms: None,
+ max_ref_age_ms: None,
+ },
+ })
+ .unwrap()
+ .build()
+ .unwrap()
+ .metadata;
+
+ // Verify the snapshot has row_range
+ let snapshot = v3_with_snapshot.snapshots.values().next().unwrap();
+ assert!(
+ snapshot.row_range().is_some(),
+ "Snapshot should have row_range"
+ );
+ let (first_row_id, added_rows) = snapshot.row_range().unwrap();
+ assert_eq!(first_row_id, 100);
+ assert_eq!(added_rows, 50);
+
+ // Serialize v3 metadata - this should work
+ let v3_json = serde_json::to_string(&v3_with_snapshot);
+ assert!(
+ v3_json.is_ok(),
+ "v3 serialization should work for snapshots with row lineage"
+ );
+
+ // Verify we can deserialize it back
+ let deserialized: TableMetadata =
serde_json::from_str(&v3_json.unwrap()).unwrap();
+ assert_eq!(deserialized.format_version, FormatVersion::V3);
+ assert_eq!(deserialized.snapshots.len(), 1);
+
+ // Verify the deserialized snapshot has the correct row_range
+ let deserialized_snapshot =
deserialized.snapshots.values().next().unwrap();
+ assert!(
+ deserialized_snapshot.row_range().is_some(),
+ "Deserialized snapshot should have row_range"
+ );
+ let (deserialized_first_row_id, deserialized_added_rows) =
+ deserialized_snapshot.row_range().unwrap();
+ assert_eq!(deserialized_first_row_id, 100);
+ assert_eq!(deserialized_added_rows, 50);
+ }
}
diff --git a/crates/iceberg/src/spec/table_metadata_builder.rs
b/crates/iceberg/src/spec/table_metadata_builder.rs
index 3db327d48..45d1ccefc 100644
--- a/crates/iceberg/src/spec/table_metadata_builder.rs
+++ b/crates/iceberg/src/spec/table_metadata_builder.rs
@@ -233,6 +233,8 @@ impl TableMetadataBuilder {
}
FormatVersion::V3 => {
self.metadata.format_version = format_version;
+ // Set next-row-id to 0 when upgrading to v3 as per
Iceberg spec
+ self.metadata.next_row_id = INITIAL_ROW_ID;
self.changes
.push(TableUpdate::UpgradeFormatVersion {
format_version });
}