This is an automated email from the ASF dual-hosted git repository.
laskoviymishka pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-go.git
The following commit(s) were added to refs/heads/main by this push:
new 1401cb6b feat(table): reject AddDataFiles on v3 when first_row_id is missing (#1000) (#1101)
1401cb6b is described below
commit 1401cb6be0656b36771ac7d5d13e87cb224db006
Author: Tanmay Rauth <[email protected]>
AuthorDate: Thu May 21 19:11:59 2026 +0530
feat(table): reject AddDataFiles on v3 when first_row_id is missing (#1000) (#1101)
Closes #1000
When users call `AddDataFiles` (or `ReplaceDataFilesWithDataFiles` /
`ReplaceFiles`) with externally-written parquet files on a v3 table,
reject the operation if `first_row_id` is not set. This mirrors
pyiceberg's behavior — the library cannot fabricate row IDs
retroactively, so callers must supply them explicitly via
`DataFileBuilder.FirstRowID()`.
## Changes
- Added validation in `validateDataFilesToAdd` to require `first_row_id`
on format version >= 3
- Error message directs users to `DataFileBuilder.FirstRowID()`
- v2 tables are unaffected
---
table/snapshot_producers_test.go | 44 ++++++++++++++++++++++++++++++++++++++++
table/transaction.go | 6 ++++++
2 files changed, 50 insertions(+)
diff --git a/table/snapshot_producers_test.go b/table/snapshot_producers_test.go
index 510efe96..5eea6028 100644
--- a/table/snapshot_producers_test.go
+++ b/table/snapshot_producers_test.go
@@ -968,3 +968,47 @@ func TestComputeOwnManifests_ParentManifestsIOError(t *testing.T) {
"swallows the IO error and returns a programming-bug error would fail this test")
require.Nil(t, got, "error path must return nil manifest slice")
}
+
+func TestAddDataFilesV3RejectsWithoutFirstRowID(t *testing.T) {
+ spec := iceberg.NewPartitionSpec()
+ txn, _ := createTestTransactionWithMemIO(t, spec)
+ txn.meta.formatVersion = 3
+
+ df := newTestDataFile(t, spec, "file://data.parquet", nil)
+
+ err := txn.AddDataFiles(context.Background(), []iceberg.DataFile{df}, nil)
+ require.Error(t, err)
+ require.ErrorContains(t, err, "missing first_row_id")
+ require.ErrorContains(t, err, "required for v3 tables")
+}
+
+func TestAddDataFilesV3SucceedsWithFirstRowID(t *testing.T) {
+ spec := iceberg.NewPartitionSpec()
+ txn, _ := createTestTransactionWithMemIO(t, spec)
+ txn.meta.formatVersion = 3
+
+ builder, err := iceberg.NewDataFileBuilder(
+ spec, iceberg.EntryContentData, "file://data.parquet",
+ iceberg.ParquetFile, nil, nil, nil, 5, 100,
+ )
+ require.NoError(t, err)
+ df := builder.FirstRowID(0).Build()
+
+ err = txn.AddDataFiles(context.Background(), []iceberg.DataFile{df}, nil)
+ require.NoError(t, err)
+
+ meta, err := txn.meta.Build()
+ require.NoError(t, err)
+ require.Equal(t, int64(5), meta.NextRowID(), "next-row-id should advance by record count")
+}
+
+func TestAddDataFilesV2SucceedsWithoutFirstRowID(t *testing.T) {
+ spec := iceberg.NewPartitionSpec()
+ txn, _ := createTestTransactionWithMemIO(t, spec)
+ txn.meta.formatVersion = 2
+
+ df := newTestDataFile(t, spec, "file://data.parquet", nil)
+
+ err := txn.AddDataFiles(context.Background(), []iceberg.DataFile{df}, nil)
+ require.NoError(t, err)
+}
diff --git a/table/transaction.go b/table/transaction.go
index 7354e934..72d887dd 100644
--- a/table/transaction.go
+++ b/table/transaction.go
@@ -568,6 +568,12 @@ func (t *Transaction) validateDataFilesToAdd(dataFiles []iceberg.DataFile, opera
if err := validateDataFilePartitionData(df, currentSpec); err != nil {
return nil, fmt.Errorf("data file %s has invalid partition data for %s: %w", path, operation, err)
}
+
+ if t.meta.formatVersion >= 3 && df.FirstRowID() == nil {
+ return nil, fmt.Errorf(
+ "data file %s is missing first_row_id which is required for v3 tables for %s: use DataFileBuilder.FirstRowID() to set it explicitly",
+ path, operation)
+ }
}
return setToAdd, nil