(iceberg-go) branch main updated: feat(table): reject AddDataFiles on v3 when first_row_id is missing (#1000) (#1101)

atserakhau Thu, 21 May 2026 06:42:18 -0700

This is an automated email from the ASF dual-hosted git repository.

laskoviymishka pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-go.git



The following commit(s) were added to refs/heads/main by this push:
     new 1401cb6b feat(table): reject AddDataFiles on v3 when first_row_id is missing (#1000) (#1101)
1401cb6b is described below

commit 1401cb6be0656b36771ac7d5d13e87cb224db006
Author: Tanmay Rauth <[email protected]>
AuthorDate: Thu May 21 19:11:59 2026 +0530

    feat(table): reject AddDataFiles on v3 when first_row_id is missing (#1000) (#1101)
    
    Closes #1000
    
    When users call `AddDataFiles` (or `ReplaceDataFilesWithDataFiles` /
    `ReplaceFiles`) with externally written Parquet files on a v3 table,
    the operation is now rejected if `first_row_id` is not set. This mirrors
    PyIceberg's behavior — the library cannot fabricate row IDs
    retroactively, so callers must supply them explicitly via
    `DataFileBuilder.FirstRowID()`.
    
    ## Changes
    
    - Added validation in `validateDataFilesToAdd` to require `first_row_id`
      on format version >= 3
      - Error message directs users to `DataFileBuilder.FirstRowID()`
      - v2 tables are unaffected
---
 table/snapshot_producers_test.go | 44 ++++++++++++++++++++++++++++++++++++++++
 table/transaction.go             |  6 ++++++
 2 files changed, 50 insertions(+)

diff --git a/table/snapshot_producers_test.go b/table/snapshot_producers_test.go
index 510efe96..5eea6028 100644
--- a/table/snapshot_producers_test.go
+++ b/table/snapshot_producers_test.go
@@ -968,3 +968,47 @@ func TestComputeOwnManifests_ParentManifestsIOError(t 
*testing.T) {
                        "swallows the IO error and returns a programming-bug 
error would fail this test")
        require.Nil(t, got, "error path must return nil manifest slice")
 }
+
+func TestAddDataFilesV3RejectsWithoutFirstRowID(t *testing.T) {
+       spec := iceberg.NewPartitionSpec()
+       txn, _ := createTestTransactionWithMemIO(t, spec)
+       txn.meta.formatVersion = 3
+
+       df := newTestDataFile(t, spec, "file://data.parquet", nil)
+
+       err := txn.AddDataFiles(context.Background(), []iceberg.DataFile{df}, 
nil)
+       require.Error(t, err)
+       require.ErrorContains(t, err, "missing first_row_id")
+       require.ErrorContains(t, err, "required for v3 tables")
+}
+
+func TestAddDataFilesV3SucceedsWithFirstRowID(t *testing.T) {
+       spec := iceberg.NewPartitionSpec()
+       txn, _ := createTestTransactionWithMemIO(t, spec)
+       txn.meta.formatVersion = 3
+
+       builder, err := iceberg.NewDataFileBuilder(
+               spec, iceberg.EntryContentData, "file://data.parquet",
+               iceberg.ParquetFile, nil, nil, nil, 5, 100,
+       )
+       require.NoError(t, err)
+       df := builder.FirstRowID(0).Build()
+
+       err = txn.AddDataFiles(context.Background(), []iceberg.DataFile{df}, 
nil)
+       require.NoError(t, err)
+
+       meta, err := txn.meta.Build()
+       require.NoError(t, err)
+       require.Equal(t, int64(5), meta.NextRowID(), "next-row-id should 
advance by record count")
+}
+
+func TestAddDataFilesV2SucceedsWithoutFirstRowID(t *testing.T) {
+       spec := iceberg.NewPartitionSpec()
+       txn, _ := createTestTransactionWithMemIO(t, spec)
+       txn.meta.formatVersion = 2
+
+       df := newTestDataFile(t, spec, "file://data.parquet", nil)
+
+       err := txn.AddDataFiles(context.Background(), []iceberg.DataFile{df}, 
nil)
+       require.NoError(t, err)
+}
diff --git a/table/transaction.go b/table/transaction.go
index 7354e934..72d887dd 100644
--- a/table/transaction.go
+++ b/table/transaction.go
@@ -568,6 +568,12 @@ func (t *Transaction) validateDataFilesToAdd(dataFiles 
[]iceberg.DataFile, opera
                if err := validateDataFilePartitionData(df, currentSpec); err 
!= nil {
                        return nil, fmt.Errorf("data file %s has invalid 
partition data for %s: %w", path, operation, err)
                }
+
+               if t.meta.formatVersion >= 3 && df.FirstRowID() == nil {
+                       return nil, fmt.Errorf(
+                               "data file %s is missing first_row_id which is 
required for v3 tables for %s: use DataFileBuilder.FirstRowID() to set it 
explicitly",
+                               path, operation)
+               }
        }
 
        return setToAdd, nil

(iceberg-go) branch main updated: feat(table): reject AddDataFiles on v3 when first_row_id is missing (#1000) (#1101)

Reply via email to