(iceberg-go) branch main updated: fix(manifest): stop writing deprecated distinct_counts (field-id 111) (#1102)

atserakhau Thu, 21 May 2026 09:52:05 -0700

This is an automated email from the ASF dual-hosted git repository.

laskoviymishka pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-go.git



The following commit(s) were added to refs/heads/main by this push:
     new 51b3140b fix(manifest): stop writing deprecated distinct_counts (field-id 111) (#1102)
51b3140b is described below

commit 51b3140b3a96826622f2a9bc10be357b75d509ba
Author: Krutika Dhananjay <[email protected]>
AuthorDate: Thu May 21 22:21:52 2026 +0530

    fix(manifest): stop writing deprecated distinct_counts (field-id 111) (#1102)
    
    The Avro wire schema declared `distinct_counts` on data_file v1 and v2,
    causing it to be emitted on every manifest entry. The Iceberg spec marks
    this field as "Deprecated. Do not write."
    (https://github.com/apache/iceberg/blob/main/format/spec.md?plain=1#L667),
    so writers should not emit it. The field has been deprecated since 2020 --
    https://github.com/apache/iceberg/issues/767#issuecomment-582192665
    
    Strict readers that don't include field-id 111 in their data_file read
    schema (e.g. PyIceberg 0.10) fail to resolve manifests written by this
    library with: ResolveError "File/read schema are not aligned for map,
    got None".
    
    ---------
    
    Co-authored-by: Krutika Dhananjay <[email protected]>
    Co-authored-by: Matt Topol <[email protected]>
---
 codec/data_file.go                                 |  11 +-
 codec/data_file_test.go                            |  32 ++-
 data_file_codec.go                                 |  12 +-
 internal/avro_schemas.go                           |   6 -
 manifest.go                                        |  17 +-
 manifest_test.go                                   | 239 ++++++++++++++-------
 table/pos_delete_partitioned_fanout_writer_test.go |   4 +-
 table/table_test.go                                |  14 +-
 8 files changed, 200 insertions(+), 135 deletions(-)

diff --git a/codec/data_file.go b/codec/data_file.go
index b1f78fb1..4df07d9d 100644
--- a/codec/data_file.go
+++ b/codec/data_file.go
@@ -62,11 +62,12 @@ import (
 // usable as a standalone manifest entry — they only round-trip via
 // [DecodeDataFile].
 //
-// distinct_counts (field 111) is deprecated in the spec for all
-// versions. Already-set values round-trip on v1 and v2 as a
-// read-compatibility artifact; v3 omits the field entirely
-// (apache/iceberg#12182). New DataFiles should not set distinct
-// counts.
+// distinct_counts (field 111) is deprecated in the spec for every
+// version (apache/iceberg#12182). EncodeDataFile drops the field on
+// encode for v1, v2, and v3 alike — values populated on the source
+// DataFile are not transported. Legacy manifests that already carry
+// the field on the wire still decode correctly through
+// [DecodeDataFile]. New DataFiles should not set distinct counts.
 func EncodeDataFile(df iceberg.DataFile, spec iceberg.PartitionSpec, schema 
*iceberg.Schema, version int) ([]byte, error) {
        if version < 1 || version > 3 {
                return nil, fmt.Errorf("codec: EncodeDataFile: unsupported 
format version %d", version)
diff --git a/codec/data_file_test.go b/codec/data_file_test.go
index 99796b4d..0acb0791 100644
--- a/codec/data_file_test.go
+++ b/codec/data_file_test.go
@@ -183,15 +183,13 @@ func fullyPopulatedDataFile(t *testing.T, version int) 
(iceberg.PartitionSpec, *
                SplitOffsets([]int64{0, 4096}).
                SortOrderID(0).
                KeyMetadata([]byte("kms-key-1"))
-       if version < 3 {
-               // distinct_counts is deprecated for all versions in the spec
-               // (apache/iceberg#12182). The fixture sets it on v1/v2 to
-               // exercise the read-compatibility round-trip path — counts
-               // already present on a DataFile read from a legacy manifest
-               // must survive an encode→decode cycle. New DataFiles should
-               // not carry distinct counts.
-               builder.DistinctValueCounts(map[int]int64{1: 64, 2: 128})
-       }
+       // distinct_counts (field 111) is deprecated in the spec for every
+       // version (apache/iceberg#12182). The fixture populates it on every
+       // version to assert that the wire codec drops the field on encode
+       // regardless of what the in-memory DataFile carries — strict readers
+       // (e.g. PyIceberg) reject manifests that emit it. The deprecated
+       // setter is invoked here intentionally to exercise that drop path.
+       builder.DistinctValueCounts(map[int]int64{1: 64, 2: 128}) 
//nolint:staticcheck // SA1019: deliberate use of deprecated setter to assert 
the encoder drops the field
        if version >= 2 {
                builder.EqualityFieldIDs([]int{1})
        }
@@ -223,17 +221,11 @@ func assertDataFileEqual(t *testing.T, want, got 
iceberg.DataFile, version int)
        require.Equal(t, want.SortOrderID(), got.SortOrderID())
        require.Equal(t, want.SpecID(), got.SpecID())
        require.Equal(t, want.ContentType(), got.ContentType())
-       if version < 3 {
-               // distinct_counts (field 111) is deprecated for all versions
-               // but still writable on v1/v2. The codec preserves it on
-               // round-trip for read compatibility with legacy manifests; v3
-               // drops it per spec (apache/iceberg#12182).
-               require.Equal(t, want.DistinctValueCounts(), 
got.DistinctValueCounts())
-       } else {
-               require.Empty(t, got.DistinctValueCounts(),
-                       "v3 manifest-entry schema omits distinct_counts 
(deprecated in spec); "+
-                               "see internal/avro_schemas.go data_file_v3")
-       }
+       require.Empty(t, got.DistinctValueCounts(),
+               "distinct_counts (field 111) is deprecated in every version "+
+                       "(apache/iceberg#12182); the manifest-entry schemas in 
"+
+                       "internal/avro_schemas.go omit it for v1/v2/v3 so the 
encoder "+
+                       "drops it on the wire regardless of what the source 
DataFile carries")
        if version >= 2 {
                require.Equal(t, want.EqualityFieldIDs(), 
got.EqualityFieldIDs())
        }
diff --git a/data_file_codec.go b/data_file_codec.go
index 15706b7e..7a7b3d39 100644
--- a/data_file_codec.go
+++ b/data_file_codec.go
@@ -91,11 +91,13 @@ type AvroEntryMarshaler interface {
 // as a standalone manifest entry — they only round-trip via the
 // matching decoder.
 //
-// distinct_counts (field 111) is deprecated in the spec for all
-// versions. MarshalAvroEntry preserves any value already on the
-// source for v1 and v2 as a read-compatibility artifact; v3 omits
-// the field entirely (apache/iceberg#12182). New DataFiles should
-// not set distinct counts.
+// distinct_counts (field 111) is deprecated in the spec for every
+// version (apache/iceberg#12182). MarshalAvroEntry drops the field
+// on encode for v1, v2, and v3 alike — values populated on the
+// source DataFile are not transported. The Avro tag on the dataFile
+// struct is intentionally retained so legacy manifests that already
+// carry the field on the wire still decode through the matching
+// reader. New DataFiles should not set distinct counts.
 func (d *dataFile) MarshalAvroEntry(spec PartitionSpec, schema *Schema, 
version int) ([]byte, error) {
        if version < 1 || version > 3 {
                return nil, fmt.Errorf("iceberg: MarshalAvroEntry: unsupported 
format version %d", version)
diff --git a/internal/avro_schemas.go b/internal/avro_schemas.go
index 2dc8ba8f..720f4be4 100644
--- a/internal/avro_schemas.go
+++ b/internal/avro_schemas.go
@@ -259,9 +259,6 @@ func init() {
                        fieldNode("nan_value_counts",
                                NullableNode(newMapNode("k138_v139", IntNode, 
LongNode, 138, 139)),
                                137, withDoc("map of value to count")),
-                       fieldNode("distinct_counts",
-                               NullableNode(newMapNode("k123_v124", IntNode, 
LongNode, 123, 124)),
-                               111, withDoc("map of column id to distinct 
value count")),
                        fieldNode("lower_bounds",
                                NullableNode(newMapNode("k126_v127", IntNode, 
BytesNode, 126, 127)),
                                125, withDoc("map of column id to lower 
bound")),
@@ -298,9 +295,6 @@ func init() {
                        fieldNode("nan_value_counts",
                                NullableNode(newMapNode("k138_v139", IntNode, 
LongNode, 138, 139)),
                                137, withDoc("map of value to count")),
-                       fieldNode("distinct_counts",
-                               NullableNode(newMapNode("k123_v124", IntNode, 
LongNode, 123, 124)),
-                               111, withDoc("map of column id to distinct 
value count")),
                        fieldNode("lower_bounds",
                                NullableNode(newMapNode("k126_v127", IntNode, 
BytesNode, 126, 127)),
                                125, withDoc("map of column id to lower 
bound")),
diff --git a/manifest.go b/manifest.go
index b6fb77f9..af0040cf 100644
--- a/manifest.go
+++ b/manifest.go
@@ -944,16 +944,6 @@ func (v3writerImpl) prepareEntry(entry *manifestEntry, 
snapshotID int64) (Manife
                }
        }
 
-       // v3 spec deprecates data_file.distinct_counts (Java parity:
-       // apache/iceberg#12182). prepareEntry takes ownership of the entry's 
data
-       // file and clears the Avro-facing pointer; the cached distinctCntMap 
and
-       // DistinctValueCounts() getter are intentionally preserved so 
in-process
-       // readers keep their view. Best-effort: only the in-tree *dataFile is
-       // cleared; third-party DataFile impls bypass this guard.
-       if df, ok := entry.DataFile().(*dataFile); ok {
-               df.DistinctCounts = nil
-       }
-
        return entry, nil
 }
 
@@ -2248,6 +2238,13 @@ func (b *DataFileBuilder) NaNValueCounts(counts 
map[int]int64) *DataFileBuilder
 }
 
 // DistinctValueCounts sets the distinct value counts for the data file.
+//
+// Deprecated: distinct_counts (field 111) is deprecated in every
+// version of the Iceberg spec (apache/iceberg#12182). The Avro
+// manifest-entry schemas omit the field for v1, v2, and v3, so values
+// set here are not transported in manifests written by this library.
+// The setter is retained for round-tripping legacy DataFiles read from
+// older manifests; new code should not call it.
 func (b *DataFileBuilder) DistinctValueCounts(counts map[int]int64) 
*DataFileBuilder {
        b.d.DistinctCounts = mapToAvroColMap(counts)
 
diff --git a/manifest_test.go b/manifest_test.go
index 69e53cf0..eb0071a1 100644
--- a/manifest_test.go
+++ b/manifest_test.go
@@ -2115,10 +2115,21 @@ func (m *ManifestTestSuite) 
TestManifestRoundTripSortOrderID() {
        m.Equal(expectedSortOrderID, *got)
 }
 
-// TestWriteManifestV3OmitsDistinctCounts verifies the v3 writer clears
-// data_file.distinct_counts (deprecated by v3 spec; Java parity:
-// apache/iceberg#12182). v2 round-trip will be added with #1038.
-func (m *ManifestTestSuite) TestWriteManifestV3OmitsDistinctCounts() {
+// TestWriteManifestOmitsDistinctCounts verifies that the manifest
+// writer drops the deprecated distinct_counts field (id 111) from the
+// wire for every format version, regardless of what the source
+// DataFile carries (apache/iceberg#12182). The data_file_v{1,2,3}
+// Avro schemas in internal/avro_schemas.go omit field 111, so the
+// encoder never emits it and a round-trip read returns an empty map.
+func (m *ManifestTestSuite) TestWriteManifestOmitsDistinctCounts() {
+       for _, version := range []int{1, 2, 3} {
+               m.Run("v"+strconv.Itoa(version), func() {
+                       m.assertWriteOmitsDistinctCounts(version)
+               })
+       }
+}
+
+func (m *ManifestTestSuite) assertWriteOmitsDistinctCounts(version int) {
        partitionSpec := NewPartitionSpecID(0)
        snapshotID := int64(1)
        seqNum := int64(1)
@@ -2137,29 +2148,158 @@ func (m *ManifestTestSuite) 
TestWriteManifestV3OmitsDistinctCounts() {
        m.Require().NoError(err)
        dataFileBuilder.DistinctValueCounts(map[int]int64{1: 42})
 
-       entry := NewManifestEntry(
-               EntryStatusADDED,
-               &snapshotID,
-               &seqNum, &seqNum,
-               dataFileBuilder.Build(),
-       )
-
        var buf bytes.Buffer
-       _, err = WriteManifest(
-               "s3://bucket/ns/table/metadata/distinct.avro", &buf, 3,
+       file, err := WriteManifest(
+               "s3://bucket/ns/table/metadata/distinct.avro", &buf, version,
                partitionSpec,
-               NewSchema(
-                       0,
+               NewSchema(0,
                        NestedField{ID: 1, Name: "id", Type: Int64Type{}, 
Required: true},
                ),
                snapshotID,
-               []ManifestEntry{entry},
+               []ManifestEntry{NewManifestEntry(
+                       EntryStatusADDED,
+                       &snapshotID,
+                       &seqNum, &seqNum,
+                       dataFileBuilder.Build(),
+               )},
        )
        m.Require().NoError(err)
 
-       df, ok := entry.DataFile().(*dataFile)
-       m.Require().True(ok)
-       m.Nil(df.DistinctCounts, "v3 writer must clear DistinctCounts on the 
entry's *dataFile")
+       entries, err := ReadManifest(file, &buf, false)
+       m.Require().NoError(err)
+       m.Require().Len(entries, 1)
+       m.Empty(entries[0].DataFile().DistinctValueCounts(),
+               "manifest writer must drop distinct_counts on the wire for 
every format version; "+
+                       "see internal/avro_schemas.go 
data_file_v"+strconv.Itoa(version))
+}
+
+// TestReadManifestLegacyDistinctCounts is a back-compat regression
+// guard. Older iceberg-go and other Iceberg writers (Java, PySpark)
+// embed distinct_counts (id 111) in the data_file record. PR #1102
+// dropped the field from this library's writer schema, but legacy
+// manifests already on disk must still decode correctly: the dataFile
+// struct's avro:"distinct_counts" tag is intentionally retained for
+// this read path.
+//
+// The test bypasses WriteManifest and writes a raw OCF using a
+// fixture schema that mirrors the pre-PR data_file_v2 (with field
+// 111 re-injected), then reads it through ReadManifest and asserts
+// the distinct counts come back populated.
+func (m *ManifestTestSuite) TestReadManifestLegacyDistinctCounts() {
+       partitionSpec := NewPartitionSpec()
+       tableSchema := NewSchema(0,
+               NestedField{ID: 1, Name: "id", Type: Int64Type{}, Required: 
true},
+       )
+       partitionSchema, err := 
partitionTypeToAvroSchema(partitionSpec.PartitionType(tableSchema))
+       m.Require().NoError(err)
+
+       legacySchema := injectDistinctCountsIntoEntrySchema(m.T(), 
partitionSchema, 2)
+
+       snapshotID := int64(42)
+       seqNum := int64(7)
+       df := &dataFile{
+               Content:        EntryContentData,
+               Path:           "s3://bucket/ns/table/data/legacy.parquet",
+               Format:         ParquetFile,
+               PartitionData:  map[string]any{},
+               RecordCount:    100,
+               FileSize:       4096,
+               DistinctCounts: mapToAvroColMap(map[int]int64{1: 99, 2: 88}),
+       }
+       entry := &manifestEntry{
+               EntryStatus: EntryStatusADDED,
+               Snapshot:    &snapshotID,
+               SeqNum:      &seqNum,
+               FileSeqNum:  &seqNum,
+               Data:        df,
+       }
+
+       var buf bytes.Buffer
+       wr, err := ocf.NewWriter(&buf, legacySchema,
+               ocf.WithSchema(legacySchema.String()),
+               ocf.WithMetadata(map[string][]byte{
+                       "format-version": []byte("2"),
+                       "content":        []byte("data"),
+               }),
+       )
+       m.Require().NoError(err)
+       m.Require().NoError(wr.Encode(entry))
+       m.Require().NoError(wr.Close())
+
+       file := &manifestFile{
+               version: 2,
+               Path:    "s3://bucket/ns/table/metadata/legacy.avro",
+               Content: ManifestContentData,
+       }
+       entries, err := ReadManifest(file, &buf, false)
+       m.Require().NoError(err)
+       m.Require().Len(entries, 1)
+       m.Equal(map[int]int64{1: 99, 2: 88}, 
entries[0].DataFile().DistinctValueCounts(),
+               "ReadManifest must decode distinct_counts (field 111) from 
legacy "+
+                       "manifests where the file's writer schema still carries 
the field")
+}
+
+// injectDistinctCountsIntoEntrySchema returns a manifest_entry avro
+// schema for the given format version with the deprecated
+// distinct_counts field (id 111) re-added into the data_file record.
+// The shape mirrors the pre-PR-1102 data_file_v{version} schema
+// definition that was removed from internal/avro_schemas.go, so it
+// can be used to fixture-up legacy manifests for back-compat tests.
+func injectDistinctCountsIntoEntrySchema(t *testing.T, partitionSchema 
*avro.Schema, version int) *avro.Schema {
+       t.Helper()
+
+       base, err := internal.NewManifestEntrySchema(partitionSchema, version)
+       if err != nil {
+               t.Fatalf("base entry schema for v%d: %v", version, err)
+       }
+       root := base.Root()
+
+       dfIdx := -1
+       for i := range root.Fields {
+               if root.Fields[i].Name == "data_file" {
+                       dfIdx = i
+
+                       break
+               }
+       }
+       if dfIdx == -1 {
+               t.Fatalf("data_file field not found on manifest_entry v%d 
schema", version)
+       }
+
+       distinctCountsField := avro.SchemaField{
+               Name: "distinct_counts",
+               Type: avro.SchemaNode{
+                       Type: "union",
+                       Branches: []avro.SchemaNode{
+                               {Type: "null"},
+                               {
+                                       Type: "array",
+                                       Items: &avro.SchemaNode{
+                                               Type: "record",
+                                               Name: "k123_v124",
+                                               Fields: []avro.SchemaField{
+                                                       {Name: "key", Type: 
avro.SchemaNode{Type: "int"}, Props: map[string]any{"field-id": 123}},
+                                                       {Name: "value", Type: 
avro.SchemaNode{Type: "long"}, Props: map[string]any{"field-id": 124}},
+                                               },
+                                       },
+                                       Props: map[string]any{"logicalType": 
"map"},
+                               },
+                       },
+               },
+               Props: map[string]any{"field-id": 111},
+               Doc:   "map of column id to distinct value count",
+       }
+
+       dfField := root.Fields[dfIdx]
+       dfField.Type.Fields = append(append([]avro.SchemaField{}, 
dfField.Type.Fields...), distinctCountsField)
+       root.Fields[dfIdx] = dfField
+
+       legacy, err := root.Schema()
+       if err != nil {
+               t.Fatalf("recompiling legacy entry schema for v%d: %v", 
version, err)
+       }
+
+       return legacy
 }
 
 func (m *ManifestTestSuite) TestWriteManifestClosesWriterOnEntryError() {
@@ -2334,64 +2474,3 @@ func (m *ManifestTestSuite) 
TestEntriesCloseErrorAsFinalPair() {
                "terminal error must equal or wrap the simulated close error")
        m.Equal(1, file.closeCount, "file must be closed exactly once even when 
Close returns an error")
 }
-
-// TestWriteManifestV2KeepsDistinctCounts is a regression guard that v2
-// manifest writers preserve data_file.distinct_counts (id 111) per the
-// Iceberg v2 spec. Fixes #1038.
-func (m *ManifestTestSuite) TestWriteManifestV2KeepsDistinctCounts() {
-       m.assertDistinctCountsRoundTrip(2)
-}
-
-// TestWriteManifestV1KeepsDistinctCounts is a regression guard that v1
-// manifest writers preserve data_file.distinct_counts (id 111) per the
-// Iceberg v1 spec. Fixes #1038.
-func (m *ManifestTestSuite) TestWriteManifestV1KeepsDistinctCounts() {
-       m.assertDistinctCountsRoundTrip(1)
-}
-
-// assertDistinctCountsRoundTrip writes a manifest at the given format
-// version with distinct_counts populated for one column, round-trips it
-// through ReadManifest, and asserts the read side observes the same map.
-func (m *ManifestTestSuite) assertDistinctCountsRoundTrip(version int) {
-       partitionSpec := NewPartitionSpecID(0)
-       snapshotID := int64(1)
-       seqNum := int64(1)
-
-       dataFileBuilder, err := NewDataFileBuilder(
-               partitionSpec,
-               EntryContentData,
-               "s3://bucket/ns/table/data/distinct.parquet",
-               ParquetFile,
-               map[int]any{},
-               map[int]string{},
-               map[int]int{},
-               1,
-               1,
-       )
-       m.Require().NoError(err)
-       dataFileBuilder.DistinctValueCounts(map[int]int64{1: 42})
-
-       var buf bytes.Buffer
-       file, err := WriteManifest(
-               "s3://bucket/ns/table/metadata/distinct.avro", &buf, version,
-               partitionSpec,
-               NewSchema(0,
-                       NestedField{ID: 1, Name: "id", Type: Int64Type{}, 
Required: true},
-               ),
-               snapshotID,
-               []ManifestEntry{NewManifestEntry(
-                       EntryStatusADDED,
-                       &snapshotID,
-                       &seqNum, &seqNum,
-                       dataFileBuilder.Build(),
-               )},
-       )
-       m.Require().NoError(err)
-
-       entries, err := ReadManifest(file, &buf, false)
-       m.Require().NoError(err)
-       m.Require().Len(entries, 1)
-
-       m.Equal(map[int]int64{1: 42}, 
entries[0].DataFile().DistinctValueCounts(),
-               "manifest writer must preserve distinct_counts for the 
requested format version")
-}
diff --git a/table/pos_delete_partitioned_fanout_writer_test.go 
b/table/pos_delete_partitioned_fanout_writer_test.go
index 83b6fc42..7b6e5284 100644
--- a/table/pos_delete_partitioned_fanout_writer_test.go
+++ b/table/pos_delete_partitioned_fanout_writer_test.go
@@ -74,7 +74,7 @@ func TestPositionDeletePartitionedFanoutWriterProcessBatch(t 
*testing.T) {
                        name:                   "success",
                        pathToPartitionContext: 
map[string]partitionContext{"file://namespace/age_bucket=1/test.parquet": 
{partitionData: map[int]any{iceberg.PartitionDataIDStart: 1}, specID: 0}},
                        input:                  
mustLoadRecordBatchFromJSON(PositionalDeleteArrowSchema, `[{"file_path": 
"file://namespace/age_bucket=1/test.parquet", "pos": 100}]`),
-                       expectedDataFile:       &mockDataFile{columnSizes: 
map[int]int64{2147483545: 88, 2147483546: 174}, format: iceberg.ParquetFile, 
partition: map[int]any{iceberg.PartitionDataIDStart: 1}, count: 1, specid: 0, 
contentType: iceberg.EntryContentPosDeletes, sortOrderID: ptr(1)},
+                       expectedDataFile:       &mockDataFile{columnSizes: 
map[int]int64{2147483545: 86, 2147483546: 172}, format: iceberg.ParquetFile, 
partition: map[int]any{iceberg.PartitionDataIDStart: 1}, count: 1, specid: 0, 
contentType: iceberg.EntryContentPosDeletes, sortOrderID: ptr(1)},
                },
                // This test case illustrates how the 
positionDeletePartitionedFanoutWriter does not validate that all records
                // in a batch have the same file path. Doing so would be 
prohibitive in the current implementation and
@@ -84,7 +84,7 @@ func TestPositionDeletePartitionedFanoutWriterProcessBatch(t 
*testing.T) {
                        name:                   "batch with records having 
different file paths",
                        pathToPartitionContext: 
map[string]partitionContext{"file://namespace/age_bucket=1/test.parquet": 
{partitionData: map[int]any{iceberg.PartitionDataIDStart: 1}, specID: 0}},
                        input:                  
mustLoadRecordBatchFromJSON(PositionalDeleteArrowSchema, `[{"file_path": 
"file://namespace/age_bucket=1/test.parquet", "pos": 100}, {"file_path": 
"file://namespace/age_bucket=0/test.parquet", "pos": 10}]`),
-                       expectedDataFile:       &mockDataFile{columnSizes: 
map[int]int64{2147483545: 96, 2147483546: 187}, format: iceberg.ParquetFile, 
partition: map[int]any{iceberg.PartitionDataIDStart: 1}, count: 2, specid: 0, 
contentType: iceberg.EntryContentPosDeletes, sortOrderID: ptr(1)},
+                       expectedDataFile:       &mockDataFile{columnSizes: 
map[int]int64{2147483545: 94, 2147483546: 185}, format: iceberg.ParquetFile, 
partition: map[int]any{iceberg.PartitionDataIDStart: 1}, count: 2, specid: 0, 
contentType: iceberg.EntryContentPosDeletes, sortOrderID: ptr(1)},
                },
        }
 
diff --git a/table/table_test.go b/table/table_test.go
index 17c5bd16..adf8c052 100644
--- a/table/table_test.go
+++ b/table/table_test.go
@@ -546,12 +546,12 @@ func (t *TableWritingTestSuite) 
TestAddFilesUnpartitioned() {
                        Operation: table.OpAppend,
                        Properties: iceberg.Properties{
                                "added-data-files":       "5",
-                               "added-files-size":       "3590",
+                               "added-files-size":       "3070",
                                "added-records":          "5",
                                "total-data-files":       "5",
                                "total-delete-files":     "0",
                                "total-equality-deletes": "0",
-                               "total-files-size":       "3590",
+                               "total-files-size":       "3070",
                                "total-position-deletes": "0",
                                "total-records":          "5",
                        },
@@ -767,13 +767,13 @@ func (t *TableWritingTestSuite) 
TestAddFilesPartitionedTable() {
                        Operation: table.OpAppend,
                        Properties: iceberg.Properties{
                                "added-data-files":        "5",
-                               "added-files-size":        "3590",
+                               "added-files-size":        "3070",
                                "added-records":           "5",
                                "changed-partition-count": "1",
                                "total-data-files":        "5",
                                "total-delete-files":      "0",
                                "total-equality-deletes":  "0",
-                               "total-files-size":        "3590",
+                               "total-files-size":        "3070",
                                "total-position-deletes":  "0",
                                "total-records":           "5",
                        },
@@ -1133,15 +1133,15 @@ func (t *TableWritingTestSuite) TestReplaceDataFiles() {
                Operation: table.OpOverwrite,
                Properties: iceberg.Properties{
                        "added-data-files":       "1",
-                       "added-files-size":       "1066",
+                       "added-files-size":       "963",
                        "added-records":          "4",
                        "deleted-data-files":     "2",
                        "deleted-records":        "4",
-                       "removed-files-size":     "2132",
+                       "removed-files-size":     "1816",
                        "total-data-files":       "4",
                        "total-delete-files":     "0",
                        "total-equality-deletes": "0",
-                       "total-files-size":       "4264",
+                       "total-files-size":       "3687",
                        "total-position-deletes": "0",
                        "total-records":          "10",
                },

(iceberg-go) branch main updated: fix(manifest): stop writing deprecated distinct_counts (field-id 111) (#1102)

Reply via email to