This is an automated email from the ASF dual-hosted git repository.

laskoviymishka pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-go.git


The following commit(s) were added to refs/heads/main by this push:
     new bcf85500 feat(puffin): validate deletion-vector blob header invariants 
on read (#1055)
bcf85500 is described below

commit bcf85500d8f35eeac80b93b7d074e5b73da4d955
Author: Andrei Tserakhau <[email protected]>
AuthorDate: Wed May 13 21:53:49 2026 +0200

    feat(puffin): validate deletion-vector blob header invariants on read 
(#1055)
    
    Closes #1048. Spec requires `deletion-vector-v1` blobs to carry
    snapshot-id and sequence-number set to -1. The writer enforces this on
    the way out; validateBlobs now mirrors the check on the way in, so a
    malformed DV puffin written by a third-party tool is rejected before its
    bytes reach dv.DeserializeDV.
    
    Test byte-patches a non-DV blob to claim it's a deletion-vector and
    asserts the reader rejects both invariant violations; uses a same-
    length placeholder type so the footer-payload-size trailer stays
    accurate.
---
 puffin/dv_header_validation_test.go | 135 ++++++++++++++++++++++++++++++++++++
 puffin/puffin_reader.go             |  18 +++++
 puffin/puffin_writer.go             |  10 ++-
 3 files changed, 160 insertions(+), 3 deletions(-)

diff --git a/puffin/dv_header_validation_test.go 
b/puffin/dv_header_validation_test.go
new file mode 100644
index 00000000..7975bfec
--- /dev/null
+++ b/puffin/dv_header_validation_test.go
@@ -0,0 +1,135 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package puffin_test
+
+import (
+       "bytes"
+       "testing"
+
+       "github.com/apache/iceberg-go/puffin"
+       "github.com/stretchr/testify/assert"
+       "github.com/stretchr/testify/require"
+)
+
+// placeholderBlobType is exactly the same byte length as
+// puffin.BlobTypeDeletionVector ("deletion-vector-v1", 18 chars) so a
+// byte-replace in the footer JSON keeps the file's footer-payload-size field
+// accurate. The puffin writer accepts arbitrary strings for non-DV blob types,
+// so this synthetic type passes validation on the way out.
+const placeholderBlobType puffin.BlobType = "filler-blob-type01"
+
+// Compile-time guarantee that placeholderBlobType and the DV blob type have
+// equal byte length, so the byte-replace in writeWithPatchedType doesn't
+// shift the footer-payload-size trailer. Produces an "array bound must be
+// non-negative" build error if the lengths ever diverge.
+var _ = [len(puffin.BlobTypeDeletionVector) - len(placeholderBlobType)]byte{}
+
+// rowPositionFieldID is the reserved Iceberg field ID for the row-position
+// metadata column that a deletion vector covers. Spec-compliant DV blobs
+// list this in `fields`; Java's BaseDVFileWriter always emits it.
+const rowPositionFieldID int32 = 2147483546
+
+// writeWithPatchedType writes a blob whose type is placeholderBlobType and
+// carries the requested snapshot-id / sequence-number, then byte-rewrites the
+// footer JSON to claim the blob is `deletion-vector-v1`. The puffin writer
+// rejects DV-typed blobs that don't satisfy snapshot-id/sequence-number == -1,
+// so this byte patch is how we construct a malformed puffin file the reader's
+// validateBlobs must catch. Same-length swap keeps the footer-payload-size
+// trailer field accurate.
+func writeWithPatchedType(t *testing.T, snapshotID, sequenceNumber int64) 
[]byte {
+       t.Helper()
+
+       buf := &bytes.Buffer{}
+       w, err := puffin.NewWriter(buf)
+       require.NoError(t, err)
+       _, err = w.AddBlob(puffin.BlobMetadataInput{
+               Type:           placeholderBlobType,
+               SnapshotID:     snapshotID,
+               SequenceNumber: sequenceNumber,
+               Fields:         []int32{rowPositionFieldID},
+       }, []byte("payload"))
+       require.NoError(t, err)
+       require.NoError(t, w.Finish())
+
+       patched := bytes.Replace(
+               buf.Bytes(),
+               []byte(`"type":"`+string(placeholderBlobType)+`"`),
+               []byte(`"type":"`+string(puffin.BlobTypeDeletionVector)+`"`),
+               1,
+       )
+       require.NotEqual(t, buf.Bytes(), patched, "footer JSON did not contain 
the expected type substring")
+
+       return patched
+}
+
+// TestReaderRejectsDeletionVectorWithBadHeader pins the reader-side 
enforcement
+// of the puffin spec's `deletion-vector-v1` invariants: snapshot-id and
+// sequence-number on the blob metadata MUST both be -1. The writer already
+// rejects bad values on the way out; this is the symmetric reader-side check,
+// so a malformed DV puffin produced by a third-party tool can't sneak
+// arbitrary blob bytes into dv.DeserializeDV.
+func TestReaderRejectsDeletionVectorWithBadHeader(t *testing.T) {
+       t.Run("rejects snapshot-id != -1", func(t *testing.T) {
+               _, err := 
puffin.NewReader(bytes.NewReader(writeWithPatchedType(t, 5, -1)))
+               require.Error(t, err)
+               // Assert on the actual value carried in the error (not the spec
+               // substring), so the assertion fails if the formatter ever 
drops
+               // the value rather than passing for the wrong reason.
+               assert.Contains(t, err.Error(), "snapshot-id")
+               assert.Contains(t, err.Error(), "got 5")
+               assert.Contains(t, err.Error(), 
string(puffin.BlobTypeDeletionVector))
+       })
+
+       t.Run("rejects sequence-number != -1", func(t *testing.T) {
+               _, err := 
puffin.NewReader(bytes.NewReader(writeWithPatchedType(t, -1, 10)))
+               require.Error(t, err)
+               assert.Contains(t, err.Error(), "sequence-number")
+               assert.Contains(t, err.Error(), "got 10")
+               assert.Contains(t, err.Error(), 
string(puffin.BlobTypeDeletionVector))
+       })
+
+       t.Run("both fields wrong: snapshot-id check fires first", func(t 
*testing.T) {
+               // Pins the precedence of the two checks. If a future refactor
+               // reorders them, this test fails — the ordering is part of the
+               // implicit contract for log/diagnostic consumers grepping for
+               // "snapshot-id" before "sequence-number".
+               _, err := 
puffin.NewReader(bytes.NewReader(writeWithPatchedType(t, 7, 13)))
+               require.Error(t, err)
+               assert.Contains(t, err.Error(), "snapshot-id")
+               assert.Contains(t, err.Error(), "got 7")
+               assert.NotContains(t, err.Error(), "sequence-number",
+                       "snapshot-id check should short-circuit before 
sequence-number is reported")
+       })
+
+       t.Run("accepts well-formed deletion-vector-v1 blob", func(t *testing.T) 
{
+               buf := &bytes.Buffer{}
+               w, err := puffin.NewWriter(buf)
+               require.NoError(t, err)
+               _, err = w.AddBlob(puffin.BlobMetadataInput{
+                       Type:           puffin.BlobTypeDeletionVector,
+                       SnapshotID:     -1,
+                       SequenceNumber: -1,
+                       Fields:         []int32{rowPositionFieldID},
+               }, []byte("payload"))
+               require.NoError(t, err)
+               require.NoError(t, w.Finish())
+
+               _, err = puffin.NewReader(bytes.NewReader(buf.Bytes()))
+               require.NoError(t, err)
+       })
+}
diff --git a/puffin/puffin_reader.go b/puffin/puffin_reader.go
index d178a2ab..5945fb23 100644
--- a/puffin/puffin_reader.go
+++ b/puffin/puffin_reader.go
@@ -369,6 +369,24 @@ func (r *Reader) validateBlobs(blobs []BlobMetadata, 
footerStart int64) error {
                        return fmt.Errorf("puffin: blob %d: extends into 
footer: offset=%d length=%d footerStart=%d",
                                i, blob.Offset, blob.Length, footerStart)
                }
+
+               // Spec invariant for deletion-vector-v1: snapshot-id and 
sequence-
+               // number MUST be -1. The writer enforces this on the way out; 
the
+               // reader symmetrically rejects malformed blobs on the way in 
so a
+               // third-party tool that violates the contract can't sneak data
+               // into the deserialization layer. Phrasing matches the writer's
+               // "<type> requires X to be -1, got N" so both sides share a
+               // grep-able canonical message.
+               if blob.Type == BlobTypeDeletionVector {
+                       if blob.SnapshotID != -1 {
+                               return fmt.Errorf("puffin: blob %d: %s requires 
snapshot-id to be -1, got %d",
+                                       i, BlobTypeDeletionVector, 
blob.SnapshotID)
+                       }
+                       if blob.SequenceNumber != -1 {
+                               return fmt.Errorf("puffin: blob %d: %s requires 
sequence-number to be -1, got %d",
+                                       i, BlobTypeDeletionVector, 
blob.SequenceNumber)
+                       }
+               }
        }
 
        return nil
diff --git a/puffin/puffin_writer.go b/puffin/puffin_writer.go
index fd0a39dd..8083a8b3 100644
--- a/puffin/puffin_writer.go
+++ b/puffin/puffin_writer.go
@@ -129,13 +129,17 @@ func (w *Writer) AddBlob(input BlobMetadataInput, data 
[]byte) (BlobMetadata, er
                return BlobMetadata{}, errors.New("puffin: cannot add blob: 
fields is required")
        }
 
-       // Deletion vectors have special requirements per spec
+       // Deletion vectors have special requirements per spec. Format matches
+       // the reader-side validateBlobs check so both sides surface the same
+       // "<type> requires X to be -1, got N" phrasing for log grep.
        if input.Type == BlobTypeDeletionVector {
                if input.SnapshotID != -1 {
-                       return BlobMetadata{}, errors.New("puffin: 
deletion-vector-v1 requires snapshot-id to be -1")
+                       return BlobMetadata{}, fmt.Errorf("puffin: %s requires 
snapshot-id to be -1, got %d",
+                               BlobTypeDeletionVector, input.SnapshotID)
                }
                if input.SequenceNumber != -1 {
-                       return BlobMetadata{}, errors.New("puffin: 
deletion-vector-v1 requires sequence-number to be -1")
+                       return BlobMetadata{}, fmt.Errorf("puffin: %s requires 
sequence-number to be -1, got %d",
+                               BlobTypeDeletionVector, input.SequenceNumber)
                }
        }
 

Reply via email to