This is an automated email from the ASF dual-hosted git repository.
laskoviymishka pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-go.git
The following commit(s) were added to refs/heads/main by this push:
new bcf85500 feat(puffin): validate deletion-vector blob header invariants
on read (#1055)
bcf85500 is described below
commit bcf85500d8f35eeac80b93b7d074e5b73da4d955
Author: Andrei Tserakhau <[email protected]>
AuthorDate: Wed May 13 21:53:49 2026 +0200
feat(puffin): validate deletion-vector blob header invariants on read
(#1055)
Closes #1048. Spec requires `deletion-vector-v1` blobs to carry
snapshot-id and sequence-number set to -1. The writer enforces this on
the way out; validateBlobs now mirrors the check on the way in, so a
malformed DV puffin written by a third-party tool is rejected before its
bytes reach dv.DeserializeDV.
Test byte-patches a non-DV blob to claim it's a deletion-vector and
asserts the reader rejects both invariant violations; uses a same-
length placeholder type so the footer-payload-size trailer stays
accurate.
---
puffin/dv_header_validation_test.go | 135 ++++++++++++++++++++++++++++++++++++
puffin/puffin_reader.go | 18 +++++
puffin/puffin_writer.go | 10 ++-
3 files changed, 160 insertions(+), 3 deletions(-)
diff --git a/puffin/dv_header_validation_test.go
b/puffin/dv_header_validation_test.go
new file mode 100644
index 00000000..7975bfec
--- /dev/null
+++ b/puffin/dv_header_validation_test.go
@@ -0,0 +1,135 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package puffin_test
+
+import (
+ "bytes"
+ "testing"
+
+ "github.com/apache/iceberg-go/puffin"
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+)
+
+// placeholderBlobType is exactly the same byte length as
+// puffin.BlobTypeDeletionVector ("deletion-vector-v1", 18 chars) so a
+// byte-replace in the footer JSON keeps the file's footer-payload-size field
+// accurate. The puffin writer accepts arbitrary strings for non-DV blob types,
+// so this synthetic type passes validation on the way out.
+const placeholderBlobType puffin.BlobType = "filler-blob-type01"
+
+// Compile-time guarantee that placeholderBlobType and the DV blob type have
+// equal byte length, so the byte-replace in writeWithPatchedType doesn't
+// shift the footer-payload-size trailer. Produces an "array bound must be
+// non-negative" build error if the lengths ever diverge.
+var _ = [len(puffin.BlobTypeDeletionVector) - len(placeholderBlobType)]byte{}
+
+// rowPositionFieldID is the reserved Iceberg field ID for the row-position
+// metadata column that a deletion vector covers. Spec-compliant DV blobs
+// list this in `fields`; Java's BaseDVFileWriter always emits it.
+const rowPositionFieldID int32 = 2147483546
+
+// writeWithPatchedType writes a blob whose type is placeholderBlobType and
+// carries the requested snapshot-id / sequence-number, then byte-rewrites the
+// footer JSON to claim the blob is `deletion-vector-v1`. The puffin writer
+// rejects DV-typed blobs that don't satisfy snapshot-id/sequence-number == -1,
+// so this byte patch is how we construct a malformed puffin file the reader's
+// validateBlobs must catch. Same-length swap keeps the footer-payload-size
+// trailer field accurate.
+func writeWithPatchedType(t *testing.T, snapshotID, sequenceNumber int64)
[]byte {
+ t.Helper()
+
+ buf := &bytes.Buffer{}
+ w, err := puffin.NewWriter(buf)
+ require.NoError(t, err)
+ _, err = w.AddBlob(puffin.BlobMetadataInput{
+ Type: placeholderBlobType,
+ SnapshotID: snapshotID,
+ SequenceNumber: sequenceNumber,
+ Fields: []int32{rowPositionFieldID},
+ }, []byte("payload"))
+ require.NoError(t, err)
+ require.NoError(t, w.Finish())
+
+ patched := bytes.Replace(
+ buf.Bytes(),
+ []byte(`"type":"`+string(placeholderBlobType)+`"`),
+ []byte(`"type":"`+string(puffin.BlobTypeDeletionVector)+`"`),
+ 1,
+ )
+ require.NotEqual(t, buf.Bytes(), patched, "footer JSON did not contain
the expected type substring")
+
+ return patched
+}
+
+// TestReaderRejectsDeletionVectorWithBadHeader pins the reader-side
enforcement
+// of the puffin spec's `deletion-vector-v1` invariants: snapshot-id and
+// sequence-number on the blob metadata MUST both be -1. The writer already
+// rejects bad values on the way out; this is the symmetric reader-side check,
+// so a malformed DV puffin produced by a third-party tool can't sneak
+// arbitrary blob bytes into dv.DeserializeDV.
+func TestReaderRejectsDeletionVectorWithBadHeader(t *testing.T) {
+ t.Run("rejects snapshot-id != -1", func(t *testing.T) {
+ _, err :=
puffin.NewReader(bytes.NewReader(writeWithPatchedType(t, 5, -1)))
+ require.Error(t, err)
+ // Assert on the actual value carried in the error (not the spec
+ // substring), so the assertion fails if the formatter ever
drops
+ // the value rather than passing for the wrong reason.
+ assert.Contains(t, err.Error(), "snapshot-id")
+ assert.Contains(t, err.Error(), "got 5")
+ assert.Contains(t, err.Error(),
string(puffin.BlobTypeDeletionVector))
+ })
+
+ t.Run("rejects sequence-number != -1", func(t *testing.T) {
+ _, err :=
puffin.NewReader(bytes.NewReader(writeWithPatchedType(t, -1, 10)))
+ require.Error(t, err)
+ assert.Contains(t, err.Error(), "sequence-number")
+ assert.Contains(t, err.Error(), "got 10")
+ assert.Contains(t, err.Error(),
string(puffin.BlobTypeDeletionVector))
+ })
+
+ t.Run("both fields wrong: snapshot-id check fires first", func(t
*testing.T) {
+ // Pins the precedence of the two checks. If a future refactor
+ // reorders them, this test fails — the ordering is part of the
+ // implicit contract for log/diagnostic consumers grepping for
+ // "snapshot-id" before "sequence-number".
+ _, err :=
puffin.NewReader(bytes.NewReader(writeWithPatchedType(t, 7, 13)))
+ require.Error(t, err)
+ assert.Contains(t, err.Error(), "snapshot-id")
+ assert.Contains(t, err.Error(), "got 7")
+ assert.NotContains(t, err.Error(), "sequence-number",
+ "snapshot-id check should short-circuit before
sequence-number is reported")
+ })
+
+ t.Run("accepts well-formed deletion-vector-v1 blob", func(t *testing.T)
{
+ buf := &bytes.Buffer{}
+ w, err := puffin.NewWriter(buf)
+ require.NoError(t, err)
+ _, err = w.AddBlob(puffin.BlobMetadataInput{
+ Type: puffin.BlobTypeDeletionVector,
+ SnapshotID: -1,
+ SequenceNumber: -1,
+ Fields: []int32{rowPositionFieldID},
+ }, []byte("payload"))
+ require.NoError(t, err)
+ require.NoError(t, w.Finish())
+
+ _, err = puffin.NewReader(bytes.NewReader(buf.Bytes()))
+ require.NoError(t, err)
+ })
+}
diff --git a/puffin/puffin_reader.go b/puffin/puffin_reader.go
index d178a2ab..5945fb23 100644
--- a/puffin/puffin_reader.go
+++ b/puffin/puffin_reader.go
@@ -369,6 +369,24 @@ func (r *Reader) validateBlobs(blobs []BlobMetadata,
footerStart int64) error {
return fmt.Errorf("puffin: blob %d: extends into
footer: offset=%d length=%d footerStart=%d",
i, blob.Offset, blob.Length, footerStart)
}
+
+ // Spec invariant for deletion-vector-v1: snapshot-id and
sequence-
+ // number MUST be -1. The writer enforces this on the way out;
the
+ // reader symmetrically rejects malformed blobs on the way in
so a
+ // third-party tool that violates the contract can't sneak data
+ // into the deserialization layer. Phrasing matches the writer's
+ // "<type> requires X to be -1, got N" so both sides share a
+ // grep-able canonical message.
+ if blob.Type == BlobTypeDeletionVector {
+ if blob.SnapshotID != -1 {
+ return fmt.Errorf("puffin: blob %d: %s requires
snapshot-id to be -1, got %d",
+ i, BlobTypeDeletionVector,
blob.SnapshotID)
+ }
+ if blob.SequenceNumber != -1 {
+ return fmt.Errorf("puffin: blob %d: %s requires
sequence-number to be -1, got %d",
+ i, BlobTypeDeletionVector,
blob.SequenceNumber)
+ }
+ }
}
return nil
diff --git a/puffin/puffin_writer.go b/puffin/puffin_writer.go
index fd0a39dd..8083a8b3 100644
--- a/puffin/puffin_writer.go
+++ b/puffin/puffin_writer.go
@@ -129,13 +129,17 @@ func (w *Writer) AddBlob(input BlobMetadataInput, data
[]byte) (BlobMetadata, er
return BlobMetadata{}, errors.New("puffin: cannot add blob:
fields is required")
}
- // Deletion vectors have special requirements per spec
+ // Deletion vectors have special requirements per spec. Format matches
+ // the reader-side validateBlobs check so both sides surface the same
+ // "<type> requires X to be -1, got N" phrasing for log grep.
if input.Type == BlobTypeDeletionVector {
if input.SnapshotID != -1 {
- return BlobMetadata{}, errors.New("puffin:
deletion-vector-v1 requires snapshot-id to be -1")
+ return BlobMetadata{}, fmt.Errorf("puffin: %s requires
snapshot-id to be -1, got %d",
+ BlobTypeDeletionVector, input.SnapshotID)
}
if input.SequenceNumber != -1 {
- return BlobMetadata{}, errors.New("puffin:
deletion-vector-v1 requires sequence-number to be -1")
+ return BlobMetadata{}, fmt.Errorf("puffin: %s requires
sequence-number to be -1, got %d",
+ BlobTypeDeletionVector, input.SequenceNumber)
}
}