This is an automated email from the ASF dual-hosted git repository.

fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git


The following commit(s) were added to refs/heads/main by this push:
     new c9d7867  feat: implement manifest entry metadata inheritance (#178)
c9d7867 is described below

commit c9d7867fa9aa5891f37259446a0cd56097020682
Author: Li Feiyang <lifeiy...@zju.edu.cn>
AuthorDate: Wed Aug 27 04:55:29 2025 +0800

    feat: implement manifest entry metadata inheritance (#178)
---
 src/iceberg/CMakeLists.txt              |   1 +
 src/iceberg/inheritable_metadata.cc     | 110 ++++++++++++++++++++++
 src/iceberg/inheritable_metadata.h      | 111 ++++++++++++++++++++++
 src/iceberg/manifest_reader.cc          |  30 +++++-
 src/iceberg/manifest_reader.h           |  16 +++-
 src/iceberg/manifest_reader_internal.cc |   8 +-
 src/iceberg/manifest_reader_internal.h  |  10 +-
 src/iceberg/table_scan.cc               |   8 +-
 test/manifest_list_reader_test.cc       |   2 +-
 test/manifest_reader_test.cc            | 160 +++++++++++++++++++-------------
 10 files changed, 378 insertions(+), 78 deletions(-)

diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt
index 316b02c..b509c25 100644
--- a/src/iceberg/CMakeLists.txt
+++ b/src/iceberg/CMakeLists.txt
@@ -24,6 +24,7 @@ set(ICEBERG_SOURCES
     expression/literal.cc
     file_reader.cc
     file_writer.cc
+    inheritable_metadata.cc
     json_internal.cc
     manifest_entry.cc
     manifest_list.cc
diff --git a/src/iceberg/inheritable_metadata.cc 
b/src/iceberg/inheritable_metadata.cc
new file mode 100644
index 0000000..ae09208
--- /dev/null
+++ b/src/iceberg/inheritable_metadata.cc
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/inheritable_metadata.h"
+
+#include <cassert>
+#include <utility>
+
+#include <iceberg/result.h>
+
+#include "iceberg/manifest_entry.h"
+#include "iceberg/manifest_list.h"
+#include "iceberg/snapshot.h"
+
+namespace iceberg {
+
+BaseInheritableMetadata::BaseInheritableMetadata(int32_t spec_id, int64_t 
snapshot_id,
+                                                 int64_t sequence_number,
+                                                 std::string manifest_location)
+    : spec_id_(spec_id),
+      snapshot_id_(snapshot_id),
+      sequence_number_(sequence_number),
+      manifest_location_(std::move(manifest_location)) {}
+
+Status BaseInheritableMetadata::Apply(ManifestEntry& entry) {
+  if (!entry.snapshot_id.has_value()) {
+    entry.snapshot_id = snapshot_id_;
+  }
+
+  // In v1 metadata, the data sequence number is not persisted and can be 
safely defaulted
+  // to 0.
+  // In v2 metadata, the data sequence number should be inherited iff the 
entry status
+  // is ADDED.
+  if (!entry.sequence_number.has_value() &&
+      (sequence_number_ == 0 || entry.status == ManifestStatus::kAdded)) {
+    entry.sequence_number = sequence_number_;
+  }
+
+  // In v1 metadata, the file sequence number is not persisted and can be 
safely defaulted
+  // to 0.
+  // In v2 metadata, the file sequence number should be inherited iff the 
entry status
+  // is ADDED.
+  if (!entry.file_sequence_number.has_value() &&
+      (sequence_number_ == 0 || entry.status == ManifestStatus::kAdded)) {
+    entry.file_sequence_number = sequence_number_;
+  }
+
+  if (entry.data_file) {
+    entry.data_file->partition_spec_id = spec_id_;
+  } else {
+    return InvalidManifest("Manifest entry has no data file");
+  }
+
+  return {};
+}
+
+Status EmptyInheritableMetadata::Apply(ManifestEntry& entry) {
+  if (!entry.snapshot_id.has_value()) {
+    return InvalidManifest(
+        "Entries must have explicit snapshot ids if inherited metadata is 
empty");
+  }
+  return {};
+}
+
+CopyInheritableMetadata::CopyInheritableMetadata(int64_t snapshot_id)
+    : snapshot_id_(snapshot_id) {}
+
+Status CopyInheritableMetadata::Apply(ManifestEntry& entry) {
+  entry.snapshot_id = snapshot_id_;
+  return {};
+}
+
+Result<std::unique_ptr<InheritableMetadata>> 
InheritableMetadataFactory::Empty() {
+  return std::make_unique<EmptyInheritableMetadata>();
+}
+
+Result<std::unique_ptr<InheritableMetadata>> 
InheritableMetadataFactory::FromManifest(
+    const ManifestFile& manifest) {
+  // Validate that the manifest has a snapshot ID assigned
+  if (manifest.added_snapshot_id == Snapshot::kInvalidSnapshotId) {
+    return InvalidManifest("Manifest file {} has no snapshot ID", 
manifest.manifest_path);
+  }
+
+  return std::make_unique<BaseInheritableMetadata>(
+      manifest.partition_spec_id, manifest.added_snapshot_id, 
manifest.sequence_number,
+      manifest.manifest_path);
+}
+
+Result<std::unique_ptr<InheritableMetadata>> 
InheritableMetadataFactory::ForCopy(
+    int64_t snapshot_id) {
+  return std::make_unique<CopyInheritableMetadata>(snapshot_id);
+}
+
+}  // namespace iceberg
diff --git a/src/iceberg/inheritable_metadata.h 
b/src/iceberg/inheritable_metadata.h
new file mode 100644
index 0000000..f06693a
--- /dev/null
+++ b/src/iceberg/inheritable_metadata.h
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+/// \file iceberg/inheritable_metadata.h
+/// Metadata inheritance system for manifest entries.
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "iceberg/iceberg_export.h"
+#include "iceberg/result.h"
+#include "iceberg/type_fwd.h"
+
+namespace iceberg {
+
+/// \brief Interface for applying inheritable metadata to manifest entries.
+///
+/// When manifest entries have null values for certain fields (snapshot id,
+/// data sequence number, file sequence number), these values should be 
inherited
+/// from the manifest file. This interface provides a way to apply such 
inheritance rules.
+class ICEBERG_EXPORT InheritableMetadata {
+ public:
+  virtual ~InheritableMetadata() = default;
+
+  /// \brief Apply inheritable metadata to a manifest entry.
+  /// \param entry The manifest entry to modify.
+  /// \return Status indicating success or failure.
+  virtual Status Apply(ManifestEntry& entry) = 0;
+};
+
+/// \brief Base implementation of InheritableMetadata that handles standard 
inheritance
+/// rules.
+class ICEBERG_EXPORT BaseInheritableMetadata : public InheritableMetadata {
+ public:
+  /// \brief Constructor for base inheritable metadata.
+  /// \param spec_id Partition spec ID from the manifest.
+  /// \param snapshot_id Snapshot ID from the manifest.
+  /// \param sequence_number Sequence number from the manifest.
+  /// \param manifest_location Path to the manifest file.
+  BaseInheritableMetadata(int32_t spec_id, int64_t snapshot_id, int64_t 
sequence_number,
+                          std::string manifest_location);
+
+  Status Apply(ManifestEntry& entry) override;
+
+ private:
+  int32_t spec_id_;
+  int64_t snapshot_id_;
+  int64_t sequence_number_;
+  std::string manifest_location_;
+};
+
+/// \brief Empty implementation that applies no inheritance.
+class ICEBERG_EXPORT EmptyInheritableMetadata : public InheritableMetadata {
+ public:
+  Status Apply(ManifestEntry& entry) override;
+};
+
+/// \brief Metadata inheritance for copying manifests before commit.
+class ICEBERG_EXPORT CopyInheritableMetadata : public InheritableMetadata {
+ public:
+  /// \brief Constructor for copy metadata.
+  /// \param snapshot_id The snapshot ID to use for copying.
+  explicit CopyInheritableMetadata(int64_t snapshot_id);
+
+  Status Apply(ManifestEntry& entry) override;
+
+ private:
+  int64_t snapshot_id_;
+};
+
+/// \brief Factory for creating InheritableMetadata instances.
+class ICEBERG_EXPORT InheritableMetadataFactory {
+ public:
+  /// \brief Create an empty metadata instance that applies no inheritance.
+  static Result<std::unique_ptr<InheritableMetadata>> Empty();
+
+  /// \brief Create metadata instance from a manifest file.
+  /// \param manifest The manifest file to extract metadata from.
+  /// \return Inheritable metadata based on the manifest.
+  static Result<std::unique_ptr<InheritableMetadata>> FromManifest(
+      const ManifestFile& manifest);
+
+  /// \brief Create metadata instance for rewriting a manifest before commit.
+  /// \param snapshot_id The snapshot ID for the copy operation.
+  /// \return Inheritable metadata for copying.
+  static Result<std::unique_ptr<InheritableMetadata>> ForCopy(int64_t 
snapshot_id);
+
+ private:
+  InheritableMetadataFactory() = default;
+};
+
+}  // namespace iceberg
diff --git a/src/iceberg/manifest_reader.cc b/src/iceberg/manifest_reader.cc
index 208263d..42606fb 100644
--- a/src/iceberg/manifest_reader.cc
+++ b/src/iceberg/manifest_reader.cc
@@ -23,11 +23,33 @@
 #include "iceberg/manifest_list.h"
 #include "iceberg/manifest_reader_internal.h"
 #include "iceberg/schema.h"
+#include "iceberg/schema_internal.h"
 #include "iceberg/util/macros.h"
 
 namespace iceberg {
 
-Result<std::unique_ptr<ManifestReader>> ManifestReader::MakeReader(
+Result<std::unique_ptr<ManifestReader>> ManifestReader::Make(
+    const ManifestFile& manifest, std::shared_ptr<FileIO> file_io,
+    std::shared_ptr<Schema> partition_schema) {
+  auto manifest_entry_schema = 
ManifestEntry::TypeFromPartitionType(partition_schema);
+  std::shared_ptr<Schema> schema =
+      FromStructType(std::move(*manifest_entry_schema), std::nullopt);
+
+  ICEBERG_ASSIGN_OR_RAISE(auto reader,
+                          ReaderFactoryRegistry::Open(FileFormatType::kAvro,
+                                                      {.path = 
manifest.manifest_path,
+                                                       .length = 
manifest.manifest_length,
+                                                       .io = 
std::move(file_io),
+                                                       .projection = schema}));
+  // Create inheritable metadata for this manifest
+  ICEBERG_ASSIGN_OR_RAISE(auto inheritable_metadata,
+                          InheritableMetadataFactory::FromManifest(manifest));
+
+  return std::make_unique<ManifestReaderImpl>(std::move(reader), 
std::move(schema),
+                                              std::move(inheritable_metadata));
+}
+
+Result<std::unique_ptr<ManifestReader>> ManifestReader::Make(
     std::string_view manifest_location, std::shared_ptr<FileIO> file_io,
     std::shared_ptr<Schema> partition_schema) {
   auto manifest_entry_schema = 
ManifestEntry::TypeFromPartitionType(partition_schema);
@@ -39,10 +61,12 @@ Result<std::unique_ptr<ManifestReader>> 
ManifestReader::MakeReader(
                                                {.path = 
std::string(manifest_location),
                                                 .io = std::move(file_io),
                                                 .projection = schema}));
-  return std::make_unique<ManifestReaderImpl>(std::move(reader), 
std::move(schema));
+  ICEBERG_ASSIGN_OR_RAISE(auto inheritable_metadata, 
InheritableMetadataFactory::Empty());
+  return std::make_unique<ManifestReaderImpl>(std::move(reader), 
std::move(schema),
+                                              std::move(inheritable_metadata));
 }
 
-Result<std::unique_ptr<ManifestListReader>> ManifestListReader::MakeReader(
+Result<std::unique_ptr<ManifestListReader>> ManifestListReader::Make(
     std::string_view manifest_list_location, std::shared_ptr<FileIO> file_io) {
   std::vector<SchemaField> fields(ManifestFile::Type().fields().begin(),
                                   ManifestFile::Type().fields().end());
diff --git a/src/iceberg/manifest_reader.h b/src/iceberg/manifest_reader.h
index 5d231de..7dc0a20 100644
--- a/src/iceberg/manifest_reader.h
+++ b/src/iceberg/manifest_reader.h
@@ -25,8 +25,8 @@
 #include <memory>
 #include <vector>
 
-#include "iceberg/file_reader.h"
 #include "iceberg/iceberg_export.h"
+#include "iceberg/result.h"
 #include "iceberg/type_fwd.h"
 
 namespace iceberg {
@@ -37,11 +37,21 @@ class ICEBERG_EXPORT ManifestReader {
   virtual ~ManifestReader() = default;
   virtual Result<std::vector<ManifestEntry>> Entries() const = 0;
 
+  /// \brief Creates a reader for a manifest file.
+  /// \param manifest A ManifestFile object containing metadata about the 
manifest.
+  /// \param file_io File IO implementation to use.
+  /// \param partition_schema Schema for the partition.
+  /// \return A Result containing the reader or an error.
+  static Result<std::unique_ptr<ManifestReader>> Make(
+      const ManifestFile& manifest, std::shared_ptr<FileIO> file_io,
+      std::shared_ptr<Schema> partition_schema);
+
   /// \brief Creates a reader for a manifest file.
   /// \param manifest_location Path to the manifest file.
   /// \param file_io File IO implementation to use.
+  /// \param partition_schema Schema for the partition.
   /// \return A Result containing the reader or an error.
-  static Result<std::unique_ptr<ManifestReader>> MakeReader(
+  static Result<std::unique_ptr<ManifestReader>> Make(
       std::string_view manifest_location, std::shared_ptr<FileIO> file_io,
       std::shared_ptr<Schema> partition_schema);
 };
@@ -56,7 +66,7 @@ class ICEBERG_EXPORT ManifestListReader {
   /// \param manifest_list_location Path to the manifest list file.
   /// \param file_io File IO implementation to use.
   /// \return A Result containing the reader or an error.
-  static Result<std::unique_ptr<ManifestListReader>> MakeReader(
+  static Result<std::unique_ptr<ManifestListReader>> Make(
       std::string_view manifest_list_location, std::shared_ptr<FileIO> 
file_io);
 };
 
diff --git a/src/iceberg/manifest_reader_internal.cc 
b/src/iceberg/manifest_reader_internal.cc
index e3b9bec..0ff743e 100644
--- a/src/iceberg/manifest_reader_internal.cc
+++ b/src/iceberg/manifest_reader_internal.cc
@@ -19,8 +19,6 @@
 
 #include "manifest_reader_internal.h"
 
-#include <array>
-
 #include <nanoarrow/nanoarrow.h>
 
 #include "iceberg/arrow_c_data_guard_internal.h"
@@ -543,6 +541,12 @@ Result<std::vector<ManifestEntry>> 
ManifestReaderImpl::Entries() const {
       break;
     }
   }
+
+  // Apply inheritance to all entries
+  for (auto& entry : manifest_entries) {
+    ICEBERG_RETURN_UNEXPECTED(inheritable_metadata_->Apply(entry));
+  }
+
   return manifest_entries;
 }
 
diff --git a/src/iceberg/manifest_reader_internal.h 
b/src/iceberg/manifest_reader_internal.h
index a367ede..3144d07 100644
--- a/src/iceberg/manifest_reader_internal.h
+++ b/src/iceberg/manifest_reader_internal.h
@@ -22,6 +22,8 @@
 /// \file iceberg/internal/manifest_reader_internal.h
 /// Reader implementation for manifest list files and manifest files.
 
+#include "iceberg/file_reader.h"
+#include "iceberg/inheritable_metadata.h"
 #include "iceberg/manifest_reader.h"
 
 namespace iceberg {
@@ -30,14 +32,18 @@ namespace iceberg {
 class ManifestReaderImpl : public ManifestReader {
  public:
   explicit ManifestReaderImpl(std::unique_ptr<Reader> reader,
-                              std::shared_ptr<Schema> schema)
-      : schema_(std::move(schema)), reader_(std::move(reader)) {}
+                              std::shared_ptr<Schema> schema,
+                              std::unique_ptr<InheritableMetadata> 
inheritable_metadata)
+      : schema_(std::move(schema)),
+        reader_(std::move(reader)),
+        inheritable_metadata_(std::move(inheritable_metadata)) {}
 
   Result<std::vector<ManifestEntry>> Entries() const override;
 
  private:
   std::shared_ptr<Schema> schema_;
   std::unique_ptr<Reader> reader_;
+  std::unique_ptr<InheritableMetadata> inheritable_metadata_;
 };
 
 /// \brief Read manifest files from a manifest list file.
diff --git a/src/iceberg/table_scan.cc b/src/iceberg/table_scan.cc
index 43deb2a..a7edd5d 100644
--- a/src/iceberg/table_scan.cc
+++ b/src/iceberg/table_scan.cc
@@ -147,7 +147,7 @@ DataTableScan::DataTableScan(TableScanContext context, 
std::shared_ptr<FileIO> f
 Result<std::vector<std::shared_ptr<FileScanTask>>> DataTableScan::PlanFiles() 
const {
   ICEBERG_ASSIGN_OR_RAISE(
       auto manifest_list_reader,
-      ManifestListReader::MakeReader(context_.snapshot->manifest_list, 
file_io_));
+      ManifestListReader::Make(context_.snapshot->manifest_list, file_io_));
   ICEBERG_ASSIGN_OR_RAISE(auto manifest_files, manifest_list_reader->Files());
 
   std::vector<std::shared_ptr<FileScanTask>> tasks;
@@ -155,9 +155,9 @@ Result<std::vector<std::shared_ptr<FileScanTask>>> 
DataTableScan::PlanFiles() co
   auto partition_schema = partition_spec->schema();
 
   for (const auto& manifest_file : manifest_files) {
-    ICEBERG_ASSIGN_OR_RAISE(auto manifest_reader,
-                            
ManifestReader::MakeReader(manifest_file.manifest_path,
-                                                       file_io_, 
partition_schema));
+    ICEBERG_ASSIGN_OR_RAISE(
+        auto manifest_reader,
+        ManifestReader::Make(manifest_file, file_io_, partition_schema));
     ICEBERG_ASSIGN_OR_RAISE(auto manifests, manifest_reader->Entries());
 
     // TODO(gty404): filter manifests using partition spec and filter 
expression
diff --git a/test/manifest_list_reader_test.cc 
b/test/manifest_list_reader_test.cc
index 09984f3..a3c08c3 100644
--- a/test/manifest_list_reader_test.cc
+++ b/test/manifest_list_reader_test.cc
@@ -43,7 +43,7 @@ class ManifestListReaderTestBase : public TempFileTestBase {
   void TestManifestListReading(const std::string& resource_name,
                                const std::vector<ManifestFile>& 
expected_manifest_list) {
     std::string path = GetResourcePath(resource_name);
-    auto manifest_reader_result = ManifestListReader::MakeReader(path, 
file_io_);
+    auto manifest_reader_result = ManifestListReader::Make(path, file_io_);
     ASSERT_EQ(manifest_reader_result.has_value(), true);
 
     auto manifest_reader = std::move(manifest_reader_result.value());
diff --git a/test/manifest_reader_test.cc b/test/manifest_reader_test.cc
index 55fbdd8..db703c1 100644
--- a/test/manifest_reader_test.cc
+++ b/test/manifest_reader_test.cc
@@ -27,13 +27,14 @@
 #include "iceberg/arrow/arrow_fs_file_io_internal.h"
 #include "iceberg/avro/avro_register.h"
 #include "iceberg/manifest_entry.h"
+#include "iceberg/manifest_list.h"
 #include "iceberg/schema.h"
 #include "temp_file_test_base.h"
 #include "test_common.h"
 
 namespace iceberg {
 
-class ManifestReaderV1Test : public TempFileTestBase {
+class ManifestReaderTestBase : public TempFileTestBase {
  protected:
   static void SetUpTestSuite() { avro::RegisterAll(); }
 
@@ -43,7 +44,44 @@ class ManifestReaderV1Test : public TempFileTestBase {
     file_io_ = 
std::make_shared<iceberg::arrow::ArrowFileSystemFileIO>(local_fs_);
   }
 
-  std::vector<ManifestEntry> PrepareV1ManifestEntries() {
+  void TestManifestReading(const std::string& resource_name,
+                           const std::vector<ManifestEntry>& expected_entries,
+                           std::shared_ptr<Schema> partition_schema = nullptr) 
{
+    std::string path = GetResourcePath(resource_name);
+    auto manifest_reader_result = ManifestReader::Make(path, file_io_, 
partition_schema);
+    ASSERT_TRUE(manifest_reader_result.has_value())
+        << manifest_reader_result.error().message;
+
+    auto manifest_reader = std::move(manifest_reader_result.value());
+    auto read_result = manifest_reader->Entries();
+    ASSERT_TRUE(read_result.has_value()) << read_result.error().message;
+    ASSERT_EQ(read_result.value().size(), expected_entries.size());
+    ASSERT_EQ(read_result.value(), expected_entries);
+  }
+
+  void TestManifestReadingWithManifestFile(
+      const ManifestFile& manifest_file,
+      const std::vector<ManifestEntry>& expected_entries,
+      std::shared_ptr<Schema> partition_schema = nullptr) {
+    auto manifest_reader_result =
+        ManifestReader::Make(manifest_file, file_io_, partition_schema);
+    ASSERT_TRUE(manifest_reader_result.has_value())
+        << manifest_reader_result.error().message;
+
+    auto manifest_reader = std::move(manifest_reader_result.value());
+    auto read_result = manifest_reader->Entries();
+    ASSERT_TRUE(read_result.has_value()) << read_result.error().message;
+    ASSERT_EQ(read_result.value().size(), expected_entries.size());
+    ASSERT_EQ(read_result.value(), expected_entries);
+  }
+
+  std::shared_ptr<::arrow::fs::LocalFileSystem> local_fs_;
+  std::shared_ptr<FileIO> file_io_;
+};
+
+class ManifestReaderV1Test : public ManifestReaderTestBase {
+ protected:
+  std::vector<ManifestEntry> PreparePartitionedTestData() {
     std::vector<ManifestEntry> manifest_entries;
     std::string test_dir_prefix = "/tmp/db/db/iceberg_test/data/";
     std::vector<std::string> paths = {
@@ -95,39 +133,22 @@ class ManifestReaderV1Test : public TempFileTestBase {
     }
     return manifest_entries;
   }
-
-  std::shared_ptr<::arrow::fs::LocalFileSystem> local_fs_;
-  std::shared_ptr<FileIO> file_io_;
 };
 
-TEST_F(ManifestReaderV1Test, V1PartitionedBasicTest) {
+TEST_F(ManifestReaderV1Test, PartitionedTest) {
   iceberg::SchemaField partition_field(1000, "order_ts_hour", 
iceberg::int32(), true);
   auto partition_schema =
       std::make_shared<Schema>(std::vector<SchemaField>({partition_field}));
-  std::string path = 
GetResourcePath("56357cd7-391f-4df8-aa24-e7e667da8870-m4.avro");
-  auto manifest_reader_result =
-      ManifestReader::MakeReader(path, file_io_, partition_schema);
-  ASSERT_EQ(manifest_reader_result.has_value(), true)
-      << manifest_reader_result.error().message;
-  auto manifest_reader = std::move(manifest_reader_result.value());
-  auto read_result = manifest_reader->Entries();
-  ASSERT_EQ(read_result.has_value(), true) << read_result.error().message;
-
-  auto expected_entries = PrepareV1ManifestEntries();
-  ASSERT_EQ(read_result.value(), expected_entries);
+  auto expected_entries = PreparePartitionedTestData();
+  TestManifestReading("56357cd7-391f-4df8-aa24-e7e667da8870-m4.avro", 
expected_entries,
+                      partition_schema);
 }
 
-class ManifestReaderV2Test : public TempFileTestBase {
+class ManifestReaderV2Test : public ManifestReaderTestBase {
  protected:
-  static void SetUpTestSuite() { avro::RegisterAll(); }
-
-  void SetUp() override {
-    TempFileTestBase::SetUp();
-    local_fs_ = std::make_shared<::arrow::fs::LocalFileSystem>();
-    file_io_ = 
std::make_shared<iceberg::arrow::ArrowFileSystemFileIO>(local_fs_);
-  }
-
-  std::vector<ManifestEntry> PrepareV2NonPartitionedManifestEntries() {
+  std::vector<ManifestEntry> CreateV2TestData(
+      std::optional<int64_t> sequence_number = std::nullopt,
+      std::optional<int32_t> partition_spec_id = std::nullopt) {
     std::vector<ManifestEntry> manifest_entries;
     std::string test_dir_prefix = 
"/tmp/db/db/v2_manifest_non_partitioned/data/";
 
@@ -149,51 +170,64 @@ class ManifestReaderV2Test : public TempFileTestBase {
          {3, {'d', 'a', 't', 'a', '_', 'c', 'o', 'n', 't', 'e', 'n', 't', '_', 
'4'}},
          {4, {0x14, 0xae, 0x47, 0xe1, 0x7a, 0x8c, 0x7c, 0x40}}}};
 
+    DataFile data_file{.file_path = test_dir_prefix + paths[0],
+                       .file_format = FileFormatType::kParquet,
+                       .record_count = record_counts[0],
+                       .file_size_in_bytes = file_sizes[0],
+                       .column_sizes = {{1, 56}, {2, 73}, {3, 66}, {4, 67}},
+                       .value_counts = {{1, 4}, {2, 4}, {3, 4}, {4, 4}},
+                       .null_value_counts = {{1, 0}, {2, 0}, {3, 0}, {4, 0}},
+                       .nan_value_counts = {{4, 0}},
+                       .lower_bounds = lower_bounds[0],
+                       .upper_bounds = upper_bounds[0],
+                       .key_metadata = {},
+                       .split_offsets = {4},
+                       .equality_ids = {},
+                       .sort_order_id = 0,
+                       .first_row_id = std::nullopt,
+                       .referenced_data_file = std::nullopt,
+                       .content_offset = std::nullopt,
+                       .content_size_in_bytes = std::nullopt};
+
+    if (partition_spec_id.has_value()) {
+      data_file.partition_spec_id = partition_spec_id.value();
+    }
+
     manifest_entries.emplace_back(
         ManifestEntry{.status = ManifestStatus::kAdded,
                       .snapshot_id = 679879563479918846LL,
-                      .sequence_number = std::nullopt,
-                      .file_sequence_number = std::nullopt,
-                      .data_file = std::make_shared<DataFile>(
-                          DataFile{.file_path = test_dir_prefix + paths[0],
-                                   .file_format = FileFormatType::kParquet,
-                                   .record_count = record_counts[0],
-                                   .file_size_in_bytes = file_sizes[0],
-                                   .column_sizes = {{1, 56}, {2, 73}, {3, 66}, 
{4, 67}},
-                                   .value_counts = {{1, 4}, {2, 4}, {3, 4}, 
{4, 4}},
-                                   .null_value_counts = {{1, 0}, {2, 0}, {3, 
0}, {4, 0}},
-                                   .nan_value_counts = {{4, 0}},
-                                   .lower_bounds = lower_bounds[0],
-                                   .upper_bounds = upper_bounds[0],
-                                   .key_metadata = {},
-                                   .split_offsets = {4},
-                                   .equality_ids = {},
-                                   .sort_order_id = 0,
-                                   .first_row_id = std::nullopt,
-                                   .referenced_data_file = std::nullopt,
-                                   .content_offset = std::nullopt,
-                                   .content_size_in_bytes = std::nullopt})});
+                      .sequence_number = sequence_number,
+                      .file_sequence_number = sequence_number,
+                      .data_file = std::make_shared<DataFile>(data_file)});
     return manifest_entries;
   }
 
-  std::shared_ptr<::arrow::fs::LocalFileSystem> local_fs_;
-  std::shared_ptr<FileIO> file_io_;
-};
-
-TEST_F(ManifestReaderV2Test, V2NonPartitionedBasicTest) {
-  std::string path = 
GetResourcePath("2ddf1bc9-830b-4015-aced-c060df36f150-m0.avro");
+  std::vector<ManifestEntry> PrepareNonPartitionedTestData() {
+    return CreateV2TestData();
+  }
 
-  auto manifest_reader_result = ManifestReader::MakeReader(path, file_io_, 
nullptr);
-  ASSERT_EQ(manifest_reader_result.has_value(), true)
-      << manifest_reader_result.error().message;
+  std::vector<ManifestEntry> PrepareMetadataInheritanceTestData() {
+    return CreateV2TestData(/*sequence_number=*/15, /*partition_spec_id*/ 12);
+  }
+};
 
-  auto manifest_reader = std::move(manifest_reader_result.value());
-  auto read_result = manifest_reader->Entries();
-  ASSERT_EQ(read_result.has_value(), true) << read_result.error().message;
-  ASSERT_EQ(read_result.value().size(), 1);
+TEST_F(ManifestReaderV2Test, NonPartitionedTest) {
+  auto expected_entries = PrepareNonPartitionedTestData();
+  TestManifestReading("2ddf1bc9-830b-4015-aced-c060df36f150-m0.avro", 
expected_entries);
+}
 
-  auto expected_entries = PrepareV2NonPartitionedManifestEntries();
-  ASSERT_EQ(read_result.value(), expected_entries);
+TEST_F(ManifestReaderV2Test, MetadataInheritanceTest) {
+  std::string path = 
GetResourcePath("2ddf1bc9-830b-4015-aced-c060df36f150-m0.avro");
+  ManifestFile manifest_file{
+      .manifest_path = path,
+      .manifest_length = 100,
+      .partition_spec_id = 12,
+      .content = ManifestFile::Content::kData,
+      .sequence_number = 15,
+      .added_snapshot_id = 679879563479918846LL,
+  };
+  auto expected_entries = PrepareMetadataInheritanceTestData();
+  TestManifestReadingWithManifestFile(manifest_file, expected_entries);
 }
 
 }  // namespace iceberg

Reply via email to