This is an automated email from the ASF dual-hosted git repository. fokko pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git
The following commit(s) were added to refs/heads/main by this push: new c9d7867 feat: implement manifest entry metadata inheritance (#178) c9d7867 is described below commit c9d7867fa9aa5891f37259446a0cd56097020682 Author: Li Feiyang <lifeiy...@zju.edu.cn> AuthorDate: Wed Aug 27 04:55:29 2025 +0800 feat: implement manifest entry metadata inheritance (#178) --- src/iceberg/CMakeLists.txt | 1 + src/iceberg/inheritable_metadata.cc | 110 ++++++++++++++++++++++ src/iceberg/inheritable_metadata.h | 111 ++++++++++++++++++++++ src/iceberg/manifest_reader.cc | 30 +++++- src/iceberg/manifest_reader.h | 16 +++- src/iceberg/manifest_reader_internal.cc | 8 +- src/iceberg/manifest_reader_internal.h | 10 +- src/iceberg/table_scan.cc | 8 +- test/manifest_list_reader_test.cc | 2 +- test/manifest_reader_test.cc | 160 +++++++++++++++++++------------- 10 files changed, 378 insertions(+), 78 deletions(-) diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index 316b02c..b509c25 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -24,6 +24,7 @@ set(ICEBERG_SOURCES expression/literal.cc file_reader.cc file_writer.cc + inheritable_metadata.cc json_internal.cc manifest_entry.cc manifest_list.cc diff --git a/src/iceberg/inheritable_metadata.cc b/src/iceberg/inheritable_metadata.cc new file mode 100644 index 0000000..ae09208 --- /dev/null +++ b/src/iceberg/inheritable_metadata.cc @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/inheritable_metadata.h" + +#include <cassert> +#include <utility> + +#include <iceberg/result.h> + +#include "iceberg/manifest_entry.h" +#include "iceberg/manifest_list.h" +#include "iceberg/snapshot.h" + +namespace iceberg { + +BaseInheritableMetadata::BaseInheritableMetadata(int32_t spec_id, int64_t snapshot_id, + int64_t sequence_number, + std::string manifest_location) + : spec_id_(spec_id), + snapshot_id_(snapshot_id), + sequence_number_(sequence_number), + manifest_location_(std::move(manifest_location)) {} + +Status BaseInheritableMetadata::Apply(ManifestEntry& entry) { + if (!entry.snapshot_id.has_value()) { + entry.snapshot_id = snapshot_id_; + } + + // In v1 metadata, the data sequence number is not persisted and can be safely defaulted + // to 0. + // In v2 metadata, the data sequence number should be inherited iff the entry status + // is ADDED. + if (!entry.sequence_number.has_value() && + (sequence_number_ == 0 || entry.status == ManifestStatus::kAdded)) { + entry.sequence_number = sequence_number_; + } + + // In v1 metadata, the file sequence number is not persisted and can be safely defaulted + // to 0. + // In v2 metadata, the file sequence number should be inherited iff the entry status + // is ADDED. + if (!entry.file_sequence_number.has_value() && + (sequence_number_ == 0 || entry.status == ManifestStatus::kAdded)) { + entry.file_sequence_number = sequence_number_; + } + + if (entry.data_file) { + entry.data_file->partition_spec_id = spec_id_; + } else { + return InvalidManifest("Manifest entry has no data file"); + } + + return {}; +} + +Status EmptyInheritableMetadata::Apply(ManifestEntry& entry) { + if (!entry.snapshot_id.has_value()) { + return InvalidManifest( + "Entries must have explicit snapshot ids if inherited metadata is empty"); + } + return {}; +} + +CopyInheritableMetadata::CopyInheritableMetadata(int64_t snapshot_id) + : snapshot_id_(snapshot_id) {} + +Status CopyInheritableMetadata::Apply(ManifestEntry& entry) { + entry.snapshot_id = snapshot_id_; + return {}; +} + +Result<std::unique_ptr<InheritableMetadata>> InheritableMetadataFactory::Empty() { + return std::make_unique<EmptyInheritableMetadata>(); +} + +Result<std::unique_ptr<InheritableMetadata>> InheritableMetadataFactory::FromManifest( + const ManifestFile& manifest) { + // Validate that the manifest has a snapshot ID assigned + if (manifest.added_snapshot_id == Snapshot::kInvalidSnapshotId) { + return InvalidManifest("Manifest file {} has no snapshot ID", manifest.manifest_path); + } + + return std::make_unique<BaseInheritableMetadata>( + manifest.partition_spec_id, manifest.added_snapshot_id, manifest.sequence_number, + manifest.manifest_path); +} + +Result<std::unique_ptr<InheritableMetadata>> InheritableMetadataFactory::ForCopy( + int64_t snapshot_id) { + return std::make_unique<CopyInheritableMetadata>(snapshot_id); +} + +} // namespace iceberg diff --git a/src/iceberg/inheritable_metadata.h b/src/iceberg/inheritable_metadata.h new file mode 100644 index 0000000..f06693a --- /dev/null +++ b/src/iceberg/inheritable_metadata.h @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/inheritable_metadata.h +/// Metadata inheritance system for manifest entries. + +#include <cstdint> +#include <memory> +#include <string> + +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { + +/// \brief Interface for applying inheritable metadata to manifest entries. +/// +/// When manifest entries have null values for certain fields (snapshot id, +/// data sequence number, file sequence number), these values should be inherited +/// from the manifest file. This interface provides a way to apply such inheritance rules. +class ICEBERG_EXPORT InheritableMetadata { + public: + virtual ~InheritableMetadata() = default; + + /// \brief Apply inheritable metadata to a manifest entry. + /// \param entry The manifest entry to modify. + /// \return Status indicating success or failure. + virtual Status Apply(ManifestEntry& entry) = 0; +}; + +/// \brief Base implementation of InheritableMetadata that handles standard inheritance +/// rules. +class ICEBERG_EXPORT BaseInheritableMetadata : public InheritableMetadata { + public: + /// \brief Constructor for base inheritable metadata. + /// \param spec_id Partition spec ID from the manifest. + /// \param snapshot_id Snapshot ID from the manifest. + /// \param sequence_number Sequence number from the manifest. + /// \param manifest_location Path to the manifest file. + BaseInheritableMetadata(int32_t spec_id, int64_t snapshot_id, int64_t sequence_number, + std::string manifest_location); + + Status Apply(ManifestEntry& entry) override; + + private: + int32_t spec_id_; + int64_t snapshot_id_; + int64_t sequence_number_; + std::string manifest_location_; +}; + +/// \brief Empty implementation that applies no inheritance. +class ICEBERG_EXPORT EmptyInheritableMetadata : public InheritableMetadata { + public: + Status Apply(ManifestEntry& entry) override; +}; + +/// \brief Metadata inheritance for copying manifests before commit. +class ICEBERG_EXPORT CopyInheritableMetadata : public InheritableMetadata { + public: + /// \brief Constructor for copy metadata. + /// \param snapshot_id The snapshot ID to use for copying. + explicit CopyInheritableMetadata(int64_t snapshot_id); + + Status Apply(ManifestEntry& entry) override; + + private: + int64_t snapshot_id_; +}; + +/// \brief Factory for creating InheritableMetadata instances. +class ICEBERG_EXPORT InheritableMetadataFactory { + public: + /// \brief Create an empty metadata instance that applies no inheritance. + static Result<std::unique_ptr<InheritableMetadata>> Empty(); + + /// \brief Create metadata instance from a manifest file. + /// \param manifest The manifest file to extract metadata from. + /// \return Inheritable metadata based on the manifest. + static Result<std::unique_ptr<InheritableMetadata>> FromManifest( + const ManifestFile& manifest); + + /// \brief Create metadata instance for rewriting a manifest before commit. + /// \param snapshot_id The snapshot ID for the copy operation. + /// \return Inheritable metadata for copying. + static Result<std::unique_ptr<InheritableMetadata>> ForCopy(int64_t snapshot_id); + + private: + InheritableMetadataFactory() = default; +}; + +} // namespace iceberg diff --git a/src/iceberg/manifest_reader.cc b/src/iceberg/manifest_reader.cc index 208263d..42606fb 100644 --- a/src/iceberg/manifest_reader.cc +++ b/src/iceberg/manifest_reader.cc @@ -23,11 +23,33 @@ #include "iceberg/manifest_list.h" #include "iceberg/manifest_reader_internal.h" #include "iceberg/schema.h" +#include "iceberg/schema_internal.h" #include "iceberg/util/macros.h" namespace iceberg { -Result<std::unique_ptr<ManifestReader>> ManifestReader::MakeReader( +Result<std::unique_ptr<ManifestReader>> ManifestReader::Make( + const ManifestFile& manifest, std::shared_ptr<FileIO> file_io, + std::shared_ptr<Schema> partition_schema) { + auto manifest_entry_schema = ManifestEntry::TypeFromPartitionType(partition_schema); + std::shared_ptr<Schema> schema = + FromStructType(std::move(*manifest_entry_schema), std::nullopt); + + ICEBERG_ASSIGN_OR_RAISE(auto reader, + ReaderFactoryRegistry::Open(FileFormatType::kAvro, + {.path = manifest.manifest_path, + .length = manifest.manifest_length, + .io = std::move(file_io), + .projection = schema})); + // Create inheritable metadata for this manifest + ICEBERG_ASSIGN_OR_RAISE(auto inheritable_metadata, + InheritableMetadataFactory::FromManifest(manifest)); + + return std::make_unique<ManifestReaderImpl>(std::move(reader), std::move(schema), + std::move(inheritable_metadata)); +} + +Result<std::unique_ptr<ManifestReader>> ManifestReader::Make( std::string_view manifest_location, std::shared_ptr<FileIO> file_io, std::shared_ptr<Schema> partition_schema) { auto manifest_entry_schema = ManifestEntry::TypeFromPartitionType(partition_schema); @@ -39,10 +61,12 @@ Result<std::unique_ptr<ManifestReader>> ManifestReader::MakeReader( {.path = std::string(manifest_location), .io = std::move(file_io), .projection = schema})); - return std::make_unique<ManifestReaderImpl>(std::move(reader), std::move(schema)); + ICEBERG_ASSIGN_OR_RAISE(auto inheritable_metadata, InheritableMetadataFactory::Empty()); + return std::make_unique<ManifestReaderImpl>(std::move(reader), std::move(schema), + std::move(inheritable_metadata)); } -Result<std::unique_ptr<ManifestListReader>> ManifestListReader::MakeReader( +Result<std::unique_ptr<ManifestListReader>> ManifestListReader::Make( std::string_view manifest_list_location, std::shared_ptr<FileIO> file_io) { std::vector<SchemaField> fields(ManifestFile::Type().fields().begin(), ManifestFile::Type().fields().end()); diff --git a/src/iceberg/manifest_reader.h b/src/iceberg/manifest_reader.h index 5d231de..7dc0a20 100644 --- a/src/iceberg/manifest_reader.h +++ b/src/iceberg/manifest_reader.h @@ -25,8 +25,8 @@ #include <memory> #include <vector> -#include "iceberg/file_reader.h" #include "iceberg/iceberg_export.h" +#include "iceberg/result.h" #include "iceberg/type_fwd.h" namespace iceberg { @@ -37,11 +37,21 @@ class ICEBERG_EXPORT ManifestReader { virtual ~ManifestReader() = default; virtual Result<std::vector<ManifestEntry>> Entries() const = 0; + /// \brief Creates a reader for a manifest file. + /// \param manifest A ManifestFile object containing metadata about the manifest. + /// \param file_io File IO implementation to use. + /// \param partition_schema Schema for the partition. + /// \return A Result containing the reader or an error. + static Result<std::unique_ptr<ManifestReader>> Make( + const ManifestFile& manifest, std::shared_ptr<FileIO> file_io, + std::shared_ptr<Schema> partition_schema); + /// \brief Creates a reader for a manifest file. /// \param manifest_location Path to the manifest file. /// \param file_io File IO implementation to use. + /// \param partition_schema Schema for the partition. /// \return A Result containing the reader or an error. - static Result<std::unique_ptr<ManifestReader>> MakeReader( + static Result<std::unique_ptr<ManifestReader>> Make( std::string_view manifest_location, std::shared_ptr<FileIO> file_io, std::shared_ptr<Schema> partition_schema); }; @@ -56,7 +66,7 @@ class ICEBERG_EXPORT ManifestListReader { /// \param manifest_list_location Path to the manifest list file. /// \param file_io File IO implementation to use. /// \return A Result containing the reader or an error. - static Result<std::unique_ptr<ManifestListReader>> MakeReader( + static Result<std::unique_ptr<ManifestListReader>> Make( std::string_view manifest_list_location, std::shared_ptr<FileIO> file_io); }; diff --git a/src/iceberg/manifest_reader_internal.cc b/src/iceberg/manifest_reader_internal.cc index e3b9bec..0ff743e 100644 --- a/src/iceberg/manifest_reader_internal.cc +++ b/src/iceberg/manifest_reader_internal.cc @@ -19,8 +19,6 @@ #include "manifest_reader_internal.h" -#include <array> - #include <nanoarrow/nanoarrow.h> #include "iceberg/arrow_c_data_guard_internal.h" @@ -543,6 +541,12 @@ Result<std::vector<ManifestEntry>> ManifestReaderImpl::Entries() const { break; } } + + // Apply inheritance to all entries + for (auto& entry : manifest_entries) { + ICEBERG_RETURN_UNEXPECTED(inheritable_metadata_->Apply(entry)); + } + return manifest_entries; } diff --git a/src/iceberg/manifest_reader_internal.h b/src/iceberg/manifest_reader_internal.h index a367ede..3144d07 100644 --- a/src/iceberg/manifest_reader_internal.h +++ b/src/iceberg/manifest_reader_internal.h @@ -22,6 +22,8 @@ /// \file iceberg/internal/manifest_reader_internal.h /// Reader implementation for manifest list files and manifest files. +#include "iceberg/file_reader.h" +#include "iceberg/inheritable_metadata.h" #include "iceberg/manifest_reader.h" namespace iceberg { @@ -30,14 +32,18 @@ namespace iceberg { class ManifestReaderImpl : public ManifestReader { public: explicit ManifestReaderImpl(std::unique_ptr<Reader> reader, - std::shared_ptr<Schema> schema) - : schema_(std::move(schema)), reader_(std::move(reader)) {} + std::shared_ptr<Schema> schema, + std::unique_ptr<InheritableMetadata> inheritable_metadata) + : schema_(std::move(schema)), + reader_(std::move(reader)), + inheritable_metadata_(std::move(inheritable_metadata)) {} Result<std::vector<ManifestEntry>> Entries() const override; private: std::shared_ptr<Schema> schema_; std::unique_ptr<Reader> reader_; + std::unique_ptr<InheritableMetadata> inheritable_metadata_; }; /// \brief Read manifest files from a manifest list file. diff --git a/src/iceberg/table_scan.cc b/src/iceberg/table_scan.cc index 43deb2a..a7edd5d 100644 --- a/src/iceberg/table_scan.cc +++ b/src/iceberg/table_scan.cc @@ -147,7 +147,7 @@ DataTableScan::DataTableScan(TableScanContext context, std::shared_ptr<FileIO> f Result<std::vector<std::shared_ptr<FileScanTask>>> DataTableScan::PlanFiles() const { ICEBERG_ASSIGN_OR_RAISE( auto manifest_list_reader, - ManifestListReader::MakeReader(context_.snapshot->manifest_list, file_io_)); + ManifestListReader::Make(context_.snapshot->manifest_list, file_io_)); ICEBERG_ASSIGN_OR_RAISE(auto manifest_files, manifest_list_reader->Files()); std::vector<std::shared_ptr<FileScanTask>> tasks; @@ -155,9 +155,9 @@ Result<std::vector<std::shared_ptr<FileScanTask>>> DataTableScan::PlanFiles() co auto partition_schema = partition_spec->schema(); for (const auto& manifest_file : manifest_files) { - ICEBERG_ASSIGN_OR_RAISE(auto manifest_reader, - ManifestReader::MakeReader(manifest_file.manifest_path, - file_io_, partition_schema)); + ICEBERG_ASSIGN_OR_RAISE( + auto manifest_reader, + ManifestReader::Make(manifest_file, file_io_, partition_schema)); ICEBERG_ASSIGN_OR_RAISE(auto manifests, manifest_reader->Entries()); // TODO(gty404): filter manifests using partition spec and filter expression diff --git a/test/manifest_list_reader_test.cc b/test/manifest_list_reader_test.cc index 09984f3..a3c08c3 100644 --- a/test/manifest_list_reader_test.cc +++ b/test/manifest_list_reader_test.cc @@ -43,7 +43,7 @@ class ManifestListReaderTestBase : public TempFileTestBase { void TestManifestListReading(const std::string& resource_name, const std::vector<ManifestFile>& expected_manifest_list) { std::string path = GetResourcePath(resource_name); - auto manifest_reader_result = ManifestListReader::MakeReader(path, file_io_); + auto manifest_reader_result = ManifestListReader::Make(path, file_io_); ASSERT_EQ(manifest_reader_result.has_value(), true); auto manifest_reader = std::move(manifest_reader_result.value()); diff --git a/test/manifest_reader_test.cc b/test/manifest_reader_test.cc index 55fbdd8..db703c1 100644 --- a/test/manifest_reader_test.cc +++ b/test/manifest_reader_test.cc @@ -27,13 +27,14 @@ #include "iceberg/arrow/arrow_fs_file_io_internal.h" #include "iceberg/avro/avro_register.h" #include "iceberg/manifest_entry.h" +#include "iceberg/manifest_list.h" #include "iceberg/schema.h" #include "temp_file_test_base.h" #include "test_common.h" namespace iceberg { -class ManifestReaderV1Test : public TempFileTestBase { +class ManifestReaderTestBase : public TempFileTestBase { protected: static void SetUpTestSuite() { avro::RegisterAll(); } @@ -43,7 +44,44 @@ class ManifestReaderV1Test : public TempFileTestBase { file_io_ = std::make_shared<iceberg::arrow::ArrowFileSystemFileIO>(local_fs_); } - std::vector<ManifestEntry> PrepareV1ManifestEntries() { + void TestManifestReading(const std::string& resource_name, + const std::vector<ManifestEntry>& expected_entries, + std::shared_ptr<Schema> partition_schema = nullptr) { + std::string path = GetResourcePath(resource_name); + auto manifest_reader_result = ManifestReader::Make(path, file_io_, partition_schema); + ASSERT_TRUE(manifest_reader_result.has_value()) + << manifest_reader_result.error().message; + + auto manifest_reader = std::move(manifest_reader_result.value()); + auto read_result = manifest_reader->Entries(); + ASSERT_TRUE(read_result.has_value()) << read_result.error().message; + ASSERT_EQ(read_result.value().size(), expected_entries.size()); + ASSERT_EQ(read_result.value(), expected_entries); + } + + void TestManifestReadingWithManifestFile( + const ManifestFile& manifest_file, + const std::vector<ManifestEntry>& expected_entries, + std::shared_ptr<Schema> partition_schema = nullptr) { + auto manifest_reader_result = + ManifestReader::Make(manifest_file, file_io_, partition_schema); + ASSERT_TRUE(manifest_reader_result.has_value()) + << manifest_reader_result.error().message; + + auto manifest_reader = std::move(manifest_reader_result.value()); + auto read_result = manifest_reader->Entries(); + ASSERT_TRUE(read_result.has_value()) << read_result.error().message; + ASSERT_EQ(read_result.value().size(), expected_entries.size()); + ASSERT_EQ(read_result.value(), expected_entries); + } + + std::shared_ptr<::arrow::fs::LocalFileSystem> local_fs_; + std::shared_ptr<FileIO> file_io_; +}; + +class ManifestReaderV1Test : public ManifestReaderTestBase { + protected: + std::vector<ManifestEntry> PreparePartitionedTestData() { std::vector<ManifestEntry> manifest_entries; std::string test_dir_prefix = "/tmp/db/db/iceberg_test/data/"; std::vector<std::string> paths = { @@ -95,39 +133,22 @@ class ManifestReaderV1Test : public TempFileTestBase { } return manifest_entries; } - - std::shared_ptr<::arrow::fs::LocalFileSystem> local_fs_; - std::shared_ptr<FileIO> file_io_; }; -TEST_F(ManifestReaderV1Test, V1PartitionedBasicTest) { +TEST_F(ManifestReaderV1Test, PartitionedTest) { iceberg::SchemaField partition_field(1000, "order_ts_hour", iceberg::int32(), true); auto partition_schema = std::make_shared<Schema>(std::vector<SchemaField>({partition_field})); - std::string path = GetResourcePath("56357cd7-391f-4df8-aa24-e7e667da8870-m4.avro"); - auto manifest_reader_result = - ManifestReader::MakeReader(path, file_io_, partition_schema); - ASSERT_EQ(manifest_reader_result.has_value(), true) - << manifest_reader_result.error().message; - auto manifest_reader = std::move(manifest_reader_result.value()); - auto read_result = manifest_reader->Entries(); - ASSERT_EQ(read_result.has_value(), true) << read_result.error().message; - - auto expected_entries = PrepareV1ManifestEntries(); - ASSERT_EQ(read_result.value(), expected_entries); + auto expected_entries = PreparePartitionedTestData(); + TestManifestReading("56357cd7-391f-4df8-aa24-e7e667da8870-m4.avro", expected_entries, + partition_schema); } -class ManifestReaderV2Test : public TempFileTestBase { +class ManifestReaderV2Test : public ManifestReaderTestBase { protected: - static void SetUpTestSuite() { avro::RegisterAll(); } - - void SetUp() override { - TempFileTestBase::SetUp(); - local_fs_ = std::make_shared<::arrow::fs::LocalFileSystem>(); - file_io_ = std::make_shared<iceberg::arrow::ArrowFileSystemFileIO>(local_fs_); - } - - std::vector<ManifestEntry> PrepareV2NonPartitionedManifestEntries() { + std::vector<ManifestEntry> CreateV2TestData( + std::optional<int64_t> sequence_number = std::nullopt, + std::optional<int32_t> partition_spec_id = std::nullopt) { std::vector<ManifestEntry> manifest_entries; std::string test_dir_prefix = "/tmp/db/db/v2_manifest_non_partitioned/data/"; @@ -149,51 +170,64 @@ class ManifestReaderV2Test : public TempFileTestBase { {3, {'d', 'a', 't', 'a', '_', 'c', 'o', 'n', 't', 'e', 'n', 't', '_', '4'}}, {4, {0x14, 0xae, 0x47, 0xe1, 0x7a, 0x8c, 0x7c, 0x40}}}}; + DataFile data_file{.file_path = test_dir_prefix + paths[0], + .file_format = FileFormatType::kParquet, + .record_count = record_counts[0], + .file_size_in_bytes = file_sizes[0], + .column_sizes = {{1, 56}, {2, 73}, {3, 66}, {4, 67}}, + .value_counts = {{1, 4}, {2, 4}, {3, 4}, {4, 4}}, + .null_value_counts = {{1, 0}, {2, 0}, {3, 0}, {4, 0}}, + .nan_value_counts = {{4, 0}}, + .lower_bounds = lower_bounds[0], + .upper_bounds = upper_bounds[0], + .key_metadata = {}, + .split_offsets = {4}, + .equality_ids = {}, + .sort_order_id = 0, + .first_row_id = std::nullopt, + .referenced_data_file = std::nullopt, + .content_offset = std::nullopt, + .content_size_in_bytes = std::nullopt}; + + if (partition_spec_id.has_value()) { + data_file.partition_spec_id = partition_spec_id.value(); + } + manifest_entries.emplace_back( ManifestEntry{.status = ManifestStatus::kAdded, .snapshot_id = 679879563479918846LL, - .sequence_number = std::nullopt, - .file_sequence_number = std::nullopt, - .data_file = std::make_shared<DataFile>( - DataFile{.file_path = test_dir_prefix + paths[0], - .file_format = FileFormatType::kParquet, - .record_count = record_counts[0], - .file_size_in_bytes = file_sizes[0], - .column_sizes = {{1, 56}, {2, 73}, {3, 66}, {4, 67}}, - .value_counts = {{1, 4}, {2, 4}, {3, 4}, {4, 4}}, - .null_value_counts = {{1, 0}, {2, 0}, {3, 0}, {4, 0}}, - .nan_value_counts = {{4, 0}}, - .lower_bounds = lower_bounds[0], - .upper_bounds = upper_bounds[0], - .key_metadata = {}, - .split_offsets = {4}, - .equality_ids = {}, - .sort_order_id = 0, - .first_row_id = std::nullopt, - .referenced_data_file = std::nullopt, - .content_offset = std::nullopt, - .content_size_in_bytes = std::nullopt})}); + .sequence_number = sequence_number, + .file_sequence_number = sequence_number, + .data_file = std::make_shared<DataFile>(data_file)}); return manifest_entries; } - std::shared_ptr<::arrow::fs::LocalFileSystem> local_fs_; - std::shared_ptr<FileIO> file_io_; -}; - -TEST_F(ManifestReaderV2Test, V2NonPartitionedBasicTest) { - std::string path = GetResourcePath("2ddf1bc9-830b-4015-aced-c060df36f150-m0.avro"); + std::vector<ManifestEntry> PrepareNonPartitionedTestData() { + return CreateV2TestData(); + } - auto manifest_reader_result = ManifestReader::MakeReader(path, file_io_, nullptr); - ASSERT_EQ(manifest_reader_result.has_value(), true) - << manifest_reader_result.error().message; + std::vector<ManifestEntry> PrepareMetadataInheritanceTestData() { + return CreateV2TestData(/*sequence_number=*/15, /*partition_spec_id*/ 12); + } +}; - auto manifest_reader = std::move(manifest_reader_result.value()); - auto read_result = manifest_reader->Entries(); - ASSERT_EQ(read_result.has_value(), true) << read_result.error().message; - ASSERT_EQ(read_result.value().size(), 1); +TEST_F(ManifestReaderV2Test, NonPartitionedTest) { + auto expected_entries = PrepareNonPartitionedTestData(); + TestManifestReading("2ddf1bc9-830b-4015-aced-c060df36f150-m0.avro", expected_entries); +} - auto expected_entries = PrepareV2NonPartitionedManifestEntries(); - ASSERT_EQ(read_result.value(), expected_entries); +TEST_F(ManifestReaderV2Test, MetadataInheritanceTest) { + std::string path = GetResourcePath("2ddf1bc9-830b-4015-aced-c060df36f150-m0.avro"); + ManifestFile manifest_file{ + .manifest_path = path, + .manifest_length = 100, + .partition_spec_id = 12, + .content = ManifestFile::Content::kData, + .sequence_number = 15, + .added_snapshot_id = 679879563479918846LL, + }; + auto expected_entries = PrepareMetadataInheritanceTestData(); + TestManifestReadingWithManifestFile(manifest_file, expected_entries); } } // namespace iceberg