This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git
The following commit(s) were added to refs/heads/main by this push:
new f33754de chore: remove unnecessary manifest reader/writer test cases (#377)
f33754de is described below
commit f33754de772f07001af2fe7f3cfd0cfb5e9464db
Author: Junwang Zhao <[email protected]>
AuthorDate: Mon Dec 1 17:54:55 2025 +0800
chore: remove unnecessary manifest reader/writer test cases (#377)
The manifest_writer_versions_test.cc and manifest_list_versions_test.cc
should be adequate to cover the manifest module. Remove
manifest_list_reader_writer_test.cc and manifest_reader_writer_test.cc
along with their related .avro files, so we don't need to ship binary
files any more.
---
dev/release/rat_exclude_files.txt | 2 -
src/iceberg/test/CMakeLists.txt | 2 -
.../test/manifest_list_reader_writer_test.cc | 391 ---------------------
src/iceberg/test/manifest_reader_writer_test.cc | 333 ------------------
.../2ddf1bc9-830b-4015-aced-c060df36f150-m0.avro | Bin 7207 -> 0 bytes
.../56357cd7-391f-4df8-aa24-e7e667da8870-m4.avro | Bin 7533 -> 0 bytes
...399-1-ccb6dbcb-0611-48da-be68-bd506ea63188.avro | Bin 4598 -> 0 bytes
...835-1-aeffe099-3bac-4011-bc17-5875210d8dc0.avro | Bin 3951 -> 0 bytes
...213-1-2bccd69e-d642-4816-bba0-261cd9bd0d93.avro | Bin 4628 -> 0 bytes
...621-1-5d690750-8fb4-4cd1-8ae7-85c7b39abe14.avro | Bin 4003 -> 0 bytes
...098-1-eafd2972-f58e-4185-9237-6378f564787e.avro | Bin 3986 -> 0 bytes
11 files changed, 728 deletions(-)
diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt
index a20621cf..b8a1c098 100644
--- a/dev/release/rat_exclude_files.txt
+++ b/dev/release/rat_exclude_files.txt
@@ -23,7 +23,5 @@ dist/**
.git/**
requirements.txt
test/resources/**
-*.avro
*.json
-*.parquet
src/iceberg/util/murmurhash3_internal.*
diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt
index 21ccd4d6..af3dfa0f 100644
--- a/src/iceberg/test/CMakeLists.txt
+++ b/src/iceberg/test/CMakeLists.txt
@@ -126,9 +126,7 @@ if(ICEBERG_BUILD_BUNDLE)
avro_test.cc
avro_schema_test.cc
avro_stream_test.cc
- manifest_list_reader_writer_test.cc
manifest_list_versions_test.cc
- manifest_reader_writer_test.cc
manifest_writer_versions_test.cc
test_common.cc)
diff --git a/src/iceberg/test/manifest_list_reader_writer_test.cc b/src/iceberg/test/manifest_list_reader_writer_test.cc
deleted file mode 100644
index ee6c7d9f..00000000
--- a/src/iceberg/test/manifest_list_reader_writer_test.cc
+++ /dev/null
@@ -1,391 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <arrow/filesystem/localfs.h>
-#include <gtest/gtest.h>
-
-#include "iceberg/arrow/arrow_fs_file_io_internal.h"
-#include "iceberg/avro/avro_register.h"
-#include "iceberg/expression/literal.h"
-#include "iceberg/manifest/manifest_list.h"
-#include "iceberg/manifest/manifest_reader.h"
-#include "iceberg/manifest/manifest_writer.h"
-#include "iceberg/test/matchers.h"
-#include "iceberg/test/temp_file_test_base.h"
-#include "iceberg/test/test_common.h"
-
-namespace iceberg {
-
-class ManifestListReaderWriterTestBase : public TempFileTestBase {
- protected:
- static void SetUpTestSuite() { avro::RegisterAll(); }
-
- void SetUp() override {
- TempFileTestBase::SetUp();
- local_fs_ = std::make_shared<::arrow::fs::LocalFileSystem>();
- file_io_ =
std::make_shared<iceberg::arrow::ArrowFileSystemFileIO>(local_fs_);
- }
-
- void TestManifestListReading(const std::string& resource_name,
- const std::vector<ManifestFile>&
expected_manifest_list) {
- std::string path = GetResourcePath(resource_name);
- TestManifestListReadingByPath(path, expected_manifest_list);
- }
-
- void TestManifestListReadingByPath(
- const std::string& path, const std::vector<ManifestFile>&
expected_manifest_list) {
- auto manifest_reader_result = ManifestListReader::Make(path, file_io_);
- ASSERT_EQ(manifest_reader_result.has_value(), true);
-
- auto manifest_reader = std::move(manifest_reader_result.value());
- auto read_result = manifest_reader->Files();
- ASSERT_EQ(read_result.has_value(), true);
- ASSERT_EQ(read_result.value().size(), expected_manifest_list.size());
- ASSERT_EQ(read_result.value(), expected_manifest_list);
- }
-
- void TestNonPartitionedManifests(const std::vector<ManifestFile>&
manifest_files) {
- for (const auto& manifest : manifest_files) {
- ASSERT_EQ(manifest.partition_spec_id, 0);
- ASSERT_TRUE(manifest.partitions.empty());
- ASSERT_EQ(manifest.content, ManifestContent::kData);
- }
- }
-
- std::shared_ptr<::arrow::fs::LocalFileSystem> local_fs_;
- std::shared_ptr<FileIO> file_io_;
-};
-
-class ManifestListReaderWriterV1Test : public ManifestListReaderWriterTestBase
{
- protected:
- std::vector<ManifestFile> PreparePartitionedTestData() {
- std::vector<std::string> paths = {
- "iceberg-warehouse/db/v1_partition_test/metadata/"
- "eafd2972-f58e-4185-9237-6378f564787e-m1.avro",
- "iceberg-warehouse/db/v1_partition_test/metadata/"
- "eafd2972-f58e-4185-9237-6378f564787e-m0.avro"};
- std::vector<int64_t> file_size = {6185, 6113};
- std::vector<int64_t> snapshot_id = {7532614258660258098,
7532614258660258098};
-
- return {
- {.manifest_path = paths[0],
- .manifest_length = file_size[0],
- .partition_spec_id = 0,
- .added_snapshot_id = snapshot_id[0],
- .added_files_count = 4,
- .existing_files_count = 0,
- .deleted_files_count = 0,
- .added_rows_count = 6,
- .existing_rows_count = 0,
- .deleted_rows_count = 0,
- .partitions = {{.contains_null = false,
- .contains_nan = false,
- .lower_bound =
Literal::String("2022-02-22").Serialize().value(),
- .upper_bound =
-
Literal::String("2022-2-23").Serialize().value()}}},
-
- {.manifest_path = paths[1],
- .manifest_length = file_size[1],
- .partition_spec_id = 0,
- .added_snapshot_id = snapshot_id[1],
- .added_files_count = 0,
- .existing_files_count = 0,
- .deleted_files_count = 2,
- .added_rows_count = 0,
- .existing_rows_count = 0,
- .deleted_rows_count = 6,
- .partitions = {
- {.contains_null = false,
- .contains_nan = false,
- .lower_bound = Literal::String("2022-2-22").Serialize().value(),
- .upper_bound =
Literal::String("2022-2-23").Serialize().value()}}}};
- }
-
- std::vector<ManifestFile> PrepareComplexTypeTestData() {
- std::vector<std::string> paths = {
- "iceberg-warehouse/db/v1_type_test/metadata/"
- "aeffe099-3bac-4011-bc17-5875210d8dc0-m1.avro",
- "iceberg-warehouse/db/v1_type_test/metadata/"
- "aeffe099-3bac-4011-bc17-5875210d8dc0-m0.avro"};
- std::vector<int64_t> file_size = {6498, 6513};
- std::vector<int64_t> snapshot_id = {4134160420377642835,
4134160420377642835};
-
- return {{.manifest_path = paths[0],
- .manifest_length = file_size[0],
- .partition_spec_id = 0,
- .added_snapshot_id = snapshot_id[0],
- .added_files_count = 1,
- .existing_files_count = 0,
- .deleted_files_count = 0,
- .added_rows_count = 2,
- .existing_rows_count = 0,
- .deleted_rows_count = 0},
-
- {.manifest_path = paths[1],
- .manifest_length = file_size[1],
- .partition_spec_id = 0,
- .added_snapshot_id = snapshot_id[1],
- .added_files_count = 0,
- .existing_files_count = 0,
- .deleted_files_count = 1,
- .added_rows_count = 0,
- .existing_rows_count = 0,
- .deleted_rows_count = 3}};
- }
-
- std::vector<ManifestFile> PrepareComplexPartitionedTestData() {
- std::vector<std::string> paths = {
- "iceberg-warehouse/db2/v1_complex_partition_test/metadata/"
- "5d690750-8fb4-4cd1-8ae7-85c7b39abe14-m0.avro",
- "iceberg-warehouse/db2/v1_complex_partition_test/metadata/"
- "5d690750-8fb4-4cd1-8ae7-85c7b39abe14-m1.avro"};
- std::vector<int64_t> file_size = {6402, 6318};
- std::vector<int64_t> snapshot_id = {7522296285847100621,
7522296285847100621};
-
- std::vector<std::vector<std::uint8_t>> lower_bounds = {
- {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x32},
- {0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x32},
- {0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}};
-
- std::vector<std::vector<std::uint8_t>> upper_bounds = {
- {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x34},
- {0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
- {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x33},
- {0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}};
-
- return {{.manifest_path = paths[0],
- .manifest_length = file_size[0],
- .partition_spec_id = 0,
- .added_snapshot_id = snapshot_id[0],
- .added_files_count = 0,
- .existing_files_count = 3,
- .deleted_files_count = 1,
- .added_rows_count = 0,
- .existing_rows_count = 4,
- .deleted_rows_count = 2,
- .partitions = {{.contains_null = false,
- .contains_nan = false,
- .lower_bound = lower_bounds[0],
- .upper_bound = upper_bounds[0]},
- {.contains_null = false,
- .contains_nan = false,
- .lower_bound = lower_bounds[1],
- .upper_bound = upper_bounds[1]}}},
-
- {.manifest_path = paths[1],
- .manifest_length = file_size[1],
- .partition_spec_id = 0,
- .added_snapshot_id = snapshot_id[1],
- .added_files_count = 0,
- .existing_files_count = 1,
- .deleted_files_count = 1,
- .added_rows_count = 0,
- .existing_rows_count = 1,
- .deleted_rows_count = 1,
- .partitions = {{.contains_null = false,
- .contains_nan = false,
- .lower_bound = lower_bounds[2],
- .upper_bound = upper_bounds[2]},
- {.contains_null = false,
- .contains_nan = false,
- .lower_bound = lower_bounds[3],
- .upper_bound = upper_bounds[3]}}}};
- }
-
- void TestWriteManifestList(const std::string& manifest_list_path,
- const std::vector<ManifestFile>& manifest_files) {
- auto result = ManifestListWriter::MakeV1Writer(1, 0, manifest_list_path,
file_io_);
- ASSERT_TRUE(result.has_value()) << result.error().message;
- auto writer = std::move(result.value());
- auto status = writer->AddAll(manifest_files);
- EXPECT_THAT(status, IsOk());
- status = writer->Close();
- EXPECT_THAT(status, IsOk());
- }
-};
-
-class ManifestListReaderWriterV2Test : public ManifestListReaderWriterTestBase
{
- protected:
- std::vector<ManifestFile> PreparePartitionedTestData() {
- std::vector<ManifestFile> manifest_files;
- std::string test_dir_prefix = "/tmp/db/db/iceberg_test/metadata/";
- std::vector<std::string> paths =
{"2bccd69e-d642-4816-bba0-261cd9bd0d93-m0.avro",
-
"9b6ffacd-ef10-4abf-a89c-01c733696796-m0.avro",
-
"2541e6b5-4923-4bd5-886d-72c6f7228400-m0.avro",
-
"3118c801-d2e0-4df6-8c7a-7d4eaade32f8-m0.avro"};
- std::vector<int64_t> file_size = {7433, 7431, 7433, 7431};
- std::vector<int64_t> snapshot_id = {7412193043800610213,
5485972788975780755,
- 1679468743751242972,
1579605567338877265};
- std::vector<std::vector<uint8_t>> bounds = {{'x', ';', 0x07, 0x00},
- {'(', 0x19, 0x07, 0x00},
- {0xd0, 0xd4, 0x06, 0x00},
- {0xb8, 0xd4, 0x06, 0x00}};
- for (int i = 0; i < 4; ++i) {
- ManifestFile manifest_file;
- manifest_file.manifest_path = test_dir_prefix + paths[i];
- manifest_file.manifest_length = file_size[i];
- manifest_file.partition_spec_id = 0;
- manifest_file.content = ManifestContent::kData;
- manifest_file.sequence_number = 4 - i;
- manifest_file.min_sequence_number = 4 - i;
- manifest_file.added_snapshot_id = snapshot_id[i];
- manifest_file.added_files_count = 1;
- manifest_file.existing_files_count = 0;
- manifest_file.deleted_files_count = 0;
- manifest_file.added_rows_count = 1;
- manifest_file.existing_rows_count = 0;
- manifest_file.deleted_rows_count = 0;
- PartitionFieldSummary partition;
- partition.contains_null = false;
- partition.contains_nan = false;
- partition.lower_bound = bounds[i];
- partition.upper_bound = bounds[i];
- manifest_file.partitions.emplace_back(partition);
- manifest_files.emplace_back(manifest_file);
- }
- return manifest_files;
- }
-
- std::vector<ManifestFile> PrepareNonPartitionedTestData() {
- std::vector<ManifestFile> manifest_files;
- std::string test_dir_prefix =
"/tmp/db/db/v2_non_partitioned_test/metadata/";
-
- std::vector<std::string> paths =
{"ccb6dbcb-0611-48da-be68-bd506ea63188-m0.avro",
-
"b89a10c9-a7a8-4526-99c5-5587a4ea7527-m0.avro",
-
"a74d20fa-c800-4706-9ddb-66be15a5ecb0-m0.avro",
-
"ae7d5fce-7245-4335-9b57-bc598c595c84-m0.avro"};
-
- std::vector<int64_t> file_size = {7169, 7170, 7169, 7170};
-
- std::vector<int64_t> snapshot_id = {251167482216575399,
4248697313956014690,
- 281757490425433194,
5521202581490753283};
-
- for (int i = 0; i < 4; ++i) {
- ManifestFile manifest_file;
- manifest_file.manifest_path = test_dir_prefix + paths[i];
- manifest_file.manifest_length = file_size[i];
- manifest_file.partition_spec_id = 0;
- manifest_file.content = ManifestContent::kData;
- manifest_file.sequence_number = 4 - i;
- manifest_file.min_sequence_number = 4 - i;
- manifest_file.added_snapshot_id = snapshot_id[i];
- manifest_file.added_files_count = 1;
- manifest_file.existing_files_count = 0;
- manifest_file.deleted_files_count = 0;
- manifest_file.added_rows_count = 1;
- manifest_file.existing_rows_count = 0;
- manifest_file.deleted_rows_count = 0;
- // Note: no partitions for non-partitioned test
- manifest_files.emplace_back(manifest_file);
- }
- return manifest_files;
- }
-
- void TestWriteManifestList(const std::string& manifest_list_path,
- const std::vector<ManifestFile>& manifest_files) {
- auto result = ManifestListWriter::MakeV2Writer(1, 0, 4,
manifest_list_path, file_io_);
- ASSERT_TRUE(result.has_value()) << result.error().message;
- auto writer = std::move(result.value());
- auto status = writer->AddAll(manifest_files);
- EXPECT_THAT(status, IsOk());
- status = writer->Close();
- EXPECT_THAT(status, IsOk());
- }
-};
-
-// V1 Tests
-TEST_F(ManifestListReaderWriterV1Test, PartitionedTest) {
- auto expected_manifest_list = PreparePartitionedTestData();
- TestManifestListReading(
- "snap-7532614258660258098-1-eafd2972-f58e-4185-9237-6378f564787e.avro",
- expected_manifest_list);
-}
-
-TEST_F(ManifestListReaderWriterV1Test, ComplexTypeTest) {
- auto expected_manifest_list = PrepareComplexTypeTestData();
- TestManifestListReading(
- "snap-4134160420377642835-1-aeffe099-3bac-4011-bc17-5875210d8dc0.avro",
- expected_manifest_list);
-}
-
-TEST_F(ManifestListReaderWriterV1Test, ComplexPartitionedTest) {
- auto expected_manifest_list = PrepareComplexPartitionedTestData();
- TestManifestListReading(
- "snap-7522296285847100621-1-5d690750-8fb4-4cd1-8ae7-85c7b39abe14.avro",
- expected_manifest_list);
-}
-
-TEST_F(ManifestListReaderWriterV1Test, WritePartitionedTest) {
- auto expected_manifest_list = PreparePartitionedTestData();
- auto write_manifest_list_path = CreateNewTempFilePath();
- TestWriteManifestList(write_manifest_list_path, expected_manifest_list);
- TestManifestListReadingByPath(write_manifest_list_path,
expected_manifest_list);
-}
-
-TEST_F(ManifestListReaderWriterV1Test, WriteComplexTypeTest) {
- auto expected_manifest_list = PrepareComplexTypeTestData();
- auto write_manifest_list_path = CreateNewTempFilePath();
- TestWriteManifestList(write_manifest_list_path, expected_manifest_list);
- TestManifestListReadingByPath(write_manifest_list_path,
expected_manifest_list);
-}
-
-TEST_F(ManifestListReaderWriterV1Test, WriteComplexPartitionedTest) {
- auto expected_manifest_list = PrepareComplexPartitionedTestData();
- auto write_manifest_list_path = CreateNewTempFilePath();
- TestWriteManifestList(write_manifest_list_path, expected_manifest_list);
- TestManifestListReadingByPath(write_manifest_list_path,
expected_manifest_list);
-}
-
-// V2 Tests
-TEST_F(ManifestListReaderWriterV2Test, PartitionedTest) {
- auto expected_manifest_list = PreparePartitionedTestData();
- TestManifestListReading(
- "snap-7412193043800610213-1-2bccd69e-d642-4816-bba0-261cd9bd0d93.avro",
- expected_manifest_list);
-}
-
-TEST_F(ManifestListReaderWriterV2Test, NonPartitionedTest) {
- auto expected_manifest_list = PrepareNonPartitionedTestData();
- TestManifestListReading(
- "snap-251167482216575399-1-ccb6dbcb-0611-48da-be68-bd506ea63188.avro",
- expected_manifest_list);
-
- // Additional verification: ensure all manifests are truly non-partitioned
- TestNonPartitionedManifests(expected_manifest_list);
-}
-
-TEST_F(ManifestListReaderWriterV2Test, WritePartitionedTest) {
- auto expected_manifest_list = PreparePartitionedTestData();
- auto write_manifest_list_path = CreateNewTempFilePath();
- TestWriteManifestList(write_manifest_list_path, expected_manifest_list);
- TestManifestListReadingByPath(write_manifest_list_path,
expected_manifest_list);
-}
-
-TEST_F(ManifestListReaderWriterV2Test, WriteNonPartitionedTest) {
- auto expected_manifest_list = PrepareNonPartitionedTestData();
- auto write_manifest_list_path = CreateNewTempFilePath();
- TestWriteManifestList(write_manifest_list_path, expected_manifest_list);
- TestManifestListReadingByPath(write_manifest_list_path,
expected_manifest_list);
-
- // Additional verification: ensure all manifests are truly non-partitioned
- TestNonPartitionedManifests(expected_manifest_list);
-}
-
-} // namespace iceberg
diff --git a/src/iceberg/test/manifest_reader_writer_test.cc b/src/iceberg/test/manifest_reader_writer_test.cc
deleted file mode 100644
index b3daaf94..00000000
--- a/src/iceberg/test/manifest_reader_writer_test.cc
+++ /dev/null
@@ -1,333 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <cstddef>
-
-#include <arrow/filesystem/localfs.h>
-#include <gtest/gtest.h>
-
-#include "iceberg/arrow/arrow_fs_file_io_internal.h"
-#include "iceberg/avro/avro_register.h"
-#include "iceberg/manifest/manifest_entry.h"
-#include "iceberg/manifest/manifest_list.h"
-#include "iceberg/manifest/manifest_reader.h"
-#include "iceberg/manifest/manifest_writer.h"
-#include "iceberg/partition_spec.h"
-#include "iceberg/schema.h"
-#include "iceberg/test/matchers.h"
-#include "iceberg/test/temp_file_test_base.h"
-#include "iceberg/test/test_common.h"
-#include "iceberg/transform.h"
-#include "iceberg/type.h"
-
-namespace iceberg {
-
-class ManifestReaderWriterTestBase : public TempFileTestBase {
- protected:
- static void SetUpTestSuite() { avro::RegisterAll(); }
-
- void SetUp() override {
- TempFileTestBase::SetUp();
- local_fs_ = std::make_shared<::arrow::fs::LocalFileSystem>();
- file_io_ =
std::make_shared<iceberg::arrow::ArrowFileSystemFileIO>(local_fs_);
- }
-
- void TestManifestReading(const std::string& resource_name,
- const std::vector<ManifestEntry>& expected_entries,
- std::shared_ptr<Schema> partition_schema = nullptr,
- std::optional<int64_t> snapshot_id = std::nullopt) {
- std::string path = GetResourcePath(resource_name);
- TestManifestReadingByPath(path, expected_entries, partition_schema,
snapshot_id);
- }
-
- void TestManifestReadingByPath(const std::string& path,
- const std::vector<ManifestEntry>&
expected_entries,
- std::shared_ptr<Schema> partition_schema =
nullptr,
- std::optional<int64_t> snapshot_id =
std::nullopt) {
- auto manifest_reader_result = ManifestReader::Make(path, file_io_,
partition_schema);
- ASSERT_TRUE(manifest_reader_result.has_value())
- << manifest_reader_result.error().message;
-
- auto manifest_reader = std::move(manifest_reader_result.value());
- auto read_result = manifest_reader->Entries();
- ASSERT_TRUE(read_result.has_value()) << read_result.error().message;
- ASSERT_EQ(read_result.value().size(), expected_entries.size());
- ASSERT_EQ(read_result.value(), expected_entries);
- }
-
- void TestManifestReadingWithManifestFile(
- const ManifestFile& manifest_file,
- const std::vector<ManifestEntry>& expected_entries,
- std::shared_ptr<Schema> partition_schema = nullptr) {
- auto manifest_reader_result =
- ManifestReader::Make(manifest_file, file_io_, partition_schema);
- ASSERT_TRUE(manifest_reader_result.has_value())
- << manifest_reader_result.error().message;
-
- auto manifest_reader = std::move(manifest_reader_result.value());
- auto read_result = manifest_reader->Entries();
- ASSERT_TRUE(read_result.has_value()) << read_result.error().message;
- ASSERT_EQ(read_result.value().size(), expected_entries.size());
- ASSERT_EQ(read_result.value(), expected_entries);
- }
-
- std::shared_ptr<::arrow::fs::LocalFileSystem> local_fs_;
- std::shared_ptr<FileIO> file_io_;
-};
-
-class ManifestV1Test : public ManifestReaderWriterTestBase {
- protected:
- std::vector<ManifestEntry> PreparePartitionedTestData() {
- std::vector<ManifestEntry> manifest_entries;
- std::string test_dir_prefix = "/tmp/db/db/iceberg_test/data/";
- std::vector<std::string> paths = {
- "order_ts_hour=2021-01-27-00/"
- "00000-2-d5ae78b7-4449-45ec-adb7-c0e9c0bdb714-0-00001.parquet",
- "order_ts_hour=2024-01-27-00/"
- "00000-2-d5ae78b7-4449-45ec-adb7-c0e9c0bdb714-0-00002.parquet",
- "order_ts_hour=2023-01-26-00/"
- "00000-2-d5ae78b7-4449-45ec-adb7-c0e9c0bdb714-0-00003.parquet",
- "order_ts_hour=2021-01-26-00/"
- "00000-2-d5ae78b7-4449-45ec-adb7-c0e9c0bdb714-0-00004.parquet"};
- std::vector<int64_t> partitions = {447696, 473976, 465192, 447672};
-
- // Note: The precision and scale for decimal literals are chosen
arbitrarily here,
- // since the lower and upper bounds for decimal values are stored as
unscaled int128_t
- // values in manifest files.
- std::vector<std::map<int32_t, std::vector<uint8_t>>> bounds = {
- {{1, Literal::Long(1234).Serialize().value()},
- {2, Literal::Long(5678).Serialize().value()},
- {3, Literal::Decimal(4834, 10, 2).Serialize().value()},
- {4, Literal::Timestamp(1611706223000000LL).Serialize().value()}},
-
- {{1, Literal::Long(1234).Serialize().value()},
- {2, Literal::Long(5678).Serialize().value()},
- {3, Literal::Decimal(4835, 10, 2).Serialize().value()},
- {4, Literal::Timestamp(1706314223000000LL).Serialize().value()}},
-
- {{1, Literal::Long(123).Serialize().value()},
- {2, Literal::Long(456).Serialize().value()},
- {3, Literal::Decimal(3618, 10, 2).Serialize().value()},
- {4, Literal::Timestamp(1674691823000000LL).Serialize().value()}},
-
- {{1, Literal::Long(123).Serialize().value()},
- {2, Literal::Long(456).Serialize().value()},
- {3, Literal::Decimal(3617, 10, 2).Serialize().value()},
- {4, Literal::Timestamp(1611619823000000LL).Serialize().value()}},
- };
-
- for (int i = 0; i < 4; ++i) {
- ManifestEntry entry;
- entry.status = ManifestStatus::kAdded;
- entry.snapshot_id = 6387266376565973956;
- entry.data_file = std::make_shared<DataFile>();
- entry.data_file->file_path = test_dir_prefix + paths[i];
- entry.data_file->file_format = FileFormatType::kParquet;
- entry.data_file->partition.AddValue(Literal::Int(partitions[i]));
- entry.data_file->record_count = 1;
- entry.data_file->file_size_in_bytes = 1375;
- entry.data_file->column_sizes = {{1, 49}, {2, 49}, {3, 49}, {4, 49}};
- entry.data_file->value_counts = {{1, 1}, {2, 1}, {3, 1}, {4, 1}};
- entry.data_file->null_value_counts = {{1, 0}, {2, 0}, {3, 0}, {4, 0}};
- entry.data_file->split_offsets = {4};
- entry.data_file->sort_order_id = 0;
- entry.data_file->upper_bounds = bounds[i];
- entry.data_file->lower_bounds = bounds[i];
- manifest_entries.emplace_back(entry);
- }
- return manifest_entries;
- }
-
- void TestWriteManifest(int64_t snapshot_id, const std::string&
manifest_list_path,
- std::shared_ptr<PartitionSpec> partition_spec,
- const std::vector<ManifestEntry>& manifest_entries,
- std::shared_ptr<Schema> table_schema) {
- auto result =
- ManifestWriter::MakeV1Writer(snapshot_id, manifest_list_path, file_io_,
- std::move(partition_spec),
std::move(table_schema));
- ASSERT_TRUE(result.has_value()) << result.error().message;
- auto writer = std::move(result.value());
- auto status = writer->AddAll(manifest_entries);
- EXPECT_THAT(status, IsOk());
- status = writer->Close();
- EXPECT_THAT(status, IsOk());
- }
-};
-
-TEST_F(ManifestV1Test, ReadPartitionedTest) {
- // TODO(xiao.dong) we need to add more cases for different partition types
- SchemaField partition_field(1000, "order_ts_hour", int32(), true);
- auto partition_schema =
- std::make_shared<Schema>(std::vector<SchemaField>({partition_field}));
- auto expected_entries = PreparePartitionedTestData();
- TestManifestReading("56357cd7-391f-4df8-aa24-e7e667da8870-m4.avro",
expected_entries,
- partition_schema);
-}
-
-TEST_F(ManifestV1Test, WritePartitionedTest) {
- SchemaField table_field(1, "order_ts_hour_source", int32(), true);
- SchemaField partition_field(1000, "order_ts_hour", int32(), true);
- auto table_schema =
std::make_shared<Schema>(std::vector<SchemaField>({table_field}));
- auto partition_schema =
- std::make_shared<Schema>(std::vector<SchemaField>({partition_field}));
- auto identity_transform = Transform::Identity();
- std::vector<PartitionField> fields{
- PartitionField(1, 1000, "order_ts_hour", identity_transform)};
- ICEBERG_UNWRAP_OR_FAIL(std::shared_ptr<PartitionSpec> partition_spec,
- PartitionSpec::Make(*table_schema, 1, fields, false));
-
- auto expected_entries = PreparePartitionedTestData();
- auto write_manifest_path = CreateNewTempFilePath();
- TestWriteManifest(1, write_manifest_path, partition_spec, expected_entries,
- table_schema);
- TestManifestReadingByPath(write_manifest_path, expected_entries,
partition_schema, 1);
-}
-
-class ManifestV2Test : public ManifestReaderWriterTestBase {
- protected:
- std::vector<ManifestEntry> CreateV2TestData(
- std::optional<int64_t> sequence_number = std::nullopt,
- std::optional<int32_t> partition_spec_id = std::nullopt) {
- std::vector<ManifestEntry> manifest_entries;
- std::string test_dir_prefix =
"/tmp/db/db/v2_manifest_non_partitioned/data/";
-
- std::vector<std::string> paths = {
- "00000-0-b0f98903-6d21-45fd-9e0b-afbd4963e365-0-00001.parquet"};
-
- std::vector<int64_t> file_sizes = {1344};
- std::vector<int64_t> record_counts = {4};
-
- std::vector<std::map<int32_t, std::vector<uint8_t>>> lower_bounds = {
- {{1, Literal::Long(1).Serialize().value()},
- {2, Literal::String("record_four").Serialize().value()},
- {3, Literal::String("data_content_1").Serialize().value()},
- {4, Literal::Double(123.45).Serialize().value()}}};
-
- std::vector<std::map<int32_t, std::vector<uint8_t>>> upper_bounds = {
- {{1, Literal::Long(4).Serialize().value()},
- {2, Literal::String("record_two").Serialize().value()},
- {3, Literal::String("data_content_4").Serialize().value()},
- {4, Literal::Double(456.78).Serialize().value()}}};
-
- DataFile data_file{.file_path = test_dir_prefix + paths[0],
- .file_format = FileFormatType::kParquet,
- .record_count = record_counts[0],
- .file_size_in_bytes = file_sizes[0],
- .column_sizes = {{1, 56}, {2, 73}, {3, 66}, {4, 67}},
- .value_counts = {{1, 4}, {2, 4}, {3, 4}, {4, 4}},
- .null_value_counts = {{1, 0}, {2, 0}, {3, 0}, {4, 0}},
- .nan_value_counts = {{4, 0}},
- .lower_bounds = lower_bounds[0],
- .upper_bounds = upper_bounds[0],
- .key_metadata = {},
- .split_offsets = {4},
- .equality_ids = {},
- .sort_order_id = 0,
- .first_row_id = std::nullopt,
- .referenced_data_file = std::nullopt,
- .content_offset = std::nullopt,
- .content_size_in_bytes = std::nullopt};
-
- if (partition_spec_id.has_value()) {
- data_file.partition_spec_id = partition_spec_id.value();
- }
-
- manifest_entries.emplace_back(
- ManifestEntry{.status = ManifestStatus::kAdded,
- .snapshot_id = 679879563479918846LL,
- .sequence_number = sequence_number,
- .file_sequence_number = sequence_number,
- .data_file = std::make_shared<DataFile>(data_file)});
- return manifest_entries;
- }
-
- std::vector<ManifestEntry> PrepareNonPartitionedTestData() {
- return CreateV2TestData();
- }
-
- std::vector<ManifestEntry> PrepareMetadataInheritanceTestData() {
- return CreateV2TestData(/*sequence_number=*/15, /*partition_spec_id*/ 12);
- }
-
- void TestWriteManifest(int64_t snapshot_id, const std::string&
manifest_list_path,
- std::shared_ptr<PartitionSpec> partition_spec,
- const std::vector<ManifestEntry>& manifest_entries,
- std::shared_ptr<Schema> table_schema) {
- auto result = ManifestWriter::MakeV2Writer(
- snapshot_id, manifest_list_path, file_io_, std::move(partition_spec),
- std::move(table_schema), ManifestContent::kData);
- ASSERT_TRUE(result.has_value()) << result.error().message;
- auto writer = std::move(result.value());
- auto status = writer->AddAll(manifest_entries);
- EXPECT_THAT(status, IsOk());
- status = writer->Close();
- EXPECT_THAT(status, IsOk());
- }
-};
-
-TEST_F(ManifestV2Test, ReadNonPartitionedTest) {
- auto expected_entries = PrepareNonPartitionedTestData();
- TestManifestReading("2ddf1bc9-830b-4015-aced-c060df36f150-m0.avro",
expected_entries);
-}
-
-TEST_F(ManifestV2Test, ReadMetadataInheritanceTest) {
- std::string path =
GetResourcePath("2ddf1bc9-830b-4015-aced-c060df36f150-m0.avro");
- ManifestFile manifest_file{
- .manifest_path = path,
- .manifest_length = 100,
- .partition_spec_id = 12,
- .content = ManifestContent::kData,
- .sequence_number = 15,
- .added_snapshot_id = 679879563479918846LL,
- };
- auto expected_entries = PrepareMetadataInheritanceTestData();
- TestManifestReadingWithManifestFile(manifest_file, expected_entries);
-}
-
-TEST_F(ManifestV2Test, WriteNonPartitionedTest) {
- SchemaField table_field(1, "order_ts_hour_source", int32(), true);
- SchemaField partition_field(1000, "order_ts_hour", int32(), true);
- auto table_schema =
std::make_shared<Schema>(std::vector<SchemaField>({table_field}));
- auto expected_entries = PrepareNonPartitionedTestData();
- auto write_manifest_path = CreateNewTempFilePath();
- TestWriteManifest(679879563479918846LL, write_manifest_path,
- PartitionSpec::Unpartitioned(), expected_entries,
table_schema);
- TestManifestReadingByPath(write_manifest_path, expected_entries);
-}
-
-TEST_F(ManifestV2Test, WriteInheritancePartitionedTest) {
- SchemaField table_field(1, "order_ts_hour_source", int32(), true);
- SchemaField partition_field(1000, "order_ts_hour", int32(), true);
- auto table_schema =
std::make_shared<Schema>(std::vector<SchemaField>({table_field}));
- auto expected_entries = PrepareMetadataInheritanceTestData();
- auto write_manifest_path = CreateNewTempFilePath();
- TestWriteManifest(679879563479918846LL, write_manifest_path,
- PartitionSpec::Unpartitioned(), expected_entries,
table_schema);
- ManifestFile manifest_file{
- .manifest_path = write_manifest_path,
- .manifest_length = 100,
- .partition_spec_id = 12,
- .content = ManifestContent::kData,
- .sequence_number = 15,
- .added_snapshot_id = 679879563479918846LL,
- };
- TestManifestReadingWithManifestFile(manifest_file, expected_entries);
-}
-
-} // namespace iceberg
diff --git a/src/iceberg/test/resources/2ddf1bc9-830b-4015-aced-c060df36f150-m0.avro b/src/iceberg/test/resources/2ddf1bc9-830b-4015-aced-c060df36f150-m0.avro
deleted file mode 100644
index f8e6c1c4..00000000
Binary files a/src/iceberg/test/resources/2ddf1bc9-830b-4015-aced-c060df36f150-m0.avro and /dev/null differ
diff --git a/src/iceberg/test/resources/56357cd7-391f-4df8-aa24-e7e667da8870-m4.avro b/src/iceberg/test/resources/56357cd7-391f-4df8-aa24-e7e667da8870-m4.avro
deleted file mode 100644
index c671dfdf..00000000
Binary files a/src/iceberg/test/resources/56357cd7-391f-4df8-aa24-e7e667da8870-m4.avro and /dev/null differ
diff --git a/src/iceberg/test/resources/snap-251167482216575399-1-ccb6dbcb-0611-48da-be68-bd506ea63188.avro b/src/iceberg/test/resources/snap-251167482216575399-1-ccb6dbcb-0611-48da-be68-bd506ea63188.avro
deleted file mode 100644
index d8621c6b..00000000
Binary files a/src/iceberg/test/resources/snap-251167482216575399-1-ccb6dbcb-0611-48da-be68-bd506ea63188.avro and /dev/null differ
diff --git a/src/iceberg/test/resources/snap-4134160420377642835-1-aeffe099-3bac-4011-bc17-5875210d8dc0.avro b/src/iceberg/test/resources/snap-4134160420377642835-1-aeffe099-3bac-4011-bc17-5875210d8dc0.avro
deleted file mode 100644
index 29584b8c..00000000
Binary files a/src/iceberg/test/resources/snap-4134160420377642835-1-aeffe099-3bac-4011-bc17-5875210d8dc0.avro and /dev/null differ
diff --git a/src/iceberg/test/resources/snap-7412193043800610213-1-2bccd69e-d642-4816-bba0-261cd9bd0d93.avro b/src/iceberg/test/resources/snap-7412193043800610213-1-2bccd69e-d642-4816-bba0-261cd9bd0d93.avro
deleted file mode 100644
index c2299391..00000000
Binary files a/src/iceberg/test/resources/snap-7412193043800610213-1-2bccd69e-d642-4816-bba0-261cd9bd0d93.avro and /dev/null differ
diff --git a/src/iceberg/test/resources/snap-7522296285847100621-1-5d690750-8fb4-4cd1-8ae7-85c7b39abe14.avro b/src/iceberg/test/resources/snap-7522296285847100621-1-5d690750-8fb4-4cd1-8ae7-85c7b39abe14.avro
deleted file mode 100644
index 590edc1f..00000000
Binary files a/src/iceberg/test/resources/snap-7522296285847100621-1-5d690750-8fb4-4cd1-8ae7-85c7b39abe14.avro and /dev/null differ
diff --git a/src/iceberg/test/resources/snap-7532614258660258098-1-eafd2972-f58e-4185-9237-6378f564787e.avro b/src/iceberg/test/resources/snap-7532614258660258098-1-eafd2972-f58e-4185-9237-6378f564787e.avro
deleted file mode 100644
index 4fba684a..00000000
Binary files a/src/iceberg/test/resources/snap-7532614258660258098-1-eafd2972-f58e-4185-9237-6378f564787e.avro and /dev/null differ