This is an automated email from the ASF dual-hosted git repository.
xuanwo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git
The following commit(s) were added to refs/heads/main by this push:
new 257b1ad feat: implement schema selection and projection methods (#207)
257b1ad is described below
commit 257b1adbfbb5fe2d5d500bbb5be2f2167fc3dedb
Author: chao liu <[email protected]>
AuthorDate: Tue Sep 23 17:52:57 2025 +0800
feat: implement schema selection and projection methods (#207)
- Added select and project methods to the Schema class for creating
projection schemas based on specified field names or IDs.
- Introduced PruneColumnVisitor to handle the logic for selecting and
projecting fields, including support for nested structures.
---------
Co-authored-by: nullccxsy <[email protected]>
---
src/iceberg/schema.cc | 145 +++++++++++++
src/iceberg/schema.h | 34 +++-
src/iceberg/schema_internal.cc | 5 +
src/iceberg/schema_internal.h | 2 +
test/schema_test.cc | 453 ++++++++++++++++++++++++++++++++++++++++-
5 files changed, 631 insertions(+), 8 deletions(-)
diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc
index 1df20c6..2ab2f7e 100644
--- a/src/iceberg/schema.cc
+++ b/src/iceberg/schema.cc
@@ -22,6 +22,7 @@
#include <format>
#include <functional>
+#include "iceberg/schema_internal.h"
#include "iceberg/type.h"
#include "iceberg/util/formatter.h" // IWYU pragma: keep
#include "iceberg/util/macros.h"
@@ -260,4 +261,148 @@ void NameToIdVisitor::Finish() {
}
}
+/// \brief Type visitor that keeps only the fields named by a set of field IDs.
+///
+/// The visitor walks a type tree and rebuilds it without the unselected fields.
+/// The `select_full_types` flag decides how a selected nested field is treated:
+///  - true: a selected field is kept whole, together with all of its sub-fields;
+///  - false: a selected nested field is still descended into, so it survives only
+///    through the sub-fields that are themselves kept.
+///
+/// A null pointer result means that nothing below the visited node was kept. When
+/// nothing had to be removed, the original type pointer is returned unchanged.
+///
+/// \note It returns an error when projection is not successful.
+class PruneColumnVisitor {
+ public:
+  /// \param selected_ids Field IDs to keep. Stored by reference, so the set must
+  /// outlive the visitor.
+  /// \param select_full_types Whether a selected nested field keeps all sub-fields.
+  PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
+                     bool select_full_types)
+      : selected_ids_(selected_ids), select_full_types_(select_full_types) {}
+
+  /// \brief Dispatch to the overload matching the nested kind of `type`.
+  ///
+  /// Any other type id has nothing to descend into and yields a null pointer.
+  Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<Type>& type) const {
+    const auto kind = type->type_id();
+    if (kind == TypeId::kStruct) {
+      return Visit(internal::checked_pointer_cast<StructType>(type));
+    }
+    if (kind == TypeId::kList) {
+      return Visit(internal::checked_pointer_cast<ListType>(type));
+    }
+    if (kind == TypeId::kMap) {
+      return Visit(internal::checked_pointer_cast<MapType>(type));
+    }
+    return nullptr;
+  }
+
+  /// \brief Compute the pruned type of a single field (null when it is dropped).
+  Result<std::shared_ptr<Type>> Visit(const SchemaField& field) const {
+    const bool is_selected = selected_ids_.contains(field.field_id());
+    // A selected field is taken whole in full-type mode or when it is primitive.
+    if (is_selected && (select_full_types_ || field.type()->is_primitive())) {
+      return field.type();
+    }
+    // Unselected fields, and selected nested fields in leaf-only mode, are kept
+    // only through whatever survives beneath them.
+    return Visit(field.type());
+  }
+
+  /// \brief Clone `field` with its type replaced by `type`.
+  static SchemaField MakeField(const SchemaField& field, std::shared_ptr<Type> type) {
+    return SchemaField{field.field_id(), std::string(field.name()), std::move(type),
+                       field.optional(), std::string(field.doc())};
+  }
+
+  /// \brief Prune a struct field by field.
+  Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<StructType>& type) const {
+    std::vector<SchemaField> kept;
+    bool untouched = true;
+    for (const auto& field : type->fields()) {
+      ICEBERG_ASSIGN_OR_RAISE(auto pruned, Visit(field));
+      if (pruned == nullptr) {
+        continue;
+      }
+      if (pruned != field.type()) {
+        untouched = false;
+      }
+      kept.emplace_back(MakeField(field, std::move(pruned)));
+    }
+
+    if (kept.empty()) {
+      return nullptr;
+    }
+    // Reuse the original struct when every field survived with its original type.
+    if (untouched && kept.size() == type->fields().size()) {
+      return type;
+    }
+    return std::make_shared<StructType>(std::move(kept));
+  }
+
+  /// \brief Prune a list through its element field.
+  Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<ListType>& type) const {
+    const auto& element = type->fields()[0];
+    ICEBERG_ASSIGN_OR_RAISE(auto pruned_element, Visit(element));
+    if (!pruned_element) {
+      return nullptr;
+    }
+    if (pruned_element == element.type()) {
+      return type;
+    }
+    return std::make_shared<ListType>(MakeField(element, std::move(pruned_element)));
+  }
+
+  /// \brief Prune a map through its key and value fields.
+  ///
+  /// A map whose value is dropped while its key is kept cannot be represented and
+  /// is reported as an InvalidArgument error.
+  Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<MapType>& type) const {
+    const auto& key = type->fields()[0];
+    const auto& value = type->fields()[1];
+    ICEBERG_ASSIGN_OR_RAISE(auto pruned_key, Visit(key));
+    ICEBERG_ASSIGN_OR_RAISE(auto pruned_value, Visit(value));
+
+    const bool key_dropped = (pruned_key == nullptr);
+    const bool value_dropped = (pruned_value == nullptr);
+    if (key_dropped && value_dropped) {
+      return nullptr;
+    }
+
+    // The original map is reused when the value is unchanged and the key is either
+    // unchanged or was not selected at all.
+    const bool key_intact = key_dropped || pruned_key == key.type();
+    if (pruned_value == value.type() && key_intact) {
+      return type;
+    }
+    if (value_dropped) {
+      return InvalidArgument("Cannot project Map without value field");
+    }
+
+    SchemaField new_key = key_dropped ? key : MakeField(key, std::move(pruned_key));
+    return std::make_shared<MapType>(std::move(new_key),
+                                     MakeField(value, std::move(pruned_value)));
+  }
+
+ private:
+  const std::unordered_set<int32_t>& selected_ids_;
+  const bool select_full_types_;
+};
+
+/// \brief Build a schema made of the named fields only.
+///
+/// A "*" entry anywhere in `names` selects every column. Names that do not resolve
+/// to a field are ignored. The returned schema carries no schema id.
+Result<std::unique_ptr<Schema>> Schema::Select(std::span<const std::string> names,
+                                               bool case_sensitive) const {
+  const std::string wildcard = "*";
+  const bool wants_all_columns = std::ranges::find(names, wildcard) != names.end();
+  if (wants_all_columns) {
+    auto all_fields = ToStructType(*this);
+    return FromStructType(std::move(*all_fields), std::nullopt);
+  }
+
+  // Resolve every requested name to its field id.
+  std::unordered_set<int32_t> selected_ids;
+  for (const auto& name : names) {
+    ICEBERG_ASSIGN_OR_RAISE(auto lookup, FindFieldByName(name, case_sensitive));
+    if (!lookup.has_value()) {
+      continue;
+    }
+    selected_ids.insert(lookup->get().field_id());
+  }
+
+  // Selecting by name keeps a chosen nested field together with its sub-fields.
+  PruneColumnVisitor visitor(selected_ids, /*select_full_types=*/true);
+  std::shared_ptr<StructType> root = ToStructType(*this);
+  ICEBERG_ASSIGN_OR_RAISE(auto pruned_type, visitor.Visit(root));
+
+  if (pruned_type == nullptr) {
+    // Nothing matched: the projection is an empty schema.
+    return std::make_unique<Schema>(std::vector<SchemaField>{}, std::nullopt);
+  }
+  if (pruned_type->type_id() != TypeId::kStruct) {
+    return InvalidSchema("Projected type must be a struct type");
+  }
+
+  auto& pruned_struct = internal::checked_cast<StructType&>(*pruned_type);
+  return FromStructType(std::move(pruned_struct), std::nullopt);
+}
+
+/// \brief Build a schema made of the fields with the given IDs only.
+///
+/// A nested field is kept only through its projected sub-fields. The returned
+/// schema carries no schema id.
+Result<std::unique_ptr<Schema>> Schema::Project(
+    const std::unordered_set<int32_t>& field_ids) const {
+  PruneColumnVisitor visitor(field_ids, /*select_full_types=*/false);
+  std::shared_ptr<StructType> root = ToStructType(*this);
+  ICEBERG_ASSIGN_OR_RAISE(auto projected_type, visitor.Visit(root));
+
+  if (projected_type == nullptr) {
+    // Nothing matched: the projection is an empty schema.
+    return std::make_unique<Schema>(std::vector<SchemaField>{}, std::nullopt);
+  }
+  if (projected_type->type_id() != TypeId::kStruct) {
+    return InvalidSchema("Projected type must be a struct type");
+  }
+
+  auto& projected_struct = internal::checked_cast<StructType&>(*projected_type);
+  return FromStructType(std::move(projected_struct), std::nullopt);
+}
+
} // namespace iceberg
diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h
index 260d9d3..81f9aa3 100644
--- a/src/iceberg/schema.h
+++ b/src/iceberg/schema.h
@@ -27,6 +27,7 @@
#include <mutex>
#include <optional>
#include <string>
+#include <unordered_set>
#include <vector>
#include "iceberg/iceberg_export.h"
@@ -53,9 +54,9 @@ class ICEBERG_EXPORT Schema : public StructType {
///
/// A schema is identified by a unique ID for the purposes of schema
/// evolution.
- [[nodiscard]] std::optional<int32_t> schema_id() const;
+ std::optional<int32_t> schema_id() const;
- [[nodiscard]] std::string ToString() const override;
+ std::string ToString() const override;
/// \brief Find the SchemaField by field name.
///
@@ -66,18 +67,37 @@ class ICEBERG_EXPORT Schema : public StructType {
/// canonical name 'm.value.x'
/// FIXME: Currently only handles ASCII lowercase conversion; extend to
support
/// non-ASCII characters (e.g., using std::towlower or ICU)
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldByName(std::string_view name, bool case_sensitive = true) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldByName(
+ std::string_view name, bool case_sensitive = true) const;
/// \brief Find the SchemaField by field id.
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldById(int32_t field_id) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldById(
+ int32_t field_id) const;
+
+ /// \brief Creates a projected schema from selected field names.
+ ///
+  /// \param names Names of the fields to select; a nested field is addressed by its
+  /// dot-separated path (e.g. "address.street").
+  /// \param case_sensitive Whether name matching is case-sensitive (default: true).
+  /// \return Projected schema containing only the selected fields.
+  /// \note Selecting a nested field by name also selects all of its sub-fields.
+ Result<std::unique_ptr<Schema>> Select(std::span<const std::string> names,
+ bool case_sensitive = true) const;
+
+ /// \brief Creates a projected schema from selected field IDs.
+ ///
+  /// \param field_ids Set of field IDs to select.
+  /// \return Projected schema containing only the specified fields.
+  /// \note Selecting the ID of a nested field alone does not keep it; a nested field
+  /// is projected only when at least one of its sub-fields is projected.
+ Result<std::unique_ptr<Schema>> Project(
+ const std::unordered_set<int32_t>& field_ids) const;
friend bool operator==(const Schema& lhs, const Schema& rhs) { return
lhs.Equals(rhs); }
private:
/// \brief Compare two schemas for equality.
- [[nodiscard]] bool Equals(const Schema& other) const;
+ bool Equals(const Schema& other) const;
Status InitIdToFieldMap() const;
Status InitNameToIdMap() const;
diff --git a/src/iceberg/schema_internal.cc b/src/iceberg/schema_internal.cc
index beb973b..e020a9b 100644
--- a/src/iceberg/schema_internal.cc
+++ b/src/iceberg/schema_internal.cc
@@ -325,4 +325,9 @@ Result<std::unique_ptr<Schema>> FromArrowSchema(const
ArrowSchema& schema,
return FromStructType(std::move(struct_type), schema_id);
}
+/// \brief Copy the fields of `schema` into a standalone StructType.
+std::unique_ptr<StructType> ToStructType(const Schema& schema) {
+  const auto& source_fields = schema.fields();
+  return std::make_unique<StructType>(
+      std::vector<SchemaField>(source_fields.begin(), source_fields.end()));
+}
+
} // namespace iceberg
diff --git a/src/iceberg/schema_internal.h b/src/iceberg/schema_internal.h
index 8b29085..5c7209d 100644
--- a/src/iceberg/schema_internal.h
+++ b/src/iceberg/schema_internal.h
@@ -53,4 +53,6 @@ Result<std::unique_ptr<Schema>> FromArrowSchema(const
ArrowSchema& schema,
std::unique_ptr<Schema> FromStructType(StructType&& struct_type,
std::optional<int32_t> schema_id);
+std::unique_ptr<StructType> ToStructType(const Schema& schema);
+
} // namespace iceberg
diff --git a/test/schema_test.cc b/test/schema_test.cc
index b01ffe9..3d10fb8 100644
--- a/test/schema_test.cc
+++ b/test/schema_test.cc
@@ -26,10 +26,24 @@
#include <gmock/gmock.h>
#include <gtest/gtest.h>
+#include "gtest/gtest.h"
+#include "iceberg/result.h"
#include "iceberg/schema_field.h"
#include "iceberg/util/formatter.h" // IWYU pragma: keep
#include "matchers.h"
+/// \brief Build a shared StructType from a list of SchemaField arguments.
+///
+/// Arguments are perfectly forwarded: rvalues are moved into the field list and
+/// lvalues are copied, so a named field passed by the caller is left intact.
+/// (Applying std::move to a forwarding reference would silently move from
+/// lvalue arguments as well.)
+template <typename... Args>
+std::shared_ptr<iceberg::StructType> MakeStructType(Args&&... args) {
+  return std::make_shared<iceberg::StructType>(
+      std::vector<iceberg::SchemaField>{std::forward<Args>(args)...});
+}
+
+/// \brief Build a Schema without a schema id from a list of SchemaField arguments.
+///
+/// Arguments are perfectly forwarded: rvalues are moved into the field list and
+/// lvalues are copied, so a named field passed by the caller is left intact.
+/// (Applying std::move to a forwarding reference would silently move from
+/// lvalue arguments as well.)
+template <typename... Args>
+std::unique_ptr<iceberg::Schema> MakeSchema(Args&&... args) {
+  return std::make_unique<iceberg::Schema>(
+      std::vector<iceberg::SchemaField>{std::forward<Args>(args)...}, std::nullopt);
+}
+
TEST(SchemaTest, Basics) {
{
iceberg::SchemaField field1(5, "foo", iceberg::int32(), true);
@@ -492,7 +506,444 @@ TEST(SchemaTest, NestedDuplicateFieldIdError) {
::testing::HasSubstr("Duplicate field id found: 1"));
}
-// Thread safety tests for Lazy Init
+namespace {
+
+// Leaf fields shared by the schema fixtures below. Each call returns a fresh copy.
+iceberg::SchemaField Id() { return iceberg::SchemaField(1, "id", iceberg::int32(), true); }
+iceberg::SchemaField Name() {
+  return iceberg::SchemaField(2, "name", iceberg::string(), false);
+}
+iceberg::SchemaField Age() {
+  return iceberg::SchemaField(3, "age", iceberg::int32(), true);
+}
+iceberg::SchemaField Email() {
+  return iceberg::SchemaField(4, "email", iceberg::string(), true);
+}
+iceberg::SchemaField Street() {
+  return iceberg::SchemaField(11, "street", iceberg::string(), true);
+}
+iceberg::SchemaField City() {
+  return iceberg::SchemaField(12, "city", iceberg::string(), true);
+}
+iceberg::SchemaField Zip() {
+  return iceberg::SchemaField(13, "zip", iceberg::int32(), true);
+}
+iceberg::SchemaField Theme() {
+  return iceberg::SchemaField(24, "theme", iceberg::string(), true);
+}
+iceberg::SchemaField Key() {
+  return iceberg::SchemaField(31, "key", iceberg::int32(), false);
+}
+iceberg::SchemaField Value() {
+  return iceberg::SchemaField(32, "value", iceberg::string(), false);
+}
+iceberg::SchemaField Element() {
+  return iceberg::SchemaField(41, "element", iceberg::string(), false);
+}
+
+// Flat schema: id, name, age, email.
+std::unique_ptr<iceberg::Schema> BasicSchema() {
+  return MakeSchema(Id(), Name(), Age(), Email());
+}
+
+// id, name, address{street, city, zip}.
+std::unique_ptr<iceberg::Schema> AddressSchema() {
+  iceberg::SchemaField address(14, "address", MakeStructType(Street(), City(), Zip()),
+                               true);
+  return MakeSchema(Id(), Name(), std::move(address));
+}
+
+// id, user{name, address{street, city}}.
+std::unique_ptr<iceberg::Schema> NestedUserSchema() {
+  iceberg::SchemaField address(16, "address", MakeStructType(Street(), City()), true);
+  iceberg::SchemaField user(17, "user", MakeStructType(Name(), std::move(address)),
+                            true);
+  return MakeSchema(Id(), std::move(user));
+}
+
+// id, user{profile{name, age}, settings{theme}}.
+std::unique_ptr<iceberg::Schema> MultiLevelSchema() {
+  iceberg::SchemaField profile(23, "profile", MakeStructType(Name(), Age()), true);
+  iceberg::SchemaField settings(25, "settings", MakeStructType(Theme()), true);
+  iceberg::SchemaField user(
+      26, "user", MakeStructType(std::move(profile), std::move(settings)), true);
+  return MakeSchema(Id(), std::move(user));
+}
+
+// id, tags: list<element>, user{name, age}.
+std::unique_ptr<iceberg::Schema> ListSchema() {
+  iceberg::SchemaField tags(42, "tags", std::make_shared<iceberg::ListType>(Element()),
+                            true);
+  iceberg::SchemaField user(45, "user", MakeStructType(Name(), Age()), true);
+  return MakeSchema(Id(), std::move(tags), std::move(user));
+}
+
+// map_field: map<key, value> with primitive key and value.
+std::unique_ptr<iceberg::Schema> MapSchema() {
+  iceberg::SchemaField map_field(
+      33, "map_field", std::make_shared<iceberg::MapType>(Key(), Value()), true);
+  return MakeSchema(std::move(map_field));
+}
+
+// list_field: list<element{name, age}>.
+std::unique_ptr<iceberg::Schema> ListWithStructElementSchema() {
+  iceberg::SchemaField element(53, "element", MakeStructType(Name(), Age()), false);
+  iceberg::SchemaField list_field(
+      54, "list_field", std::make_shared<iceberg::ListType>(std::move(element)), true);
+  return MakeSchema(std::move(list_field));
+}
+
+// list_field: list<map<key, value{name, age}>>.
+std::unique_ptr<iceberg::Schema> ListOfMapSchema() {
+  iceberg::SchemaField map_value(64, "value", MakeStructType(Name(), Age()), false);
+  iceberg::SchemaField element(
+      65, "element", std::make_shared<iceberg::MapType>(Key(), std::move(map_value)),
+      false);
+  iceberg::SchemaField list_field(
+      66, "list_field", std::make_shared<iceberg::ListType>(std::move(element)), true);
+  return MakeSchema(std::move(list_field));
+}
+
+// map_field: map<key{id, name}, value{id, name}>.
+std::unique_ptr<iceberg::Schema> ComplexMapSchema() {
+  iceberg::SchemaField key_id(71, "id", iceberg::int32(), false);
+  iceberg::SchemaField key_name(72, "name", iceberg::string(), false);
+  iceberg::SchemaField key(
+      73, "key", MakeStructType(std::move(key_id), std::move(key_name)), false);
+
+  iceberg::SchemaField value_id(74, "id", iceberg::int32(), false);
+  iceberg::SchemaField value_name(75, "name", iceberg::string(), false);
+  iceberg::SchemaField value(
+      76, "value", MakeStructType(std::move(value_id), std::move(value_name)), false);
+
+  iceberg::SchemaField map_field(
+      77, "map_field",
+      std::make_shared<iceberg::MapType>(std::move(key), std::move(value)), true);
+  return MakeSchema(std::move(map_field));
+}
+}  // namespace
+
+struct SelectTestParam {  // One parameterized case for Schema::Select.
+ std::string test_name;  // Label of the case; not read by the test body.
+ std::function<std::unique_ptr<iceberg::Schema>()> create_schema;  // Builds the input schema.
+ std::vector<std::string> select_fields;  // Names passed to Select().
+ std::function<std::unique_ptr<iceberg::Schema>()> expected_schema;  // Compared on success only.
+ bool should_succeed;  // Chooses the success or the error assertions.
+ std::string expected_error_message;  // Matched against the error on failure only.
+ bool case_sensitive = true;  // Forwarded to Select().
+};
+
+class SelectParamTest : public ::testing::TestWithParam<SelectTestParam> {};  // Fixture for the Select cases.
+
+// Runs Select() on the case's input schema and checks either the projected schema
+// or the reported error, depending on `should_succeed`.
+TEST_P(SelectParamTest, SelectFields) {
+  const auto& test_case = GetParam();
+  auto schema = test_case.create_schema();
+  auto result = schema->Select(test_case.select_fields, test_case.case_sensitive);
+
+  if (!test_case.should_succeed) {
+    ASSERT_FALSE(result.has_value());
+    ASSERT_THAT(result, iceberg::IsError(iceberg::ErrorKind::kInvalidArgument));
+    ASSERT_THAT(result, iceberg::HasErrorMessage(test_case.expected_error_message));
+    return;
+  }
+
+  ASSERT_TRUE(result.has_value());
+  ASSERT_EQ(*result.value(), *test_case.expected_schema());
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ SelectTestCases, SelectParamTest,
+ ::testing::Values(
+ SelectTestParam{.test_name = "SelectAllColumns",
+ .create_schema = []() { return BasicSchema(); },
+ .select_fields = {"*"},
+ .expected_schema = []() { return BasicSchema(); },
+ .should_succeed = true},
+
+ SelectTestParam{.test_name = "SelectSingleField",
+ .create_schema = []() { return BasicSchema(); },
+ .select_fields = {"name"},
+ .expected_schema = []() { return MakeSchema(Name()); },
+ .should_succeed = true},
+
+ SelectTestParam{
+ .test_name = "SelectMultipleFields",
+ .create_schema = []() { return BasicSchema(); },
+ .select_fields = {"id", "name", "age"},
+ .expected_schema = []() { return MakeSchema(Id(), Name(), Age());
},
+ .should_succeed = true},
+
+ SelectTestParam{.test_name = "SelectNonExistentField",
+ .create_schema = []() { return BasicSchema(); },
+ .select_fields = {"nonexistent"},
+ .expected_schema = []() { return MakeSchema(); },
+ .should_succeed = true},
+
+ SelectTestParam{.test_name = "SelectCaseSensitive",
+ .create_schema = []() { return BasicSchema(); },
+ .select_fields = {"Name"}, // case-sensitive
+ .expected_schema = []() { return MakeSchema(); },
+ .should_succeed = true},
+
+ SelectTestParam{.test_name = "SelectCaseInsensitive",
+ .create_schema = []() { return BasicSchema(); },
+ .select_fields = {"Name"}, // case-insensitive
+ .expected_schema = []() { return MakeSchema(Name()); },
+ .should_succeed = true,
+ .case_sensitive = false}));
+
+INSTANTIATE_TEST_SUITE_P(
+ SelectNestedTestCases, SelectParamTest,
+ ::testing::Values(SelectTestParam{
+ .test_name = "SelectTopLevelFields",
+ .create_schema = []() { return AddressSchema(); },
+ .select_fields = {"id", "name"},
+ .expected_schema = []() { return MakeSchema(Id(),
Name()); },
+ .should_succeed = true},
+
+ SelectTestParam{.test_name = "SelectNestedField",
+ .create_schema = []() { return
AddressSchema(); },
+ .select_fields = {"address.street"},
+ .expected_schema =
+ []() {
+ auto address_type =
MakeStructType(Street());
+ auto address_field =
iceberg::SchemaField{
+ 14, "address",
std::move(address_type),
+ true};
+ return MakeSchema(address_field);
+ },
+ .should_succeed = true}));
+
+INSTANTIATE_TEST_SUITE_P(
+ SelectMultiLevelTestCases, SelectParamTest,
+ ::testing::Values(
+ SelectTestParam{.test_name = "SelectTopLevelAndNestedFields",
+ .create_schema = []() { return NestedUserSchema(); },
+ .select_fields = {"id", "user.name",
"user.address.street"},
+ .expected_schema =
+ []() {
+ auto address_type = MakeStructType(Street());
+ auto address_field = iceberg::SchemaField{
+ 16, "address", std::move(address_type),
true};
+ auto user_type = MakeStructType(Name(),
address_field);
+ auto user_field = iceberg::SchemaField{
+ 17, "user", std::move(user_type), true};
+ return MakeSchema(Id(), user_field);
+ },
+ .should_succeed = true},
+
+ SelectTestParam{.test_name = "SelectNestedFieldsAtDifferentLevels",
+ .create_schema = []() { return MultiLevelSchema(); },
+ .select_fields = {"user.profile.name",
"user.settings.theme"},
+ .expected_schema =
+ []() {
+ auto profile_type = MakeStructType(Name());
+ auto profile_field = iceberg::SchemaField{
+ 23, "profile", std::move(profile_type),
true};
+
+ auto settings_type = MakeStructType(Theme());
+ auto settings_field = iceberg::SchemaField{
+ 25, "settings", std::move(settings_type),
true};
+
+ auto user_type =
+ MakeStructType(profile_field,
settings_field);
+ auto user_field = iceberg::SchemaField{
+ 26, "user", std::move(user_type), true};
+ return MakeSchema(user_field);
+ },
+ .should_succeed = true},
+
+ SelectTestParam{.test_name = "SelectListAndNestedFields",
+ .create_schema = []() { return ListSchema(); },
+ .select_fields = {"id", "user.name"},
+ .expected_schema =
+ []() {
+ auto user_type = MakeStructType(Name());
+ auto user_field = iceberg::SchemaField{
+ 45, "user", std::move(user_type), true};
+ return MakeSchema(Id(), user_field);
+ },
+ .should_succeed = true}));
+
+struct ProjectTestParam {  // One parameterized case for Schema::Project.
+ std::string test_name;  // Label of the case; not read by the test body.
+ std::function<std::unique_ptr<iceberg::Schema>()> create_schema;  // Builds the input schema.
+ std::unordered_set<int32_t> selected_ids;  // Field IDs passed to Project().
+ std::function<std::unique_ptr<iceberg::Schema>()> expected_schema;  // Compared on success only.
+ bool should_succeed;  // Chooses the success or the error assertions.
+ std::string expected_error_message;  // Matched against the error on failure only.
+};
+
+class ProjectParamTest : public ::testing::TestWithParam<ProjectTestParam> {};  // Fixture for the Project cases.
+
+// Runs Project() on the case's input schema and checks either the projected schema
+// or the reported error, depending on `should_succeed`.
+TEST_P(ProjectParamTest, ProjectFields) {
+  const auto& test_case = GetParam();
+  auto schema = test_case.create_schema();
+  auto result = schema->Project(test_case.selected_ids);
+
+  if (!test_case.should_succeed) {
+    ASSERT_FALSE(result.has_value());
+    ASSERT_THAT(result, iceberg::IsError(iceberg::ErrorKind::kInvalidArgument));
+    ASSERT_THAT(result, iceberg::HasErrorMessage(test_case.expected_error_message));
+    return;
+  }
+
+  ASSERT_TRUE(result.has_value());
+  ASSERT_EQ(*result.value(), *test_case.expected_schema());
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ ProjectTestCases, ProjectParamTest,
+ ::testing::Values(ProjectTestParam{.test_name = "ProjectAllFields",
+ .create_schema = []() { return
BasicSchema(); },
+ .selected_ids = {1, 2, 3, 4},
+ .expected_schema = []() { return
BasicSchema(); },
+ .should_succeed = true},
+
+ ProjectTestParam{
+ .test_name = "ProjectSingleField",
+ .create_schema = []() { return BasicSchema(); },
+ .selected_ids = {2},
+ .expected_schema = []() { return MakeSchema(Name());
},
+ .should_succeed = true},
+
+ ProjectTestParam{.test_name =
"ProjectNonExistentFieldId",
+ .create_schema = []() { return
BasicSchema(); },
+ .selected_ids = {999},
+ .expected_schema = []() { return
MakeSchema(); },
+ .should_succeed = true},
+
+ ProjectTestParam{.test_name = "ProjectEmptySelection",
+ .create_schema = []() { return
BasicSchema(); },
+ .selected_ids = {},
+ .expected_schema = []() { return
MakeSchema(); },
+ .should_succeed = true}));
+
+INSTANTIATE_TEST_SUITE_P(ProjectNestedTestCases, ProjectParamTest,
+ ::testing::Values(ProjectTestParam{
+ .test_name = "ProjectNestedStructField",
+ .create_schema = []() { return AddressSchema(); },
+ .selected_ids = {11},
+ .expected_schema =
+ []() {
+ auto address_type =
MakeStructType(Street());
+ auto address_field = iceberg::SchemaField{
+ 14, "address", std::move(address_type),
true};
+ return MakeSchema(address_field);
+ },
+ .should_succeed = true}));
+
+INSTANTIATE_TEST_SUITE_P(
+ ProjectMultiLevelTestCases, ProjectParamTest,
+ ::testing::Values(
+ ProjectTestParam{.test_name = "ProjectTopLevelAndNestedFields",
+ .create_schema = []() { return NestedUserSchema(); },
+ .selected_ids = {1, 2, 11},
+ .expected_schema =
+ []() {
+ auto address_type = MakeStructType(Street());
+ auto address_field = iceberg::SchemaField{
+ 16, "address", std::move(address_type),
true};
+ auto user_type = MakeStructType(Name(),
address_field);
+ auto user_field = iceberg::SchemaField{
+ 17, "user", std::move(user_type), true};
+ return MakeSchema(Id(), user_field);
+ },
+ .should_succeed = true},
+
+ ProjectTestParam{.test_name = "ProjectNestedFieldsAtDifferentLevels",
+ .create_schema = []() { return MultiLevelSchema(); },
+ .selected_ids = {2, 24},
+ .expected_schema =
+ []() {
+ auto profile_type = MakeStructType(Name());
+ auto profile_field = iceberg::SchemaField{
+ 23, "profile", std::move(profile_type),
true};
+
+ auto settings_type = MakeStructType(Theme());
+ auto settings_field = iceberg::SchemaField{
+ 25, "settings", std::move(settings_type),
true};
+
+ auto user_type =
+ MakeStructType(profile_field,
settings_field);
+ auto user_field = iceberg::SchemaField{
+ 26, "user", std::move(user_type), true};
+ return MakeSchema(user_field);
+ },
+ .should_succeed = true},
+
+ ProjectTestParam{.test_name = "ProjectListAndNestedFields",
+ .create_schema = []() { return ListSchema(); },
+ .selected_ids = {1, 2},
+ .expected_schema =
+ []() {
+ auto user_type = MakeStructType(Name());
+ auto user_field = iceberg::SchemaField{
+ 45, "user", std::move(user_type), true};
+ return MakeSchema(Id(), user_field);
+ },
+ .should_succeed = true}));
+
+INSTANTIATE_TEST_SUITE_P(
+ ProjectMapErrorTestCases, ProjectParamTest,
+ ::testing::Values(ProjectTestParam{
+ .test_name = "ProjectMapWithOnlyKey",
+ .create_schema = []() { return MapSchema(); },
+ .selected_ids = {31}, // Only select key field, not value field
+ .expected_schema = []() { return nullptr; },
+ .should_succeed = false,
+ .expected_error_message = "Cannot project Map without value field"}));
+
+INSTANTIATE_TEST_SUITE_P(
+ ProjectListAndMapTestCases, ProjectParamTest,
+ ::testing::Values(
+ ProjectTestParam{.test_name = "ProjectListElement",
+ .create_schema = []() { return
ListWithStructElementSchema(); },
+ .selected_ids = {2}, // Only select name field from
list element
+ .expected_schema =
+ []() {
+ auto struct_type = MakeStructType(Name());
+ auto element_field = iceberg::SchemaField{
+ 53, "element", std::move(struct_type),
false};
+ auto list_type =
+
std::make_shared<iceberg::ListType>(element_field);
+ auto list_field = iceberg::SchemaField{
+ 54, "list_field", std::move(list_type),
true};
+ return MakeSchema(list_field);
+ },
+ .should_succeed = true},
+
+ ProjectTestParam{.test_name = "ProjectListOfMap",
+ .create_schema = []() { return ListOfMapSchema(); },
+ .selected_ids = {2, 3},
+ .expected_schema =
+ []() {
+ auto map_value_struct = MakeStructType(Name(),
Age());
+ auto map_value_field = iceberg::SchemaField{
+ 64, "value", std::move(map_value_struct),
false};
+ auto map_type =
std::make_shared<iceberg::MapType>(
+ Key(), map_value_field);
+ auto list_element = iceberg::SchemaField{
+ 65, "element", std::move(map_type), false};
+ auto list_type =
+
std::make_shared<iceberg::ListType>(list_element);
+ auto list_field = iceberg::SchemaField{
+ 66, "list_field", std::move(list_type),
true};
+ return MakeSchema(list_field);
+ },
+ .should_succeed = true},
+
+ ProjectTestParam{
+ .test_name = "ProjectMapKeyAndValue",
+ .create_schema = []() { return ComplexMapSchema(); },
+ .selected_ids = {71, 74},
+ .expected_schema =
+ []() {
+ auto key_id_field =
+ iceberg::SchemaField{71, "id", iceberg::int32(), false};
+ auto key_struct = MakeStructType(key_id_field);
+ auto key_field =
+ iceberg::SchemaField{73, "key", std::move(key_struct),
false};
+
+ auto value_id_field =
+ iceberg::SchemaField{74, "id", iceberg::int32(), false};
+ auto value_struct = MakeStructType(value_id_field);
+ auto value_field =
+ iceberg::SchemaField{76, "value",
std::move(value_struct), false};
+
+ auto map_type =
+ std::make_shared<iceberg::MapType>(key_field,
value_field);
+ auto map_field =
+ iceberg::SchemaField{77, "map_field",
std::move(map_type), true};
+ return MakeSchema(map_field);
+ },
+ .should_succeed = true},
+
+ ProjectTestParam{.test_name = "ProjectEmptyResult",
+ .create_schema = []() { return BasicSchema(); },
+ .selected_ids = {999}, // Select non-existent field
+ .expected_schema = []() { return MakeSchema(); },
+ .should_succeed = true}));
+
class SchemaThreadSafetyTest : public ::testing::Test {
protected:
void SetUp() override {