This is an automated email from the ASF dual-hosted git repository.

xuanwo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git


The following commit(s) were added to refs/heads/main by this push:
     new 257b1ad  feat: implement schema selection and projection methods (#207)
257b1ad is described below

commit 257b1adbfbb5fe2d5d500bbb5be2f2167fc3dedb
Author: chao liu <[email protected]>
AuthorDate: Tue Sep 23 17:52:57 2025 +0800

    feat: implement schema selection and projection methods (#207)
    
    - Added select and project methods to the Schema class for creating
    projection schemas based on specified field names or IDs.
    - Introduced PruneColumnVisitor to handle the logic for selecting and
    projecting fields, including support for nested structures.
    
    ---------
    
    Co-authored-by: nullccxsy <[email protected]>
---
 src/iceberg/schema.cc          | 145 +++++++++++++
 src/iceberg/schema.h           |  34 +++-
 src/iceberg/schema_internal.cc |   5 +
 src/iceberg/schema_internal.h  |   2 +
 test/schema_test.cc            | 453 ++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 631 insertions(+), 8 deletions(-)

diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc
index 1df20c6..2ab2f7e 100644
--- a/src/iceberg/schema.cc
+++ b/src/iceberg/schema.cc
@@ -22,6 +22,7 @@
 #include <format>
 #include <functional>
 
+#include "iceberg/schema_internal.h"
 #include "iceberg/type.h"
 #include "iceberg/util/formatter.h"  // IWYU pragma: keep
 #include "iceberg/util/macros.h"
@@ -260,4 +261,148 @@ void NameToIdVisitor::Finish() {
   }
 }
 
+/// \brief Visitor for pruning columns based on selected field IDs.
+///
+/// This visitor traverses a schema and creates a projected version containing only
+/// the specified fields. When `select_full_types` is true, a field with all its
+/// sub-fields are selected if its field-id has been selected; otherwise, only leaf
+/// fields of selected field-ids are selected.
+///
+/// \note It returns an error when projection is not successful.
+class PruneColumnVisitor {
+ public:
+  PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
+                     bool select_full_types)
+      : selected_ids_(selected_ids), select_full_types_(select_full_types) {}
+
+  Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<Type>& type) const 
{
+    switch (type->type_id()) {
+      case TypeId::kStruct:
+        return Visit(internal::checked_pointer_cast<StructType>(type));
+      case TypeId::kList:
+        return Visit(internal::checked_pointer_cast<ListType>(type));
+      case TypeId::kMap:
+        return Visit(internal::checked_pointer_cast<MapType>(type));
+      default:
+        return nullptr;
+    }
+  }
+
+  Result<std::shared_ptr<Type>> Visit(const SchemaField& field) const {
+    if (selected_ids_.contains(field.field_id())) {
+      return (select_full_types_ || field.type()->is_primitive()) ? 
field.type()
+                                                                  : 
Visit(field.type());
+    }
+    return Visit(field.type());
+  }
+
+  static SchemaField MakeField(const SchemaField& field, std::shared_ptr<Type> 
type) {
+    return {field.field_id(), std::string(field.name()), std::move(type),
+            field.optional(), std::string(field.doc())};
+  }
+
+  Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<StructType>& type) 
const {
+    bool same_types = true;
+    std::vector<SchemaField> selected_fields;
+    for (const auto& field : type->fields()) {
+      ICEBERG_ASSIGN_OR_RAISE(auto child_type, Visit(field));
+      if (child_type) {
+        same_types = same_types && (child_type == field.type());
+        selected_fields.emplace_back(MakeField(field, std::move(child_type)));
+      }
+    }
+
+    if (selected_fields.empty()) {
+      return nullptr;
+    } else if (same_types && selected_fields.size() == type->fields().size()) {
+      return type;
+    }
+    return std::make_shared<StructType>(std::move(selected_fields));
+  }
+
+  Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<ListType>& type) 
const {
+    const auto& elem_field = type->fields()[0];
+    ICEBERG_ASSIGN_OR_RAISE(auto elem_type, Visit(elem_field));
+    if (elem_type == nullptr) {
+      return nullptr;
+    } else if (elem_type == elem_field.type()) {
+      return type;
+    }
+    return std::make_shared<ListType>(MakeField(elem_field, 
std::move(elem_type)));
+  }
+
+  Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<MapType>& type) 
const {
+    const auto& key_field = type->fields()[0];
+    const auto& value_field = type->fields()[1];
+    ICEBERG_ASSIGN_OR_RAISE(auto key_type, Visit(key_field));
+    ICEBERG_ASSIGN_OR_RAISE(auto value_type, Visit(value_field));
+
+    if (key_type == nullptr && value_type == nullptr) {
+      return nullptr;
+    } else if (value_type == value_field.type() &&
+               (key_type == key_field.type() || key_type == nullptr)) {
+      return type;
+    } else if (value_type == nullptr) {
+      return InvalidArgument("Cannot project Map without value field");
+    }
+    return std::make_shared<MapType>(
+        (key_type == nullptr ? key_field : MakeField(key_field, 
std::move(key_type))),
+        MakeField(value_field, std::move(value_type)));
+  }
+
+ private:
+  const std::unordered_set<int32_t>& selected_ids_;
+  const bool select_full_types_;
+};
+
+Result<std::unique_ptr<Schema>> Schema::Select(std::span<const std::string> 
names,
+                                               bool case_sensitive) const {
+  const std::string kAllColumns = "*";
+  if (std::ranges::find(names, kAllColumns) != names.end()) {
+    auto struct_type = ToStructType(*this);
+    return FromStructType(std::move(*struct_type), std::nullopt);
+  }
+
+  std::unordered_set<int32_t> selected_ids;
+  for (const auto& name : names) {
+    ICEBERG_ASSIGN_OR_RAISE(auto result, FindFieldByName(name, 
case_sensitive));
+    if (result.has_value()) {
+      selected_ids.insert(result.value().get().field_id());
+    }
+  }
+
+  PruneColumnVisitor visitor(selected_ids, /*select_full_types=*/true);
+  ICEBERG_ASSIGN_OR_RAISE(
+      auto pruned_type, 
visitor.Visit(std::shared_ptr<StructType>(ToStructType(*this))));
+
+  if (!pruned_type) {
+    return std::make_unique<Schema>(std::vector<SchemaField>{}, std::nullopt);
+  }
+
+  if (pruned_type->type_id() != TypeId::kStruct) {
+    return InvalidSchema("Projected type must be a struct type");
+  }
+
+  return 
FromStructType(std::move(internal::checked_cast<StructType&>(*pruned_type)),
+                        std::nullopt);
+}
+
+Result<std::unique_ptr<Schema>> Schema::Project(
+    const std::unordered_set<int32_t>& field_ids) const {
+  PruneColumnVisitor visitor(field_ids, /*select_full_types=*/false);
+  ICEBERG_ASSIGN_OR_RAISE(
+      auto project_type, 
visitor.Visit(std::shared_ptr<StructType>(ToStructType(*this))));
+
+  if (!project_type) {
+    return std::make_unique<Schema>(std::vector<SchemaField>{}, std::nullopt);
+  }
+
+  if (project_type->type_id() != TypeId::kStruct) {
+    return InvalidSchema("Projected type must be a struct type");
+  }
+
+  return 
FromStructType(std::move(internal::checked_cast<StructType&>(*project_type)),
+                        std::nullopt);
+}
+
 }  // namespace iceberg
diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h
index 260d9d3..81f9aa3 100644
--- a/src/iceberg/schema.h
+++ b/src/iceberg/schema.h
@@ -27,6 +27,7 @@
 #include <mutex>
 #include <optional>
 #include <string>
+#include <unordered_set>
 #include <vector>
 
 #include "iceberg/iceberg_export.h"
@@ -53,9 +54,9 @@ class ICEBERG_EXPORT Schema : public StructType {
   ///
   /// A schema is identified by a unique ID for the purposes of schema
   /// evolution.
-  [[nodiscard]] std::optional<int32_t> schema_id() const;
+  std::optional<int32_t> schema_id() const;
 
-  [[nodiscard]] std::string ToString() const override;
+  std::string ToString() const override;
 
   /// \brief Find the SchemaField by field name.
   ///
@@ -66,18 +67,37 @@ class ICEBERG_EXPORT Schema : public StructType {
   /// canonical name 'm.value.x'
   /// FIXME: Currently only handles ASCII lowercase conversion; extend to support
   /// non-ASCII characters (e.g., using std::towlower or ICU)
-  [[nodiscard]] Result<std::optional<std::reference_wrapper<const 
SchemaField>>>
-  FindFieldByName(std::string_view name, bool case_sensitive = true) const;
+  Result<std::optional<std::reference_wrapper<const SchemaField>>> 
FindFieldByName(
+      std::string_view name, bool case_sensitive = true) const;
 
   /// \brief Find the SchemaField by field id.
-  [[nodiscard]] Result<std::optional<std::reference_wrapper<const 
SchemaField>>>
-  FindFieldById(int32_t field_id) const;
+  Result<std::optional<std::reference_wrapper<const SchemaField>>> 
FindFieldById(
+      int32_t field_id) const;
+
+  /// \brief Creates a projected schema from selected field names.
+  ///
+  /// \param names Selected field names and nested names are dot-concatenated.
+  /// \param case_sensitive Whether name matching is case-sensitive (default: true).
+  /// \return Projected schema containing only selected fields.
+  /// \note If the field name of a nested type has been selected, all of its
+  /// sub-fields will be selected.
+  Result<std::unique_ptr<Schema>> Select(std::span<const std::string> names,
+                                         bool case_sensitive = true) const;
+
+  /// \brief Creates a projected schema from selected field IDs.
+  ///
+  /// \param field_ids Set of field IDs to select
+  /// \return Projected schema containing only the specified fields.
+  /// \note Field ID of a nested field may not be projected unless at least
+  /// one of its sub-fields has been projected.
+  Result<std::unique_ptr<Schema>> Project(
+      const std::unordered_set<int32_t>& field_ids) const;
 
   friend bool operator==(const Schema& lhs, const Schema& rhs) { return 
lhs.Equals(rhs); }
 
  private:
   /// \brief Compare two schemas for equality.
-  [[nodiscard]] bool Equals(const Schema& other) const;
+  bool Equals(const Schema& other) const;
 
   Status InitIdToFieldMap() const;
   Status InitNameToIdMap() const;
diff --git a/src/iceberg/schema_internal.cc b/src/iceberg/schema_internal.cc
index beb973b..e020a9b 100644
--- a/src/iceberg/schema_internal.cc
+++ b/src/iceberg/schema_internal.cc
@@ -325,4 +325,9 @@ Result<std::unique_ptr<Schema>> FromArrowSchema(const 
ArrowSchema& schema,
   return FromStructType(std::move(struct_type), schema_id);
 }
 
+std::unique_ptr<StructType> ToStructType(const Schema& schema) {
+  std::vector<SchemaField> fields(schema.fields().begin(), 
schema.fields().end());
+  return std::make_unique<StructType>(std::move(fields));
+}
+
 }  // namespace iceberg
diff --git a/src/iceberg/schema_internal.h b/src/iceberg/schema_internal.h
index 8b29085..5c7209d 100644
--- a/src/iceberg/schema_internal.h
+++ b/src/iceberg/schema_internal.h
@@ -53,4 +53,6 @@ Result<std::unique_ptr<Schema>> FromArrowSchema(const 
ArrowSchema& schema,
 std::unique_ptr<Schema> FromStructType(StructType&& struct_type,
                                        std::optional<int32_t> schema_id);
 
+std::unique_ptr<StructType> ToStructType(const Schema& schema);
+
 }  // namespace iceberg
diff --git a/test/schema_test.cc b/test/schema_test.cc
index b01ffe9..3d10fb8 100644
--- a/test/schema_test.cc
+++ b/test/schema_test.cc
@@ -26,10 +26,24 @@
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 
+#include "gtest/gtest.h"
+#include "iceberg/result.h"
 #include "iceberg/schema_field.h"
 #include "iceberg/util/formatter.h"  // IWYU pragma: keep
 #include "matchers.h"
 
+template <typename... Args>
+std::shared_ptr<iceberg::StructType> MakeStructType(Args&&... args) {
+  return std::make_shared<iceberg::StructType>(
+      std::vector<iceberg::SchemaField>{std::move(args)...});
+}
+
+template <typename... Args>
+std::unique_ptr<iceberg::Schema> MakeSchema(Args&&... args) {
+  return std::make_unique<iceberg::Schema>(
+      std::vector<iceberg::SchemaField>{std::move(args)...}, std::nullopt);
+}
+
 TEST(SchemaTest, Basics) {
   {
     iceberg::SchemaField field1(5, "foo", iceberg::int32(), true);
@@ -492,7 +506,444 @@ TEST(SchemaTest, NestedDuplicateFieldIdError) {
               ::testing::HasSubstr("Duplicate field id found: 1"));
 }
 
-// Thread safety tests for Lazy Init
+namespace {
+
+iceberg::SchemaField Id() { return {1, "id", iceberg::int32(), true}; }
+iceberg::SchemaField Name() { return {2, "name", iceberg::string(), false}; }
+iceberg::SchemaField Age() { return {3, "age", iceberg::int32(), true}; }
+iceberg::SchemaField Email() { return {4, "email", iceberg::string(), true}; }
+iceberg::SchemaField Street() { return {11, "street", iceberg::string(), 
true}; }
+iceberg::SchemaField City() { return {12, "city", iceberg::string(), true}; }
+iceberg::SchemaField Zip() { return {13, "zip", iceberg::int32(), true}; }
+iceberg::SchemaField Theme() { return {24, "theme", iceberg::string(), true}; }
+iceberg::SchemaField Key() { return {31, "key", iceberg::int32(), false}; }
+iceberg::SchemaField Value() { return {32, "value", iceberg::string(), false}; 
}
+iceberg::SchemaField Element() { return {41, "element", iceberg::string(), 
false}; }
+
+static std::unique_ptr<iceberg::Schema> BasicSchema() {
+  return MakeSchema(Id(), Name(), Age(), Email());
+}
+
+static std::unique_ptr<iceberg::Schema> AddressSchema() {
+  auto address_type = MakeStructType(Street(), City(), Zip());
+  auto address_field = iceberg::SchemaField{14, "address", 
std::move(address_type), true};
+  return MakeSchema(Id(), Name(), std::move(address_field));
+}
+
+static std::unique_ptr<iceberg::Schema> NestedUserSchema() {
+  auto address_type = MakeStructType(Street(), City());
+  auto address_field = iceberg::SchemaField{16, "address", 
std::move(address_type), true};
+  auto user_type = MakeStructType(Name(), address_field);
+  auto user_field = iceberg::SchemaField{17, "user", std::move(user_type), 
true};
+  return MakeSchema(Id(), user_field);
+}
+
+static std::unique_ptr<iceberg::Schema> MultiLevelSchema() {
+  auto profile_type = MakeStructType(Name(), Age());
+  auto profile_field = iceberg::SchemaField{23, "profile", 
std::move(profile_type), true};
+
+  auto settings_type = MakeStructType(Theme());
+  auto settings_field =
+      iceberg::SchemaField{25, "settings", std::move(settings_type), true};
+
+  auto user_type = MakeStructType(profile_field, settings_field);
+  auto user_field = iceberg::SchemaField{26, "user", std::move(user_type), 
true};
+
+  return MakeSchema(Id(), user_field);
+}
+
+static std::unique_ptr<iceberg::Schema> ListSchema() {
+  auto list_type = std::make_shared<iceberg::ListType>(Element());
+  auto tags_field = iceberg::SchemaField{42, "tags", std::move(list_type), 
true};
+
+  auto user_type = MakeStructType(Name(), Age());
+  auto user_field = iceberg::SchemaField{45, "user", std::move(user_type), 
true};
+
+  return MakeSchema(Id(), tags_field, user_field);
+}
+
+static std::unique_ptr<iceberg::Schema> MapSchema() {
+  auto map_type = std::make_shared<iceberg::MapType>(Key(), Value());
+  auto map_field = iceberg::SchemaField{33, "map_field", std::move(map_type), 
true};
+  return MakeSchema(map_field);
+}
+
+static std::unique_ptr<iceberg::Schema> ListWithStructElementSchema() {
+  auto struct_type = MakeStructType(Name(), Age());
+  auto element_field = iceberg::SchemaField{53, "element", 
std::move(struct_type), false};
+  auto list_type = std::make_shared<iceberg::ListType>(element_field);
+  auto list_field = iceberg::SchemaField{54, "list_field", 
std::move(list_type), true};
+  return MakeSchema(list_field);
+}
+
+static std::unique_ptr<iceberg::Schema> ListOfMapSchema() {
+  auto map_value_struct = MakeStructType(Name(), Age());
+  auto map_value_field =
+      iceberg::SchemaField{64, "value", std::move(map_value_struct), false};
+  auto map_type = std::make_shared<iceberg::MapType>(Key(), map_value_field);
+  auto list_element = iceberg::SchemaField{65, "element", std::move(map_type), 
false};
+  auto list_type = std::make_shared<iceberg::ListType>(list_element);
+  auto list_field = iceberg::SchemaField{66, "list_field", 
std::move(list_type), true};
+  return MakeSchema(list_field);
+}
+
+static std::unique_ptr<iceberg::Schema> ComplexMapSchema() {
+  auto key_id_field = iceberg::SchemaField{71, "id", iceberg::int32(), false};
+  auto key_name_field = iceberg::SchemaField{72, "name", iceberg::string(), 
false};
+  auto key_struct = MakeStructType(key_id_field, key_name_field);
+  auto key_field = iceberg::SchemaField{73, "key", std::move(key_struct), 
false};
+
+  auto value_id_field = iceberg::SchemaField{74, "id", iceberg::int32(), 
false};
+  auto value_name_field = iceberg::SchemaField{75, "name", iceberg::string(), 
false};
+  auto value_struct = MakeStructType(value_id_field, value_name_field);
+  auto value_field = iceberg::SchemaField{76, "value", 
std::move(value_struct), false};
+
+  auto map_type = std::make_shared<iceberg::MapType>(key_field, value_field);
+  auto map_field = iceberg::SchemaField{77, "map_field", std::move(map_type), 
true};
+  return MakeSchema(map_field);
+}
+}  // namespace
+
+struct SelectTestParam {
+  std::string test_name;
+  std::function<std::unique_ptr<iceberg::Schema>()> create_schema;
+  std::vector<std::string> select_fields;
+  std::function<std::unique_ptr<iceberg::Schema>()> expected_schema;
+  bool should_succeed;
+  std::string expected_error_message;
+  bool case_sensitive = true;
+};
+
+class SelectParamTest : public ::testing::TestWithParam<SelectTestParam> {};
+
+TEST_P(SelectParamTest, SelectFields) {
+  const auto& param = GetParam();
+  auto input_schema = param.create_schema();
+  auto result = input_schema->Select(param.select_fields, 
param.case_sensitive);
+
+  if (param.should_succeed) {
+    ASSERT_TRUE(result.has_value());
+    ASSERT_EQ(*result.value(), *param.expected_schema());
+  } else {
+    ASSERT_FALSE(result.has_value());
+    ASSERT_THAT(result, 
iceberg::IsError(iceberg::ErrorKind::kInvalidArgument));
+    ASSERT_THAT(result, 
iceberg::HasErrorMessage(param.expected_error_message));
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    SelectTestCases, SelectParamTest,
+    ::testing::Values(
+        SelectTestParam{.test_name = "SelectAllColumns",
+                        .create_schema = []() { return BasicSchema(); },
+                        .select_fields = {"*"},
+                        .expected_schema = []() { return BasicSchema(); },
+                        .should_succeed = true},
+
+        SelectTestParam{.test_name = "SelectSingleField",
+                        .create_schema = []() { return BasicSchema(); },
+                        .select_fields = {"name"},
+                        .expected_schema = []() { return MakeSchema(Name()); },
+                        .should_succeed = true},
+
+        SelectTestParam{
+            .test_name = "SelectMultipleFields",
+            .create_schema = []() { return BasicSchema(); },
+            .select_fields = {"id", "name", "age"},
+            .expected_schema = []() { return MakeSchema(Id(), Name(), Age()); 
},
+            .should_succeed = true},
+
+        SelectTestParam{.test_name = "SelectNonExistentField",
+                        .create_schema = []() { return BasicSchema(); },
+                        .select_fields = {"nonexistent"},
+                        .expected_schema = []() { return MakeSchema(); },
+                        .should_succeed = true},
+
+        SelectTestParam{.test_name = "SelectCaseSensitive",
+                        .create_schema = []() { return BasicSchema(); },
+                        .select_fields = {"Name"},  // case-sensitive
+                        .expected_schema = []() { return MakeSchema(); },
+                        .should_succeed = true},
+
+        SelectTestParam{.test_name = "SelectCaseInsensitive",
+                        .create_schema = []() { return BasicSchema(); },
+                        .select_fields = {"Name"},  // case-insensitive
+                        .expected_schema = []() { return MakeSchema(Name()); },
+                        .should_succeed = true,
+                        .case_sensitive = false}));
+
+INSTANTIATE_TEST_SUITE_P(
+    SelectNestedTestCases, SelectParamTest,
+    ::testing::Values(SelectTestParam{
+                          .test_name = "SelectTopLevelFields",
+                          .create_schema = []() { return AddressSchema(); },
+                          .select_fields = {"id", "name"},
+                          .expected_schema = []() { return MakeSchema(Id(), 
Name()); },
+                          .should_succeed = true},
+
+                      SelectTestParam{.test_name = "SelectNestedField",
+                                      .create_schema = []() { return 
AddressSchema(); },
+                                      .select_fields = {"address.street"},
+                                      .expected_schema =
+                                          []() {
+                                            auto address_type = 
MakeStructType(Street());
+                                            auto address_field = 
iceberg::SchemaField{
+                                                14, "address", 
std::move(address_type),
+                                                true};
+                                            return MakeSchema(address_field);
+                                          },
+                                      .should_succeed = true}));
+
+INSTANTIATE_TEST_SUITE_P(
+    SelectMultiLevelTestCases, SelectParamTest,
+    ::testing::Values(
+        SelectTestParam{.test_name = "SelectTopLevelAndNestedFields",
+                        .create_schema = []() { return NestedUserSchema(); },
+                        .select_fields = {"id", "user.name", 
"user.address.street"},
+                        .expected_schema =
+                            []() {
+                              auto address_type = MakeStructType(Street());
+                              auto address_field = iceberg::SchemaField{
+                                  16, "address", std::move(address_type), 
true};
+                              auto user_type = MakeStructType(Name(), 
address_field);
+                              auto user_field = iceberg::SchemaField{
+                                  17, "user", std::move(user_type), true};
+                              return MakeSchema(Id(), user_field);
+                            },
+                        .should_succeed = true},
+
+        SelectTestParam{.test_name = "SelectNestedFieldsAtDifferentLevels",
+                        .create_schema = []() { return MultiLevelSchema(); },
+                        .select_fields = {"user.profile.name", 
"user.settings.theme"},
+                        .expected_schema =
+                            []() {
+                              auto profile_type = MakeStructType(Name());
+                              auto profile_field = iceberg::SchemaField{
+                                  23, "profile", std::move(profile_type), 
true};
+
+                              auto settings_type = MakeStructType(Theme());
+                              auto settings_field = iceberg::SchemaField{
+                                  25, "settings", std::move(settings_type), 
true};
+
+                              auto user_type =
+                                  MakeStructType(profile_field, 
settings_field);
+                              auto user_field = iceberg::SchemaField{
+                                  26, "user", std::move(user_type), true};
+                              return MakeSchema(user_field);
+                            },
+                        .should_succeed = true},
+
+        SelectTestParam{.test_name = "SelectListAndNestedFields",
+                        .create_schema = []() { return ListSchema(); },
+                        .select_fields = {"id", "user.name"},
+                        .expected_schema =
+                            []() {
+                              auto user_type = MakeStructType(Name());
+                              auto user_field = iceberg::SchemaField{
+                                  45, "user", std::move(user_type), true};
+                              return MakeSchema(Id(), user_field);
+                            },
+                        .should_succeed = true}));
+
+struct ProjectTestParam {
+  std::string test_name;
+  std::function<std::unique_ptr<iceberg::Schema>()> create_schema;
+  std::unordered_set<int32_t> selected_ids;
+  std::function<std::unique_ptr<iceberg::Schema>()> expected_schema;
+  bool should_succeed;
+  std::string expected_error_message;
+};
+
+class ProjectParamTest : public ::testing::TestWithParam<ProjectTestParam> {};
+
+TEST_P(ProjectParamTest, ProjectFields) {
+  const auto& param = GetParam();
+  auto input_schema = param.create_schema();
+  auto result = input_schema->Project(param.selected_ids);
+
+  if (param.should_succeed) {
+    ASSERT_TRUE(result.has_value());
+    ASSERT_EQ(*result.value(), *param.expected_schema());
+  } else {
+    ASSERT_FALSE(result.has_value());
+    ASSERT_THAT(result, 
iceberg::IsError(iceberg::ErrorKind::kInvalidArgument));
+    ASSERT_THAT(result, 
iceberg::HasErrorMessage(param.expected_error_message));
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    ProjectTestCases, ProjectParamTest,
+    ::testing::Values(ProjectTestParam{.test_name = "ProjectAllFields",
+                                       .create_schema = []() { return 
BasicSchema(); },
+                                       .selected_ids = {1, 2, 3, 4},
+                                       .expected_schema = []() { return 
BasicSchema(); },
+                                       .should_succeed = true},
+
+                      ProjectTestParam{
+                          .test_name = "ProjectSingleField",
+                          .create_schema = []() { return BasicSchema(); },
+                          .selected_ids = {2},
+                          .expected_schema = []() { return MakeSchema(Name()); 
},
+                          .should_succeed = true},
+
+                      ProjectTestParam{.test_name = 
"ProjectNonExistentFieldId",
+                                       .create_schema = []() { return 
BasicSchema(); },
+                                       .selected_ids = {999},
+                                       .expected_schema = []() { return 
MakeSchema(); },
+                                       .should_succeed = true},
+
+                      ProjectTestParam{.test_name = "ProjectEmptySelection",
+                                       .create_schema = []() { return 
BasicSchema(); },
+                                       .selected_ids = {},
+                                       .expected_schema = []() { return 
MakeSchema(); },
+                                       .should_succeed = true}));
+
+INSTANTIATE_TEST_SUITE_P(ProjectNestedTestCases, ProjectParamTest,
+                         ::testing::Values(ProjectTestParam{
+                             .test_name = "ProjectNestedStructField",
+                             .create_schema = []() { return AddressSchema(); },
+                             .selected_ids = {11},
+                             .expected_schema =
+                                 []() {
+                                   auto address_type = 
MakeStructType(Street());
+                                   auto address_field = iceberg::SchemaField{
+                                       14, "address", std::move(address_type), 
true};
+                                   return MakeSchema(address_field);
+                                 },
+                             .should_succeed = true}));
+
+INSTANTIATE_TEST_SUITE_P(
+    ProjectMultiLevelTestCases, ProjectParamTest,
+    ::testing::Values(
+        ProjectTestParam{.test_name = "ProjectTopLevelAndNestedFields",
+                         .create_schema = []() { return NestedUserSchema(); },
+                         .selected_ids = {1, 2, 11},
+                         .expected_schema =
+                             []() {
+                               auto address_type = MakeStructType(Street());
+                               auto address_field = iceberg::SchemaField{
+                                   16, "address", std::move(address_type), 
true};
+                               auto user_type = MakeStructType(Name(), 
address_field);
+                               auto user_field = iceberg::SchemaField{
+                                   17, "user", std::move(user_type), true};
+                               return MakeSchema(Id(), user_field);
+                             },
+                         .should_succeed = true},
+
+        ProjectTestParam{.test_name = "ProjectNestedFieldsAtDifferentLevels",
+                         .create_schema = []() { return MultiLevelSchema(); },
+                         .selected_ids = {2, 24},
+                         .expected_schema =
+                             []() {
+                               auto profile_type = MakeStructType(Name());
+                               auto profile_field = iceberg::SchemaField{
+                                   23, "profile", std::move(profile_type), 
true};
+
+                               auto settings_type = MakeStructType(Theme());
+                               auto settings_field = iceberg::SchemaField{
+                                   25, "settings", std::move(settings_type), 
true};
+
+                               auto user_type =
+                                   MakeStructType(profile_field, 
settings_field);
+                               auto user_field = iceberg::SchemaField{
+                                   26, "user", std::move(user_type), true};
+                               return MakeSchema(user_field);
+                             },
+                         .should_succeed = true},
+
+        ProjectTestParam{.test_name = "ProjectListAndNestedFields",
+                         .create_schema = []() { return ListSchema(); },
+                         .selected_ids = {1, 2},
+                         .expected_schema =
+                             []() {
+                               auto user_type = MakeStructType(Name());
+                               auto user_field = iceberg::SchemaField{
+                                   45, "user", std::move(user_type), true};
+                               return MakeSchema(Id(), user_field);
+                             },
+                         .should_succeed = true}));
+
+INSTANTIATE_TEST_SUITE_P(
+    ProjectMapErrorTestCases, ProjectParamTest,
+    ::testing::Values(ProjectTestParam{
+        .test_name = "ProjectMapWithOnlyKey",
+        .create_schema = []() { return MapSchema(); },
+        .selected_ids = {31},  // Only select key field, not value field
+        .expected_schema = []() { return nullptr; },
+        .should_succeed = false,
+        .expected_error_message = "Cannot project Map without value field"}));
+
+INSTANTIATE_TEST_SUITE_P(
+    ProjectListAndMapTestCases, ProjectParamTest,
+    ::testing::Values(
+        ProjectTestParam{.test_name = "ProjectListElement",
+                         .create_schema = []() { return 
ListWithStructElementSchema(); },
+                         .selected_ids = {2},  // Only select name field from 
list element
+                         .expected_schema =
+                             []() {
+                               auto struct_type = MakeStructType(Name());
+                               auto element_field = iceberg::SchemaField{
+                                   53, "element", std::move(struct_type), 
false};
+                               auto list_type =
+                                   
std::make_shared<iceberg::ListType>(element_field);
+                               auto list_field = iceberg::SchemaField{
+                                   54, "list_field", std::move(list_type), 
true};
+                               return MakeSchema(list_field);
+                             },
+                         .should_succeed = true},
+
+        ProjectTestParam{.test_name = "ProjectListOfMap",
+                         .create_schema = []() { return ListOfMapSchema(); },
+                         .selected_ids = {2, 3},
+                         .expected_schema =
+                             []() {
+                               auto map_value_struct = MakeStructType(Name(), 
Age());
+                               auto map_value_field = iceberg::SchemaField{
+                                   64, "value", std::move(map_value_struct), 
false};
+                               auto map_type = 
std::make_shared<iceberg::MapType>(
+                                   Key(), map_value_field);
+                               auto list_element = iceberg::SchemaField{
+                                   65, "element", std::move(map_type), false};
+                               auto list_type =
+                                   
std::make_shared<iceberg::ListType>(list_element);
+                               auto list_field = iceberg::SchemaField{
+                                   66, "list_field", std::move(list_type), 
true};
+                               return MakeSchema(list_field);
+                             },
+                         .should_succeed = true},
+
+        ProjectTestParam{
+            .test_name = "ProjectMapKeyAndValue",
+            .create_schema = []() { return ComplexMapSchema(); },
+            .selected_ids = {71, 74},
+            .expected_schema =
+                []() {
+                  auto key_id_field =
+                      iceberg::SchemaField{71, "id", iceberg::int32(), false};
+                  auto key_struct = MakeStructType(key_id_field);
+                  auto key_field =
+                      iceberg::SchemaField{73, "key", std::move(key_struct), 
false};
+
+                  auto value_id_field =
+                      iceberg::SchemaField{74, "id", iceberg::int32(), false};
+                  auto value_struct = MakeStructType(value_id_field);
+                  auto value_field =
+                      iceberg::SchemaField{76, "value", 
std::move(value_struct), false};
+
+                  auto map_type =
+                      std::make_shared<iceberg::MapType>(key_field, 
value_field);
+                  auto map_field =
+                      iceberg::SchemaField{77, "map_field", 
std::move(map_type), true};
+                  return MakeSchema(map_field);
+                },
+            .should_succeed = true},
+
+        ProjectTestParam{.test_name = "ProjectEmptyResult",
+                         .create_schema = []() { return BasicSchema(); },
+                         .selected_ids = {999},  // Select non-existent field
+                         .expected_schema = []() { return MakeSchema(); },
+                         .should_succeed = true}));
+
 class SchemaThreadSafetyTest : public ::testing::Test {
  protected:
   void SetUp() override {


Reply via email to