wgtmac commented on code in PR #207:
URL: https://github.com/apache/iceberg-cpp/pull/207#discussion_r2328730429
##########
src/iceberg/schema.cc:
##########
@@ -257,4 +258,268 @@ void NameToIdVisitor::Finish() {
}
}
+/// \brief Visitor class for pruning schema columns based on selected field
IDs.
+///
+/// This visitor traverses a schema and creates a projected version containing
only
+/// the specified fields. It handles different projection modes:
+/// - select_full_types=true: Include entire fields when their ID is selected
+/// - select_full_types=false: Recursively project nested fields within
selected structs
+///
+/// \warning Error conditions that will cause projection to fail:
+/// - Attempting to explicitly project List or Map types (returns
InvalidArgument)
+/// - Projecting a List when element result is null (returns InvalidArgument)
+/// - Projecting a Map without a defined map value type (returns
InvalidArgument)
+/// - Projecting a struct when result is not StructType (returns
InvalidArgument)
Review Comment:
I think it is easy and valid to support projections in the nested map and
list types and don't know why the Java impl does not support this. The code
will be much simpler (shorter) if we support them.
@Fokko Do you have any context on the Java impl?
##########
src/iceberg/schema.cc:
##########
@@ -257,4 +258,268 @@ void NameToIdVisitor::Finish() {
}
}
+/// \brief Visitor class for pruning schema columns based on selected field
IDs.
+///
+/// This visitor traverses a schema and creates a projected version containing
only
+/// the specified fields. It handles different projection modes:
+/// - select_full_types=true: Include entire fields when their ID is selected
+/// - select_full_types=false: Recursively project nested fields within
selected structs
+///
+/// \warning Error conditions that will cause projection to fail:
+/// - Attempting to explicitly project List or Map types (returns
InvalidArgument)
+/// - Projecting a List when element result is null (returns InvalidArgument)
+/// - Projecting a Map without a defined map value type (returns
InvalidArgument)
+/// - Projecting a struct when result is not StructType (returns
InvalidArgument)
+class PruneColumnVisitor {
+ public:
+ PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
+ bool select_full_types)
+ : selected_ids_(selected_ids), select_full_types_(select_full_types) {}
+
+ Status Visit(const StructType& type, std::unique_ptr<Type>* out) const {
Review Comment:
```suggestion
Status Visit(const std::shared_ptr<StructType>& type,
std::shared_ptr<Type>* out) const {
```
I still think we need to change the signature to this to enable returning
original type (which is a common case for select). Or even better, we can
simplify it as `Result<std::shared_ptr<Type>> Visit(const
std::shared_ptr<StructType>& type) const` and add our own dispatching in the
`Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<Type>& type) const`.
##########
src/iceberg/schema.cc:
##########
@@ -257,4 +258,268 @@ void NameToIdVisitor::Finish() {
}
}
+/// \brief Visitor class for pruning schema columns based on selected field
IDs.
+///
+/// This visitor traverses a schema and creates a projected version containing
only
+/// the specified fields. It handles different projection modes:
+/// - select_full_types=true: Include entire fields when their ID is selected
+/// - select_full_types=false: Recursively project nested fields within
selected structs
+///
+/// \warning Error conditions that will cause projection to fail:
+/// - Attempting to explicitly project List or Map types (returns
InvalidArgument)
+/// - Projecting a List when element result is null (returns InvalidArgument)
+/// - Projecting a Map without a defined map value type (returns
InvalidArgument)
+/// - Projecting a struct when result is not StructType (returns
InvalidArgument)
+class PruneColumnVisitor {
+ public:
+ PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
+ bool select_full_types)
+ : selected_ids_(selected_ids), select_full_types_(select_full_types) {}
+
+ Status Visit(const StructType& type, std::unique_ptr<Type>* out) const {
+ std::vector<std::shared_ptr<const Type>> selected_types;
+ for (const auto& field : type.fields()) {
Review Comment:
There are a lot of duplicate (or at least similar code) in this visitor
class and can be greatly simplified by adding a Visit function for `const
SchemaField&`. Please check that Struct, List, and Map are all nested types
with a list of `SchemaField`s.
##########
src/iceberg/schema.cc:
##########
@@ -257,4 +258,268 @@ void NameToIdVisitor::Finish() {
}
}
+/// \brief Visitor class for pruning schema columns based on selected field
IDs.
+///
+/// This visitor traverses a schema and creates a projected version containing
only
+/// the specified fields. It handles different projection modes:
+/// - select_full_types=true: Include entire fields when their ID is selected
+/// - select_full_types=false: Recursively project nested fields within
selected structs
+///
+/// \warning Error conditions that will cause projection to fail:
+/// - Attempting to explicitly project List or Map types (returns
InvalidArgument)
+/// - Projecting a List when element result is null (returns InvalidArgument)
+/// - Projecting a Map without a defined map value type (returns
InvalidArgument)
+/// - Projecting a struct when result is not StructType (returns
InvalidArgument)
+class PruneColumnVisitor {
+ public:
+ PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
+ bool select_full_types)
+ : selected_ids_(selected_ids), select_full_types_(select_full_types) {}
+
+ Status Visit(const StructType& type, std::unique_ptr<Type>* out) const {
+ std::vector<std::shared_ptr<const Type>> selected_types;
+ for (const auto& field : type.fields()) {
+ std::unique_ptr<Type> child_result;
+ if (select_full_types_ and selected_ids_.contains(field.field_id())) {
+ selected_types.emplace_back(field.type());
+ continue;
+ }
+ ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*field.type(), this,
&child_result));
+ if (selected_ids_.contains(field.field_id())) {
+ // select
+ if (field.type()->type_id() == TypeId::kStruct) {
+ // project(kstruct)
+ ICEBERG_RETURN_UNEXPECTED(ProjectStruct(field, &child_result));
+ selected_types.emplace_back(std::move(child_result));
+ } else {
+ // project(list, map, primitive)
+ if (!field.type()->is_primitive()) {
+ return InvalidArgument(
+ "Cannot explicitly project List or Map types, {}:{} of type {}
was "
+ "selected",
+ field.field_id(), field.name(), field.type()->ToString());
+ }
+ selected_types.emplace_back(field.type());
+ }
+ } else if (child_result) {
+ // project, select
+ selected_types.emplace_back(std::move(child_result));
+ } else {
+ // project, select
+ selected_types.emplace_back(nullptr);
+ }
+ }
+
+ bool same_types = true;
+ std::vector<SchemaField> selected_fields;
+ const auto& fields = type.fields();
+ for (size_t i = 0; i < fields.size(); i++) {
+ if (fields[i].type() == selected_types[i]) {
+ selected_fields.emplace_back(fields[i]);
+ } else if (selected_types[i]) {
+ same_types = false;
+ selected_fields.emplace_back(fields[i].field_id(),
std::string(fields[i].name()),
+
std::const_pointer_cast<Type>(selected_types[i]),
+ fields[i].optional(),
std::string(fields[i].doc()));
+ }
+ }
+
+ if (!selected_fields.empty()) {
+ if (same_types && selected_fields.size() == fields.size()) {
+ *out = std::make_unique<StructType>(type);
+ } else {
+ *out = std::make_unique<StructType>(std::move(selected_fields));
+ }
+ }
+
+ return {};
+ }
+
+ Status Visit(const ListType& type, std::unique_ptr<Type>* out) const {
+ const auto& element_field = type.fields()[0];
+
+ if (select_full_types_ and
selected_ids_.contains(element_field.field_id())) {
+ *out = std::make_unique<ListType>(type);
+ return {};
+ }
+
+ std::unique_ptr<Type> child_result;
+ ICEBERG_RETURN_UNEXPECTED(
+ VisitTypeInline(*element_field.type(), this, &child_result));
+
+ if (selected_ids_.contains(element_field.field_id())) {
+ if (element_field.type()->type_id() == TypeId::kStruct) {
+ ICEBERG_RETURN_UNEXPECTED(ProjectStruct(element_field, &child_result));
+ ICEBERG_RETURN_UNEXPECTED(
+ ProjectList(element_field, std::move(child_result), out));
+ } else {
+ if (!element_field.type()->is_primitive()) {
+ return InvalidArgument(
+ "Cannot explicitly project List or Map types, List element {} of
type {} "
+ "was "
+ "selected",
+ element_field.field_id(), element_field.name());
+ }
+ *out = std::make_unique<ListType>(element_field);
+ }
+ } else if (child_result) {
+ ICEBERG_RETURN_UNEXPECTED(ProjectList(element_field,
std::move(child_result), out));
+ }
+
+ return {};
+ }
+
+ Status Visit(const MapType& type, std::unique_ptr<Type>* out) const {
+ const auto& key_field = type.fields()[0];
+ const auto& value_field = type.fields()[1];
+
+ if (select_full_types_ and selected_ids_.contains(value_field.field_id()))
{
+ *out = std::make_unique<MapType>(type);
+ return {};
+ }
+
+ std::unique_ptr<Type> key_result;
+ ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*key_field.type(), this,
&key_result));
Review Comment:
Are you sure about this? In the Java impl, key type should be fully
projected.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]