wgtmac commented on code in PR #207:
URL: https://github.com/apache/iceberg-cpp/pull/207#discussion_r2320851050
##########
src/iceberg/schema.h:
##########
@@ -41,7 +42,8 @@ namespace iceberg {
/// A schema is a list of typed columns, along with a unique integer ID. A
/// Table may have different schemas over its lifetime due to schema
/// evolution.
-class ICEBERG_EXPORT Schema : public StructType {
+class ICEBERG_EXPORT Schema : public StructType,
Review Comment:
We shouldn't inherit enable_shared_from_this, otherwise it greatly limits
its usage.
##########
src/iceberg/schema.h:
##########
@@ -65,19 +67,41 @@ class ICEBERG_EXPORT Schema : public StructType {
/// canonical name 'm.value.x'
/// FIXME: Currently only handles ASCII lowercase conversion; extend to
support
/// non-ASCII characters (e.g., using std::towlower or ICU)
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldByName(std::string_view name, bool case_sensitive = true) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldByName(
+ std::string_view name, bool case_sensitive = true) const;
/// \brief Find the SchemaField by field id.
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldById(int32_t field_id) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldById(
+ int32_t field_id) const;
+
+ /// \brief Creates a projection schema for a subset of columns, selected by
name.
+ Result<std::shared_ptr<const Schema>> select(const std::vector<std::string>&
names,
+ bool case_sensitive = true)
const;
+
+ /// \brief Creates a projection schema for a subset of columns, selected by
name.
+ Result<std::shared_ptr<const Schema>> select(
+ const std::initializer_list<std::string>& names, bool case_sensitive =
true) const;
+
+ /// \brief Creates a projection schema for a subset of columns, selected by
name.
+ template <typename... Args>
+ Result<std::shared_ptr<const Schema>> select(Args&&... names,
+ bool case_sensitive = true)
const {
+ static_assert((std::is_convertible_v<Args, std::string> && ...),
+ "All arguments must be convertible to std::string");
+ return select({std::string(names)...}, case_sensitive);
+ }
+
+ Result<std::shared_ptr<const Schema>> project(std::unordered_set<int32_t>&
ids) const;
friend bool operator==(const Schema& lhs, const Schema& rhs) { return
lhs.Equals(rhs); }
private:
/// \brief Compare two schemas for equality.
[[nodiscard]] bool Equals(const Schema& other) const;
+ Result<std::shared_ptr<const Schema>> internalSelect(
Review Comment:
```suggestion
Result<std::shared_ptr<const Schema>> SelectInternal(
```
##########
src/iceberg/schema.h:
##########
@@ -65,19 +67,41 @@ class ICEBERG_EXPORT Schema : public StructType {
/// canonical name 'm.value.x'
/// FIXME: Currently only handles ASCII lowercase conversion; extend to
support
/// non-ASCII characters (e.g., using std::towlower or ICU)
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldByName(std::string_view name, bool case_sensitive = true) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldByName(
+ std::string_view name, bool case_sensitive = true) const;
/// \brief Find the SchemaField by field id.
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldById(int32_t field_id) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldById(
+ int32_t field_id) const;
+
+ /// \brief Creates a projection schema for a subset of columns, selected by
name.
+ Result<std::shared_ptr<const Schema>> select(const std::vector<std::string>&
names,
Review Comment:
Function names should be capitalized unless it is a trivial getter. So
please rename `select` to `Select`.
BTW, why the return value is a `shared_ptr` not `unique_ptr`? And why it is
`const`? I'd suggest to return `Result<std::unique_ptr<Schema>>` so users can
accept it as both unique_ptr and shared_ptr.
##########
src/iceberg/schema.h:
##########
@@ -65,19 +67,41 @@ class ICEBERG_EXPORT Schema : public StructType {
/// canonical name 'm.value.x'
/// FIXME: Currently only handles ASCII lowercase conversion; extend to
support
/// non-ASCII characters (e.g., using std::towlower or ICU)
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldByName(std::string_view name, bool case_sensitive = true) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldByName(
+ std::string_view name, bool case_sensitive = true) const;
/// \brief Find the SchemaField by field id.
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldById(int32_t field_id) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldById(
+ int32_t field_id) const;
+
+ /// \brief Creates a projection schema for a subset of columns, selected by
name.
Review Comment:
```suggestion
/// \brief Creates a projected schema from selected field names.
```
BTW, it would be clear about the distinction between `Select` and `Project`
functions and what are the requirements for the `names` parameter. For example,
does it support selecting fields in a nested field?
##########
src/iceberg/schema.cc:
##########
@@ -257,4 +257,259 @@ void NameToIdVisitor::Finish() {
}
}
+class PruneColumnVisitor {
+ public:
+ explicit PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
+ bool select_full_types = false);
Review Comment:
Can we avoid default parameter as much as possible?
##########
src/iceberg/schema.cc:
##########
@@ -257,4 +257,260 @@ void NameToIdVisitor::Finish() {
}
}
+class PruneColumnVisitor {
+ public:
+ explicit PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
+ bool select_full_types = false);
+ Status Visit(const ListType& type, std::shared_ptr<const Type>& result);
Review Comment:
```suggestion
Status Visit(const ListType& type, std::shared_ptr<Type>* result);
```
For output parameter, use pointer instead of reference for better
readability per the google coding style:
https://google.github.io/styleguide/cppguide.html#Inputs_and_Outputs
##########
src/iceberg/schema.cc:
##########
@@ -257,4 +257,260 @@ void NameToIdVisitor::Finish() {
}
}
+class PruneColumnVisitor {
+ public:
+ explicit PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
+ bool select_full_types = false);
+ Status Visit(const ListType& type, std::shared_ptr<const Type>& result);
+ Status Visit(const MapType& type, std::shared_ptr<const Type>& result);
+ Status Visit(const StructType& type, std::shared_ptr<const Type>& result);
+ Status Visit(const PrimitiveType& type, std::shared_ptr<const Type>& result);
+ Status ProjectList(const SchemaField& element,
Review Comment:
Why `ProjectSelectedStruct` from Java impl is missing here?
##########
src/iceberg/schema.cc:
##########
@@ -257,4 +257,260 @@ void NameToIdVisitor::Finish() {
}
}
+class PruneColumnVisitor {
+ public:
+ explicit PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
+ bool select_full_types = false);
+ Status Visit(const ListType& type, std::shared_ptr<const Type>& result);
+ Status Visit(const MapType& type, std::shared_ptr<const Type>& result);
+ Status Visit(const StructType& type, std::shared_ptr<const Type>& result);
+ Status Visit(const PrimitiveType& type, std::shared_ptr<const Type>& result);
+ Status ProjectList(const SchemaField& element,
+ std::shared_ptr<const Type>& child_result,
+ std::shared_ptr<const Type>& result);
+ Status ProjectMap(const SchemaField& key_field, const SchemaField&
value_field,
+ std::shared_ptr<const Type>& value_result,
+ std::shared_ptr<const Type>& result);
+
+ private:
+ const std::unordered_set<int32_t>& selected_ids_;
+ bool select_full_types_;
+};
+
+Result<std::shared_ptr<const Schema>> Schema::select(
+ const std::vector<std::string>& names, bool case_sensitive) const {
+ return internalSelect(names, case_sensitive);
+}
+
+Result<std::shared_ptr<const Schema>> Schema::select(
+ const std::initializer_list<std::string>& names, bool case_sensitive)
const {
+ return internalSelect(std::vector<std::string>(names), case_sensitive);
+}
+
+Result<std::shared_ptr<const Schema>> Schema::internalSelect(
+ const std::vector<std::string>& names, bool case_sensitive) const {
+ const std::string ALL_COLUMNS = "*";
+ if (std::ranges::find(names, ALL_COLUMNS) != names.end()) {
+ return shared_from_this();
+ }
+
+ std::unordered_set<int32_t> selected_ids;
+ for (const auto& name : names) {
+ ICEBERG_ASSIGN_OR_RAISE(auto result, FindFieldByName(name,
case_sensitive));
+ if (result.has_value()) {
+ selected_ids.insert(result.value().get().field_id());
+ }
+ }
+
+ std::shared_ptr<const Type> result;
+ PruneColumnVisitor visitor(selected_ids, /*select_full_types=*/true);
+ ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, result));
+
+ if (!result) {
+ return std::make_shared<Schema>(std::vector<SchemaField>{}, schema_id_);
+ }
+
+ if (result->type_id() != TypeId::kStruct) {
+ return InvalidSchema("Projected type must be a struct type");
+ }
+
+ const auto& projected_struct = internal::checked_cast<const
StructType&>(*result);
+
+ std::vector<SchemaField> fields_vec(projected_struct.fields().begin(),
+ projected_struct.fields().end());
+ return std::make_shared<Schema>(std::move(fields_vec), schema_id_);
+}
+
+Result<std::shared_ptr<const Schema>> Schema::project(
+ std::unordered_set<int32_t>& selected_ids) const {
+ PruneColumnVisitor visitor(selected_ids, /*select_full_types=*/false);
+
+ std::shared_ptr<const Type> result;
+ ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, result));
+
+ if (!result) {
+ return std::make_shared<Schema>(std::vector<SchemaField>{}, schema_id_);
+ }
+
+ if (result->type_id() != TypeId::kStruct) {
+ return InvalidSchema("Projected type must be a struct type");
+ }
+
+ const auto& projected_struct = internal::checked_cast<const
StructType&>(*result);
+ std::vector<SchemaField> fields_vec(projected_struct.fields().begin(),
+ projected_struct.fields().end());
+ return std::make_shared<Schema>(std::move(fields_vec), schema_id_);
+}
+
+PruneColumnVisitor::PruneColumnVisitor(const std::unordered_set<int32_t>&
selected_ids,
+ bool select_full_types)
+ : selected_ids_(selected_ids), select_full_types_(select_full_types) {}
+
+Status PruneColumnVisitor::Visit(const StructType& type,
+ std::shared_ptr<const Type>& result) {
+ std::vector<std::shared_ptr<const Type>> selected_types;
+ const auto& fields = type.fields();
+ for (const auto& field : fields) {
Review Comment:
```suggestion
for (const auto& field : type.fields()) {
```
Let's make it shorter.
##########
src/iceberg/schema.h:
##########
@@ -65,19 +67,41 @@ class ICEBERG_EXPORT Schema : public StructType {
/// canonical name 'm.value.x'
/// FIXME: Currently only handles ASCII lowercase conversion; extend to
support
/// non-ASCII characters (e.g., using std::towlower or ICU)
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldByName(std::string_view name, bool case_sensitive = true) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldByName(
+ std::string_view name, bool case_sensitive = true) const;
/// \brief Find the SchemaField by field id.
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldById(int32_t field_id) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldById(
+ int32_t field_id) const;
+
+ /// \brief Creates a projection schema for a subset of columns, selected by
name.
+ Result<std::shared_ptr<const Schema>> select(const std::vector<std::string>&
names,
Review Comment:
Should we use `std::span<const std::string>` instead of `const
std::vector<std::string>&` to accept more std container types? It can support
both vector and initializer_list.
##########
src/iceberg/schema.cc:
##########
@@ -257,4 +257,260 @@ void NameToIdVisitor::Finish() {
}
}
+class PruneColumnVisitor {
+ public:
+ explicit PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
+ bool select_full_types = false);
+ Status Visit(const ListType& type, std::shared_ptr<const Type>& result);
Review Comment:
BTW, I prefer to use `out` to replace `result` because it may confuse
readers with the `Result` type.
##########
src/iceberg/schema.h:
##########
@@ -65,19 +67,41 @@ class ICEBERG_EXPORT Schema : public StructType {
/// canonical name 'm.value.x'
/// FIXME: Currently only handles ASCII lowercase conversion; extend to
support
/// non-ASCII characters (e.g., using std::towlower or ICU)
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldByName(std::string_view name, bool case_sensitive = true) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldByName(
+ std::string_view name, bool case_sensitive = true) const;
/// \brief Find the SchemaField by field id.
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldById(int32_t field_id) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldById(
+ int32_t field_id) const;
+
+ /// \brief Creates a projection schema for a subset of columns, selected by
name.
+ Result<std::shared_ptr<const Schema>> select(const std::vector<std::string>&
names,
+ bool case_sensitive = true)
const;
+
+ /// \brief Creates a projection schema for a subset of columns, selected by
name.
+ Result<std::shared_ptr<const Schema>> select(
+ const std::initializer_list<std::string>& names, bool case_sensitive =
true) const;
Review Comment:
Usually `initializer_list` is used to create a container type and more
likely be used in the testing. But I'm fine to keep it.
##########
src/iceberg/schema.cc:
##########
@@ -257,4 +257,259 @@ void NameToIdVisitor::Finish() {
}
}
+class PruneColumnVisitor {
Review Comment:
Why not directly put the function definitions in the class? We don't have to
split class declaration and definition if this class is not declared in a
header file. If you really want to split them, at least those function
definitions should be right after this class.
##########
src/iceberg/schema.h:
##########
@@ -65,19 +67,41 @@ class ICEBERG_EXPORT Schema : public StructType {
/// canonical name 'm.value.x'
/// FIXME: Currently only handles ASCII lowercase conversion; extend to
support
/// non-ASCII characters (e.g., using std::towlower or ICU)
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldByName(std::string_view name, bool case_sensitive = true) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldByName(
+ std::string_view name, bool case_sensitive = true) const;
/// \brief Find the SchemaField by field id.
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldById(int32_t field_id) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldById(
+ int32_t field_id) const;
+
+ /// \brief Creates a projection schema for a subset of columns, selected by
name.
+ Result<std::shared_ptr<const Schema>> select(const std::vector<std::string>&
names,
+ bool case_sensitive = true)
const;
+
+ /// \brief Creates a projection schema for a subset of columns, selected by
name.
+ Result<std::shared_ptr<const Schema>> select(
+ const std::initializer_list<std::string>& names, bool case_sensitive =
true) const;
+
+ /// \brief Creates a projection schema for a subset of columns, selected by
name.
+ template <typename... Args>
+ Result<std::shared_ptr<const Schema>> select(Args&&... names,
+ bool case_sensitive = true)
const {
+ static_assert((std::is_convertible_v<Args, std::string> && ...),
+ "All arguments must be convertible to std::string");
+ return select({std::string(names)...}, case_sensitive);
+ }
+
+ Result<std::shared_ptr<const Schema>> project(std::unordered_set<int32_t>&
ids) const;
Review Comment:
Please add comment for it and rename `ids` to `field_ids` to be clear. Also
the comment should explain what to expect if a non-leaf id has been specified.
##########
src/iceberg/schema.cc:
##########
@@ -257,4 +257,260 @@ void NameToIdVisitor::Finish() {
}
}
+class PruneColumnVisitor {
+ public:
+ explicit PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
+ bool select_full_types = false);
+ Status Visit(const ListType& type, std::shared_ptr<const Type>& result);
+ Status Visit(const MapType& type, std::shared_ptr<const Type>& result);
+ Status Visit(const StructType& type, std::shared_ptr<const Type>& result);
+ Status Visit(const PrimitiveType& type, std::shared_ptr<const Type>& result);
+ Status ProjectList(const SchemaField& element,
+ std::shared_ptr<const Type>& child_result,
+ std::shared_ptr<const Type>& result);
+ Status ProjectMap(const SchemaField& key_field, const SchemaField&
value_field,
+ std::shared_ptr<const Type>& value_result,
+ std::shared_ptr<const Type>& result);
+
+ private:
+ const std::unordered_set<int32_t>& selected_ids_;
+ bool select_full_types_;
+};
+
+Result<std::shared_ptr<const Schema>> Schema::select(
+ const std::vector<std::string>& names, bool case_sensitive) const {
+ return internalSelect(names, case_sensitive);
+}
+
+Result<std::shared_ptr<const Schema>> Schema::select(
+ const std::initializer_list<std::string>& names, bool case_sensitive)
const {
+ return internalSelect(std::vector<std::string>(names), case_sensitive);
+}
+
+Result<std::shared_ptr<const Schema>> Schema::internalSelect(
+ const std::vector<std::string>& names, bool case_sensitive) const {
+ const std::string ALL_COLUMNS = "*";
+ if (std::ranges::find(names, ALL_COLUMNS) != names.end()) {
+ return shared_from_this();
Review Comment:
It is not worthy to use `shared_from_this` only for this case. In this case
we can just return a full copy of the current schema.
##########
src/iceberg/schema.h:
##########
@@ -65,19 +67,41 @@ class ICEBERG_EXPORT Schema : public StructType {
/// canonical name 'm.value.x'
/// FIXME: Currently only handles ASCII lowercase conversion; extend to
support
/// non-ASCII characters (e.g., using std::towlower or ICU)
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldByName(std::string_view name, bool case_sensitive = true) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldByName(
+ std::string_view name, bool case_sensitive = true) const;
/// \brief Find the SchemaField by field id.
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldById(int32_t field_id) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldById(
+ int32_t field_id) const;
+
+ /// \brief Creates a projection schema for a subset of columns, selected by
name.
+ Result<std::shared_ptr<const Schema>> select(const std::vector<std::string>&
names,
+ bool case_sensitive = true)
const;
+
+ /// \brief Creates a projection schema for a subset of columns, selected by
name.
+ Result<std::shared_ptr<const Schema>> select(
+ const std::initializer_list<std::string>& names, bool case_sensitive =
true) const;
+
+ /// \brief Creates a projection schema for a subset of columns, selected by
name.
+ template <typename... Args>
+ Result<std::shared_ptr<const Schema>> select(Args&&... names,
+ bool case_sensitive = true)
const {
+ static_assert((std::is_convertible_v<Args, std::string> && ...),
Review Comment:
Shouldn't we accept `std::string_view` as well?
##########
src/iceberg/schema.cc:
##########
@@ -257,4 +257,259 @@ void NameToIdVisitor::Finish() {
}
}
+class PruneColumnVisitor {
+ public:
+ explicit PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
+ bool select_full_types = false);
+ Status Visit(const ListType& type);
+ Status Visit(const MapType& type);
+ Status Visit(const StructType& type);
+ Status Visit(const PrimitiveType& type);
+ std::shared_ptr<const Type> GetResult() const;
+ void SetResult(std::shared_ptr<const Type> result);
+ Status ProjectList(const SchemaField& element,
+ std::shared_ptr<const Type> element_result);
+ Status ProjectMap(const SchemaField& key_field, const SchemaField&
value_field,
+ std::shared_ptr<const Type> value_result);
+
+ private:
+ const std::unordered_set<int32_t>& selected_ids_;
+ bool select_full_types_;
Review Comment:
Can you add comment to explain what is this?
##########
src/iceberg/schema.cc:
##########
@@ -257,4 +257,260 @@ void NameToIdVisitor::Finish() {
}
}
+class PruneColumnVisitor {
+ public:
+ explicit PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
+ bool select_full_types = false);
+ Status Visit(const ListType& type, std::shared_ptr<const Type>& result);
+ Status Visit(const MapType& type, std::shared_ptr<const Type>& result);
+ Status Visit(const StructType& type, std::shared_ptr<const Type>& result);
+ Status Visit(const PrimitiveType& type, std::shared_ptr<const Type>& result);
+ Status ProjectList(const SchemaField& element,
+ std::shared_ptr<const Type>& child_result,
+ std::shared_ptr<const Type>& result);
+ Status ProjectMap(const SchemaField& key_field, const SchemaField&
value_field,
+ std::shared_ptr<const Type>& value_result,
+ std::shared_ptr<const Type>& result);
+
+ private:
+ const std::unordered_set<int32_t>& selected_ids_;
+ bool select_full_types_;
+};
+
+Result<std::shared_ptr<const Schema>> Schema::select(
+ const std::vector<std::string>& names, bool case_sensitive) const {
+ return internalSelect(names, case_sensitive);
+}
+
+Result<std::shared_ptr<const Schema>> Schema::select(
+ const std::initializer_list<std::string>& names, bool case_sensitive)
const {
+ return internalSelect(std::vector<std::string>(names), case_sensitive);
+}
+
+Result<std::shared_ptr<const Schema>> Schema::internalSelect(
+ const std::vector<std::string>& names, bool case_sensitive) const {
+ const std::string ALL_COLUMNS = "*";
Review Comment:
```suggestion
const std::string kAllColumns = "*";
```
##########
src/iceberg/schema.h:
##########
@@ -65,19 +67,41 @@ class ICEBERG_EXPORT Schema : public StructType {
/// canonical name 'm.value.x'
/// FIXME: Currently only handles ASCII lowercase conversion; extend to
support
/// non-ASCII characters (e.g., using std::towlower or ICU)
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldByName(std::string_view name, bool case_sensitive = true) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldByName(
+ std::string_view name, bool case_sensitive = true) const;
/// \brief Find the SchemaField by field id.
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldById(int32_t field_id) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldById(
+ int32_t field_id) const;
+
+ /// \brief Creates a projection schema for a subset of columns, selected by
name.
+ Result<std::shared_ptr<const Schema>> select(const std::vector<std::string>&
names,
Review Comment:
Same suggestion applies to below functions.
##########
src/iceberg/schema.cc:
##########
@@ -257,4 +257,260 @@ void NameToIdVisitor::Finish() {
}
}
+class PruneColumnVisitor {
+ public:
+ explicit PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
+ bool select_full_types = false);
+ Status Visit(const ListType& type, std::shared_ptr<const Type>& result);
+ Status Visit(const MapType& type, std::shared_ptr<const Type>& result);
+ Status Visit(const StructType& type, std::shared_ptr<const Type>& result);
+ Status Visit(const PrimitiveType& type, std::shared_ptr<const Type>& result);
+ Status ProjectList(const SchemaField& element,
+ std::shared_ptr<const Type>& child_result,
+ std::shared_ptr<const Type>& result);
+ Status ProjectMap(const SchemaField& key_field, const SchemaField&
value_field,
+ std::shared_ptr<const Type>& value_result,
+ std::shared_ptr<const Type>& result);
+
+ private:
+ const std::unordered_set<int32_t>& selected_ids_;
+ bool select_full_types_;
+};
+
+Result<std::shared_ptr<const Schema>> Schema::select(
+ const std::vector<std::string>& names, bool case_sensitive) const {
+ return internalSelect(names, case_sensitive);
+}
+
+Result<std::shared_ptr<const Schema>> Schema::select(
+ const std::initializer_list<std::string>& names, bool case_sensitive)
const {
+ return internalSelect(std::vector<std::string>(names), case_sensitive);
+}
+
+Result<std::shared_ptr<const Schema>> Schema::internalSelect(
+ const std::vector<std::string>& names, bool case_sensitive) const {
+ const std::string ALL_COLUMNS = "*";
+ if (std::ranges::find(names, ALL_COLUMNS) != names.end()) {
+ return shared_from_this();
+ }
+
+ std::unordered_set<int32_t> selected_ids;
+ for (const auto& name : names) {
+ ICEBERG_ASSIGN_OR_RAISE(auto result, FindFieldByName(name,
case_sensitive));
+ if (result.has_value()) {
+ selected_ids.insert(result.value().get().field_id());
+ }
+ }
+
+ std::shared_ptr<const Type> result;
+ PruneColumnVisitor visitor(selected_ids, /*select_full_types=*/true);
+ ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, result));
+
+ if (!result) {
+ return std::make_shared<Schema>(std::vector<SchemaField>{}, schema_id_);
+ }
+
+ if (result->type_id() != TypeId::kStruct) {
+ return InvalidSchema("Projected type must be a struct type");
+ }
+
+ const auto& projected_struct = internal::checked_cast<const
StructType&>(*result);
Review Comment:
nit: use `FromStructType` function in the schema_internal.h to save some
lines.
##########
src/iceberg/schema.cc:
##########
@@ -257,4 +257,259 @@ void NameToIdVisitor::Finish() {
}
}
+class PruneColumnVisitor {
Review Comment:
Can we change the class definition to one of the two forms below?
```cpp
class PruneColumnVisitor {
public:
PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
bool select_full_types);
Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<ListType>& type)
const;
Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<MapType>& type)
const;
Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<StructType>&
type) const;
Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<PrimitiveType>&
type) const;
Result<std::unique_ptr<Schema>> Visit(const Schema& schema) const;
private:
const std::unordered_set<int32_t>& selected_ids_;
bool select_full_types_;
};
```
or
```cpp
class PruneColumnVisitor {
public:
PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
bool select_full_types);
Status Visit(const std::shared_ptr<ListType>& type, std::shared_ptr<Type>*
out) const;
Status Visit(const std::shared_ptr<MapType>& type, std::shared_ptr<Type>*
out) const;
Status Visit(const std::shared_ptr<StructType>& type,
std::shared_ptr<Type>* out) const;
Status Visit(const std::shared_ptr<PrimitiveType>& type,
std::shared_ptr<Type>* out) const;
Status Visit(const Schema& schema, std::unique_ptr<Schema>* out) const;
private:
const std::unordered_set<int32_t>& selected_ids_;
bool select_full_types_;
};
```
##########
src/iceberg/schema.cc:
##########
@@ -257,4 +257,260 @@ void NameToIdVisitor::Finish() {
}
}
+class PruneColumnVisitor {
+ public:
+ explicit PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
+ bool select_full_types = false);
+ Status Visit(const ListType& type, std::shared_ptr<const Type>& result);
+ Status Visit(const MapType& type, std::shared_ptr<const Type>& result);
+ Status Visit(const StructType& type, std::shared_ptr<const Type>& result);
+ Status Visit(const PrimitiveType& type, std::shared_ptr<const Type>& result);
+ Status ProjectList(const SchemaField& element,
+ std::shared_ptr<const Type>& child_result,
+ std::shared_ptr<const Type>& result);
+ Status ProjectMap(const SchemaField& key_field, const SchemaField&
value_field,
+ std::shared_ptr<const Type>& value_result,
+ std::shared_ptr<const Type>& result);
+
+ private:
+ const std::unordered_set<int32_t>& selected_ids_;
+ bool select_full_types_;
+};
+
+Result<std::shared_ptr<const Schema>> Schema::select(
+ const std::vector<std::string>& names, bool case_sensitive) const {
+ return internalSelect(names, case_sensitive);
+}
+
+Result<std::shared_ptr<const Schema>> Schema::select(
+ const std::initializer_list<std::string>& names, bool case_sensitive)
const {
+ return internalSelect(std::vector<std::string>(names), case_sensitive);
+}
+
+Result<std::shared_ptr<const Schema>> Schema::internalSelect(
+ const std::vector<std::string>& names, bool case_sensitive) const {
+ const std::string ALL_COLUMNS = "*";
+ if (std::ranges::find(names, ALL_COLUMNS) != names.end()) {
+ return shared_from_this();
+ }
+
+ std::unordered_set<int32_t> selected_ids;
+ for (const auto& name : names) {
+ ICEBERG_ASSIGN_OR_RAISE(auto result, FindFieldByName(name,
case_sensitive));
+ if (result.has_value()) {
+ selected_ids.insert(result.value().get().field_id());
+ }
+ }
+
+ std::shared_ptr<const Type> result;
+ PruneColumnVisitor visitor(selected_ids, /*select_full_types=*/true);
+ ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, result));
+
+ if (!result) {
+ return std::make_shared<Schema>(std::vector<SchemaField>{}, schema_id_);
+ }
+
+ if (result->type_id() != TypeId::kStruct) {
+ return InvalidSchema("Projected type must be a struct type");
+ }
+
+ const auto& projected_struct = internal::checked_cast<const
StructType&>(*result);
+
+ std::vector<SchemaField> fields_vec(projected_struct.fields().begin(),
+ projected_struct.fields().end());
+ return std::make_shared<Schema>(std::move(fields_vec), schema_id_);
+}
+
+Result<std::shared_ptr<const Schema>> Schema::project(
+ std::unordered_set<int32_t>& selected_ids) const {
+ PruneColumnVisitor visitor(selected_ids, /*select_full_types=*/false);
+
+ std::shared_ptr<const Type> result;
+ ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor, result));
+
+ if (!result) {
+ return std::make_shared<Schema>(std::vector<SchemaField>{}, schema_id_);
+ }
+
+ if (result->type_id() != TypeId::kStruct) {
+ return InvalidSchema("Projected type must be a struct type");
+ }
+
+ const auto& projected_struct = internal::checked_cast<const
StructType&>(*result);
+ std::vector<SchemaField> fields_vec(projected_struct.fields().begin(),
+ projected_struct.fields().end());
+ return std::make_shared<Schema>(std::move(fields_vec), schema_id_);
+}
+
+PruneColumnVisitor::PruneColumnVisitor(const std::unordered_set<int32_t>&
selected_ids,
+ bool select_full_types)
+ : selected_ids_(selected_ids), select_full_types_(select_full_types) {}
+
+Status PruneColumnVisitor::Visit(const StructType& type,
+ std::shared_ptr<const Type>& result) {
+ std::vector<std::shared_ptr<const Type>> selected_types;
+ const auto& fields = type.fields();
+ for (const auto& field : fields) {
+ std::shared_ptr<const Type> child_result;
+ ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*field.type(), this,
child_result));
+ if (selected_ids_.contains(field.field_id())) {
+ // select
+ if (select_full_types_) {
+ selected_types.emplace_back(field.type());
Review Comment:
In this case, line 355 is wasted.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]