github-actions[bot] commented on code in PR #63192:
URL: https://github.com/apache/doris/pull/63192#discussion_r3229478356
##########
be/src/format/table/hive/hive_parquet_nested_column_utils.cpp:
##########
@@ -18,20 +18,176 @@
#include "format/table/hive/hive_parquet_nested_column_utils.h"
#include <algorithm>
+#include <cctype>
#include <memory>
#include <set>
#include <string>
+#include <string_view>
#include <unordered_map>
#include <vector>
#include "format/parquet/schema_desc.h"
#include "format/table/table_schema_change_helper.h"
namespace doris {
+namespace {
+
+void add_column_id_range(const FieldSchema& field_schema, std::set<uint64_t>&
column_ids) {
+ const uint64_t start_id = field_schema.get_column_id();
+ const uint64_t max_column_id = field_schema.get_max_column_id();
+ for (uint64_t id = start_id; id <= max_column_id; ++id) {
+ column_ids.insert(id);
+ }
+}
+
+const FieldSchema* find_child_by_name(const FieldSchema& field_schema,
std::string_view name) {
+ std::string lower_name(name);
+ std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
+ [](unsigned char c) { return
static_cast<char>(std::tolower(c)); });
+ for (const auto& child : field_schema.children) {
+ if (child.name == name || child.lower_case_name == lower_name) {
+ return &child;
+ }
+ }
+ return nullptr;
+}
+
+void add_variant_metadata(const FieldSchema& variant_field,
std::set<uint64_t>& column_ids) {
+ if (const auto* metadata = find_child_by_name(variant_field, "metadata")) {
+ add_column_id_range(*metadata, column_ids);
+ }
+}
+
+void add_variant_value(const FieldSchema& variant_field, std::set<uint64_t>&
column_ids) {
+ add_variant_metadata(variant_field, column_ids);
+ if (const auto* value = find_child_by_name(variant_field, "value")) {
+ add_column_id_range(*value, column_ids);
+ }
+}
+
+bool extract_nested_column_ids_by_name(const FieldSchema& field_schema,
+ const
std::vector<std::vector<std::string>>& paths,
+ std::set<uint64_t>& column_ids);
+
+bool extract_variant_nested_column_ids(const FieldSchema& variant_field,
+ const
std::vector<std::vector<std::string>>& paths,
+ std::set<uint64_t>& column_ids) {
+ const auto* typed_value = find_child_by_name(variant_field, "typed_value");
+ bool has_child_columns = false;
+
+ for (const auto& path : paths) {
+ if (path.empty()) {
+ add_column_id_range(variant_field, column_ids);
+ has_child_columns = true;
+ continue;
+ }
+
+ add_variant_metadata(variant_field, column_ids);
+ bool found_typed_path = false;
+ if (typed_value != nullptr) {
+ if (const auto* typed_child = find_child_by_name(*typed_value,
path[0])) {
+ if (path.size() == 1) {
+ add_column_id_range(*typed_child, column_ids);
+ found_typed_path = true;
+ } else {
+ std::vector<std::vector<std::string>> child_paths {
+ std::vector<std::string>(path.begin() + 1,
path.end())};
+ found_typed_path =
extract_nested_column_ids_by_name(*typed_child, child_paths,
Review Comment:
This handles `v['metric']`, but deeper accesses under a shredded field still
prune the wrong residual. For a schema like `typed_value.metric { value,
typed_value int64 }`, the access path for `v['metric']['x']` reaches this
branch with `typed_child == metric` and `path.size() > 1`. `metric` is parsed
as a regular struct, not `TYPE_VARIANT`, so
`extract_nested_column_ids_by_name(*typed_child, {['x']}, ...)` finds no child
named `x` and returns false. The code then falls back to top-level `v.value`,
even though the Parquet VARIANT shredding spec stores non-int/object residuals
for `metric` in `v.typed_value.metric.value` and forbids duplicating shredded
field keys in top-level `v.value`. That makes deeper projections/filters
silently return missing/null for rows where `metric` is an object stored in the
field-level residual. Please treat these unannotated shredded field groups
(`value`/`typed_value`) as variant-like during recursive pruning, and add a
regression that projects a subp
ath from a shredded field whose residual is stored in
`typed_value.<field>.value`.
##########
be/src/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp:
##########
@@ -18,21 +18,177 @@
#include "format/table/iceberg/iceberg_parquet_nested_column_utils.h"
#include <algorithm>
+#include <cctype>
#include <iostream>
#include <memory>
#include <set>
#include <string>
+#include <string_view>
#include <unordered_map>
#include <vector>
#include "format/parquet/schema_desc.h"
#include "format/table/table_schema_change_helper.h"
namespace doris {
+namespace {
+
+void add_column_id_range(const FieldSchema& field_schema, std::set<uint64_t>&
column_ids) {
+ const uint64_t start_id = field_schema.get_column_id();
+ const uint64_t max_column_id = field_schema.get_max_column_id();
+ for (uint64_t id = start_id; id <= max_column_id; ++id) {
+ column_ids.insert(id);
+ }
+}
+
+const FieldSchema* find_child_by_name(const FieldSchema& field_schema,
std::string_view name) {
+ std::string lower_name(name);
+ std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
+ [](unsigned char c) { return
static_cast<char>(std::tolower(c)); });
+ for (const auto& child : field_schema.children) {
+ if (child.name == name || child.lower_case_name == lower_name) {
+ return &child;
+ }
+ }
+ return nullptr;
+}
+
+void add_variant_metadata(const FieldSchema& variant_field,
std::set<uint64_t>& column_ids) {
+ if (const auto* metadata = find_child_by_name(variant_field, "metadata")) {
+ add_column_id_range(*metadata, column_ids);
+ }
+}
+
+void add_variant_value(const FieldSchema& variant_field, std::set<uint64_t>&
column_ids) {
+ add_variant_metadata(variant_field, column_ids);
+ if (const auto* value = find_child_by_name(variant_field, "value")) {
+ add_column_id_range(*value, column_ids);
+ }
+}
+
+bool extract_nested_column_ids_by_name(const FieldSchema& field_schema,
+ const
std::vector<std::vector<std::string>>& paths,
+ std::set<uint64_t>& column_ids);
+
+bool extract_variant_nested_column_ids(const FieldSchema& variant_field,
+ const
std::vector<std::vector<std::string>>& paths,
+ std::set<uint64_t>& column_ids) {
+ const auto* typed_value = find_child_by_name(variant_field, "typed_value");
+ bool has_child_columns = false;
+
+ for (const auto& path : paths) {
+ if (path.empty()) {
+ add_column_id_range(variant_field, column_ids);
+ has_child_columns = true;
+ continue;
+ }
+
+ add_variant_metadata(variant_field, column_ids);
+ bool found_typed_path = false;
+ if (typed_value != nullptr) {
+ if (const auto* typed_child = find_child_by_name(*typed_value,
path[0])) {
+ if (path.size() == 1) {
+ add_column_id_range(*typed_child, column_ids);
+ found_typed_path = true;
+ } else {
+ std::vector<std::vector<std::string>> child_paths {
+ std::vector<std::string>(path.begin() + 1,
path.end())};
+ found_typed_path =
extract_nested_column_ids_by_name(*typed_child, child_paths,
Review Comment:
Same distinct deeper-path issue as the Hive/local helper, separate from the
already-discussed top-level residual question. When Iceberg pruning sees an
access like `v['metric']['x']` and `metric` is a shredded field group with
`value` plus `typed_value`, the recursive call treats `metric` as a plain
struct and returns false because there is no child named `x`. The fallback then
selects top-level `v.value`, but Iceberg/Parquet shredded field residuals for
`metric` live in `v.typed_value.metric.value`; top-level `v.value` must not
contain that shredded key. Iceberg scans can therefore lose residual object
values for deeper projections or filters under a shredded field. Please make
the recursive pruning recognize the unannotated `value`/`typed_value`
shredded-field layout and cover this with an Iceberg/local regression case.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]