Re: [PR] [feature](iceberg) Support reading Iceberg variant from Parquet [doris]

via GitHub Tue, 12 May 2026 13:15:06 -0700


github-actions[bot] commented on code in PR #63192:
URL: https://github.com/apache/doris/pull/63192#discussion_r3229478356



##########
be/src/format/table/hive/hive_parquet_nested_column_utils.cpp:
##########
@@ -18,20 +18,176 @@
 #include "format/table/hive/hive_parquet_nested_column_utils.h"
 
 #include <algorithm>
+#include <cctype>
 #include <memory>
 #include <set>
 #include <string>
+#include <string_view>
 #include <unordered_map>
 #include <vector>
 
 #include "format/parquet/schema_desc.h"
 #include "format/table/table_schema_change_helper.h"
 
 namespace doris {
+namespace {
+
+void add_column_id_range(const FieldSchema& field_schema, std::set<uint64_t>& 
column_ids) {
+    const uint64_t start_id = field_schema.get_column_id();
+    const uint64_t max_column_id = field_schema.get_max_column_id();
+    for (uint64_t id = start_id; id <= max_column_id; ++id) {
+        column_ids.insert(id);
+    }
+}
+
+const FieldSchema* find_child_by_name(const FieldSchema& field_schema, 
std::string_view name) {
+    std::string lower_name(name);
+    std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
+                   [](unsigned char c) { return 
static_cast<char>(std::tolower(c)); });
+    for (const auto& child : field_schema.children) {
+        if (child.name == name || child.lower_case_name == lower_name) {
+            return &child;
+        }
+    }
+    return nullptr;
+}
+
+void add_variant_metadata(const FieldSchema& variant_field, 
std::set<uint64_t>& column_ids) {
+    if (const auto* metadata = find_child_by_name(variant_field, "metadata")) {
+        add_column_id_range(*metadata, column_ids);
+    }
+}
+
+void add_variant_value(const FieldSchema& variant_field, std::set<uint64_t>& 
column_ids) {
+    add_variant_metadata(variant_field, column_ids);
+    if (const auto* value = find_child_by_name(variant_field, "value")) {
+        add_column_id_range(*value, column_ids);
+    }
+}
+
+bool extract_nested_column_ids_by_name(const FieldSchema& field_schema,
+                                       const 
std::vector<std::vector<std::string>>& paths,
+                                       std::set<uint64_t>& column_ids);
+
+bool extract_variant_nested_column_ids(const FieldSchema& variant_field,
+                                       const 
std::vector<std::vector<std::string>>& paths,
+                                       std::set<uint64_t>& column_ids) {
+    const auto* typed_value = find_child_by_name(variant_field, "typed_value");
+    bool has_child_columns = false;
+
+    for (const auto& path : paths) {
+        if (path.empty()) {
+            add_column_id_range(variant_field, column_ids);
+            has_child_columns = true;
+            continue;
+        }
+
+        add_variant_metadata(variant_field, column_ids);
+        bool found_typed_path = false;
+        if (typed_value != nullptr) {
+            if (const auto* typed_child = find_child_by_name(*typed_value, 
path[0])) {
+                if (path.size() == 1) {
+                    add_column_id_range(*typed_child, column_ids);
+                    found_typed_path = true;
+                } else {
+                    std::vector<std::vector<std::string>> child_paths {
+                            std::vector<std::string>(path.begin() + 1, 
path.end())};
+                    found_typed_path = 
extract_nested_column_ids_by_name(*typed_child, child_paths,

Review Comment:
   This handles `v['metric']`, but deeper accesses under a shredded field still 
prune the wrong residual. For a schema like `typed_value.metric { value, 
typed_value int64 }`, the access path for `v['metric']['x']` reaches this 
branch with `typed_child == metric` and `path.size() > 1`. `metric` is parsed 
as a regular struct, not `TYPE_VARIANT`, so 
`extract_nested_column_ids_by_name(*typed_child, {['x']}, ...)` finds no child 
named `x` and returns false. The code then falls back to top-level `v.value`, 
even though the Parquet VARIANT shredding spec stores non-int/object residuals 
for `metric` in `v.typed_value.metric.value` and forbids duplicating shredded 
field keys in top-level `v.value`. That makes deeper projections/filters 
silently return missing/null for rows where `metric` is an object stored in the 
field-level residual. Please treat these unannotated shredded field groups 
(`value`/`typed_value`) as variant-like during recursive pruning, and add a 
regression that projects a subp
 ath from a shredded field whose residual is stored in 
`typed_value.<field>.value`.



##########
be/src/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp:
##########
@@ -18,21 +18,177 @@
 #include "format/table/iceberg/iceberg_parquet_nested_column_utils.h"
 
 #include <algorithm>
+#include <cctype>
 #include <iostream>
 #include <memory>
 #include <set>
 #include <string>
+#include <string_view>
 #include <unordered_map>
 #include <vector>
 
 #include "format/parquet/schema_desc.h"
 #include "format/table/table_schema_change_helper.h"
 
 namespace doris {
+namespace {
+
+void add_column_id_range(const FieldSchema& field_schema, std::set<uint64_t>& 
column_ids) {
+    const uint64_t start_id = field_schema.get_column_id();
+    const uint64_t max_column_id = field_schema.get_max_column_id();
+    for (uint64_t id = start_id; id <= max_column_id; ++id) {
+        column_ids.insert(id);
+    }
+}
+
+const FieldSchema* find_child_by_name(const FieldSchema& field_schema, 
std::string_view name) {
+    std::string lower_name(name);
+    std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
+                   [](unsigned char c) { return 
static_cast<char>(std::tolower(c)); });
+    for (const auto& child : field_schema.children) {
+        if (child.name == name || child.lower_case_name == lower_name) {
+            return &child;
+        }
+    }
+    return nullptr;
+}
+
+void add_variant_metadata(const FieldSchema& variant_field, 
std::set<uint64_t>& column_ids) {
+    if (const auto* metadata = find_child_by_name(variant_field, "metadata")) {
+        add_column_id_range(*metadata, column_ids);
+    }
+}
+
+void add_variant_value(const FieldSchema& variant_field, std::set<uint64_t>& 
column_ids) {
+    add_variant_metadata(variant_field, column_ids);
+    if (const auto* value = find_child_by_name(variant_field, "value")) {
+        add_column_id_range(*value, column_ids);
+    }
+}
+
+bool extract_nested_column_ids_by_name(const FieldSchema& field_schema,
+                                       const 
std::vector<std::vector<std::string>>& paths,
+                                       std::set<uint64_t>& column_ids);
+
+bool extract_variant_nested_column_ids(const FieldSchema& variant_field,
+                                       const 
std::vector<std::vector<std::string>>& paths,
+                                       std::set<uint64_t>& column_ids) {
+    const auto* typed_value = find_child_by_name(variant_field, "typed_value");
+    bool has_child_columns = false;
+
+    for (const auto& path : paths) {
+        if (path.empty()) {
+            add_column_id_range(variant_field, column_ids);
+            has_child_columns = true;
+            continue;
+        }
+
+        add_variant_metadata(variant_field, column_ids);
+        bool found_typed_path = false;
+        if (typed_value != nullptr) {
+            if (const auto* typed_child = find_child_by_name(*typed_value, 
path[0])) {
+                if (path.size() == 1) {
+                    add_column_id_range(*typed_child, column_ids);
+                    found_typed_path = true;
+                } else {
+                    std::vector<std::vector<std::string>> child_paths {
+                            std::vector<std::string>(path.begin() + 1, 
path.end())};
+                    found_typed_path = 
extract_nested_column_ids_by_name(*typed_child, child_paths,

Review Comment:
   Same distinct deeper-path issue as the Hive/local helper, separate from the 
already-discussed top-level residual question. When Iceberg pruning sees an 
access like `v['metric']['x']` and `metric` is a shredded field group with 
`value` plus `typed_value`, the recursive call treats `metric` as a plain 
struct and returns false because there is no child named `x`. The fallback then 
selects top-level `v.value`, but Iceberg/Parquet shredded field residuals for 
`metric` live in `v.typed_value.metric.value`; top-level `v.value` must not 
contain that shredded key. Iceberg scans can therefore lose residual object 
values for deeper projections or filters under a shredded field. Please make 
the recursive pruning recognize the unannotated `value`/`typed_value` 
shredded-field layout and cover this with an Iceberg/local regression case.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [feature](iceberg) Support reading Iceberg variant from Parquet [doris]

Reply via email to