github-actions[bot] commented on code in PR #63192:
URL: https://github.com/apache/doris/pull/63192#discussion_r3229304233


##########
be/src/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp:
##########
@@ -18,21 +18,173 @@
 #include "format/table/iceberg/iceberg_parquet_nested_column_utils.h"
 
 #include <algorithm>
+#include <cctype>
 #include <iostream>
 #include <memory>
 #include <set>
 #include <string>
+#include <string_view>
 #include <unordered_map>
 #include <vector>
 
 #include "format/parquet/schema_desc.h"
 #include "format/table/table_schema_change_helper.h"
 
 namespace doris {
+namespace {
+
+void add_column_id_range(const FieldSchema& field_schema, std::set<uint64_t>& 
column_ids) {
+    const uint64_t start_id = field_schema.get_column_id();
+    const uint64_t max_column_id = field_schema.get_max_column_id();
+    for (uint64_t id = start_id; id <= max_column_id; ++id) {
+        column_ids.insert(id);
+    }
+}
+
+const FieldSchema* find_child_by_name(const FieldSchema& field_schema, 
std::string_view name) {
+    std::string lower_name(name);
+    std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
+                   [](unsigned char c) { return 
static_cast<char>(std::tolower(c)); });
+    for (const auto& child : field_schema.children) {
+        if (child.name == name || child.lower_case_name == lower_name) {
+            return &child;
+        }
+    }
+    return nullptr;
+}
+
+void add_variant_metadata(const FieldSchema& variant_field, 
std::set<uint64_t>& column_ids) {
+    if (const auto* metadata = find_child_by_name(variant_field, "metadata")) {
+        add_column_id_range(*metadata, column_ids);
+    }
+}
+
+void add_variant_value(const FieldSchema& variant_field, std::set<uint64_t>& 
column_ids) {
+    add_variant_metadata(variant_field, column_ids);
+    if (const auto* value = find_child_by_name(variant_field, "value")) {
+        add_column_id_range(*value, column_ids);
+    }
+}
+
+bool extract_nested_column_ids_by_name(const FieldSchema& field_schema,
+                                       const 
std::vector<std::vector<std::string>>& paths,
+                                       std::set<uint64_t>& column_ids);
+
+bool extract_variant_nested_column_ids(const FieldSchema& variant_field,
+                                       const 
std::vector<std::vector<std::string>>& paths,
+                                       std::set<uint64_t>& column_ids) {
+    const auto* typed_value = find_child_by_name(variant_field, "typed_value");
+    bool has_child_columns = false;
+
+    for (const auto& path : paths) {
+        if (path.empty()) {
+            add_column_id_range(variant_field, column_ids);
+            has_child_columns = true;
+            continue;
+        }
+
+        add_variant_metadata(variant_field, column_ids);
+        bool found_typed_path = false;
+        if (typed_value != nullptr) {
+            if (const auto* typed_child = find_child_by_name(*typed_value, 
path[0])) {
+                if (path.size() == 1) {
+                    add_column_id_range(*typed_child, column_ids);
+                    found_typed_path = true;
+                } else {
+                    std::vector<std::vector<std::string>> child_paths {
+                            std::vector<std::string>(path.begin() + 1, 
path.end())};
+                    found_typed_path = 
extract_nested_column_ids_by_name(*typed_child, child_paths,
+                                                                         
column_ids);
+                }
+                if (found_typed_path) {
+                    column_ids.insert(typed_value->get_column_id());
+                }
+            }
+        }
+
+        if (!found_typed_path) {
+            add_variant_value(variant_field, column_ids);

Review Comment:
   Same correctness issue as the standalone/Hive Parquet path: once a typed 
child is found, the Iceberg reader prunes away the residual `value` leaf. 
Iceberg VARIANT shredding can leave a selected field in `value` for rows that 
do not fit the typed shard, so Iceberg table scans can silently lose those 
values when `v['path']` is projected or filtered. The Iceberg path needs the 
same fallback handling and test coverage.



##########
be/src/format/table/hive/hive_parquet_nested_column_utils.cpp:
##########
@@ -18,20 +18,172 @@
 #include "format/table/hive/hive_parquet_nested_column_utils.h"
 
 #include <algorithm>
+#include <cctype>
 #include <memory>
 #include <set>
 #include <string>
+#include <string_view>
 #include <unordered_map>
 #include <vector>
 
 #include "format/parquet/schema_desc.h"
 #include "format/table/table_schema_change_helper.h"
 
 namespace doris {
+namespace {
+
+void add_column_id_range(const FieldSchema& field_schema, std::set<uint64_t>& 
column_ids) {
+    const uint64_t start_id = field_schema.get_column_id();
+    const uint64_t max_column_id = field_schema.get_max_column_id();
+    for (uint64_t id = start_id; id <= max_column_id; ++id) {
+        column_ids.insert(id);
+    }
+}
+
+const FieldSchema* find_child_by_name(const FieldSchema& field_schema, 
std::string_view name) {
+    std::string lower_name(name);
+    std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
+                   [](unsigned char c) { return 
static_cast<char>(std::tolower(c)); });
+    for (const auto& child : field_schema.children) {
+        if (child.name == name || child.lower_case_name == lower_name) {
+            return &child;
+        }
+    }
+    return nullptr;
+}
+
+void add_variant_metadata(const FieldSchema& variant_field, 
std::set<uint64_t>& column_ids) {
+    if (const auto* metadata = find_child_by_name(variant_field, "metadata")) {
+        add_column_id_range(*metadata, column_ids);
+    }
+}
+
+void add_variant_value(const FieldSchema& variant_field, std::set<uint64_t>& 
column_ids) {
+    add_variant_metadata(variant_field, column_ids);
+    if (const auto* value = find_child_by_name(variant_field, "value")) {
+        add_column_id_range(*value, column_ids);
+    }
+}
+
+bool extract_nested_column_ids_by_name(const FieldSchema& field_schema,
+                                       const 
std::vector<std::vector<std::string>>& paths,
+                                       std::set<uint64_t>& column_ids);
+
+bool extract_variant_nested_column_ids(const FieldSchema& variant_field,
+                                       const 
std::vector<std::vector<std::string>>& paths,
+                                       std::set<uint64_t>& column_ids) {
+    const auto* typed_value = find_child_by_name(variant_field, "typed_value");
+    bool has_child_columns = false;
+
+    for (const auto& path : paths) {
+        if (path.empty()) {
+            add_column_id_range(variant_field, column_ids);
+            has_child_columns = true;
+            continue;
+        }
+
+        add_variant_metadata(variant_field, column_ids);
+        bool found_typed_path = false;
+        if (typed_value != nullptr) {
+            if (const auto* typed_child = find_child_by_name(*typed_value, 
path[0])) {
+                if (path.size() == 1) {
+                    add_column_id_range(*typed_child, column_ids);
+                    found_typed_path = true;
+                } else {
+                    std::vector<std::vector<std::string>> child_paths {
+                            std::vector<std::string>(path.begin() + 1, 
path.end())};
+                    found_typed_path = 
extract_nested_column_ids_by_name(*typed_child, child_paths,
+                                                                         
column_ids);
+                }
+                if (found_typed_path) {
+                    column_ids.insert(typed_value->get_column_id());
+                }
+            }
+        }
+
+        if (!found_typed_path) {
+            add_variant_value(variant_field, column_ids);

Review Comment:
   This pruning drops the unshredded `value` leaf whenever a matching 
`typed_value` path is found. For shredded VARIANT, `value` is the residual 
fallback for fields that are not representable in the typed shard on a given 
row, so a file can have `typed_value.metric` for numeric rows while rows with 
`metric` stored as another type remain only in `value`. With the current 
selection, `VariantColumnReader` sees `typed_value.metric` as null and has no 
`value` column to decode, so `v['metric']` is returned as missing/null even 
though the data exists. Please keep the residual `value` leaf whenever pruning 
a VARIANT subpath unless the format provides a guarantee that the typed leaf is 
complete for all rows, and add a mixed typed/residual regression case.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to