github-actions[bot] commented on code in PR #63192:
URL: https://github.com/apache/doris/pull/63192#discussion_r3229304233
##########
be/src/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp:
##########
@@ -18,21 +18,173 @@
#include "format/table/iceberg/iceberg_parquet_nested_column_utils.h"
#include <algorithm>
+#include <cctype>
#include <iostream>
#include <memory>
#include <set>
#include <string>
+#include <string_view>
#include <unordered_map>
#include <vector>
#include "format/parquet/schema_desc.h"
#include "format/table/table_schema_change_helper.h"
namespace doris {
+namespace {
+
+void add_column_id_range(const FieldSchema& field_schema, std::set<uint64_t>&
column_ids) {
+ const uint64_t start_id = field_schema.get_column_id();
+ const uint64_t max_column_id = field_schema.get_max_column_id();
+ for (uint64_t id = start_id; id <= max_column_id; ++id) {
+ column_ids.insert(id);
+ }
+}
+
+const FieldSchema* find_child_by_name(const FieldSchema& field_schema,
std::string_view name) {
+ std::string lower_name(name);
+ std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
+ [](unsigned char c) { return
static_cast<char>(std::tolower(c)); });
+ for (const auto& child : field_schema.children) {
+ if (child.name == name || child.lower_case_name == lower_name) {
+ return &child;
+ }
+ }
+ return nullptr;
+}
+
+void add_variant_metadata(const FieldSchema& variant_field,
std::set<uint64_t>& column_ids) {
+ if (const auto* metadata = find_child_by_name(variant_field, "metadata")) {
+ add_column_id_range(*metadata, column_ids);
+ }
+}
+
+void add_variant_value(const FieldSchema& variant_field, std::set<uint64_t>&
column_ids) {
+ add_variant_metadata(variant_field, column_ids);
+ if (const auto* value = find_child_by_name(variant_field, "value")) {
+ add_column_id_range(*value, column_ids);
+ }
+}
+
+bool extract_nested_column_ids_by_name(const FieldSchema& field_schema,
+ const
std::vector<std::vector<std::string>>& paths,
+ std::set<uint64_t>& column_ids);
+
+bool extract_variant_nested_column_ids(const FieldSchema& variant_field,
+ const
std::vector<std::vector<std::string>>& paths,
+ std::set<uint64_t>& column_ids) {
+ const auto* typed_value = find_child_by_name(variant_field, "typed_value");
+ bool has_child_columns = false;
+
+ for (const auto& path : paths) {
+ if (path.empty()) {
+ add_column_id_range(variant_field, column_ids);
+ has_child_columns = true;
+ continue;
+ }
+
+ add_variant_metadata(variant_field, column_ids);
+ bool found_typed_path = false;
+ if (typed_value != nullptr) {
+ if (const auto* typed_child = find_child_by_name(*typed_value,
path[0])) {
+ if (path.size() == 1) {
+ add_column_id_range(*typed_child, column_ids);
+ found_typed_path = true;
+ } else {
+ std::vector<std::vector<std::string>> child_paths {
+ std::vector<std::string>(path.begin() + 1,
path.end())};
+ found_typed_path =
extract_nested_column_ids_by_name(*typed_child, child_paths,
+
column_ids);
+ }
+ if (found_typed_path) {
+ column_ids.insert(typed_value->get_column_id());
+ }
+ }
+ }
+
+ if (!found_typed_path) {
+ add_variant_value(variant_field, column_ids);
Review Comment:
Same correctness issue as the standalone/Hive Parquet path: once a typed
child is found, the Iceberg reader prunes away the residual `value` leaf.
Iceberg VARIANT shredding can leave a selected field in `value` for rows that
do not fit the typed shard, so Iceberg table scans can silently lose those
values when `v['path']` is projected or filtered. The Iceberg path needs the
same fallback handling and test coverage.
##########
be/src/format/table/hive/hive_parquet_nested_column_utils.cpp:
##########
@@ -18,20 +18,172 @@
#include "format/table/hive/hive_parquet_nested_column_utils.h"
#include <algorithm>
+#include <cctype>
#include <memory>
#include <set>
#include <string>
+#include <string_view>
#include <unordered_map>
#include <vector>
#include "format/parquet/schema_desc.h"
#include "format/table/table_schema_change_helper.h"
namespace doris {
+namespace {
+
+void add_column_id_range(const FieldSchema& field_schema, std::set<uint64_t>&
column_ids) {
+ const uint64_t start_id = field_schema.get_column_id();
+ const uint64_t max_column_id = field_schema.get_max_column_id();
+ for (uint64_t id = start_id; id <= max_column_id; ++id) {
+ column_ids.insert(id);
+ }
+}
+
+const FieldSchema* find_child_by_name(const FieldSchema& field_schema,
std::string_view name) {
+ std::string lower_name(name);
+ std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
+ [](unsigned char c) { return
static_cast<char>(std::tolower(c)); });
+ for (const auto& child : field_schema.children) {
+ if (child.name == name || child.lower_case_name == lower_name) {
+ return &child;
+ }
+ }
+ return nullptr;
+}
+
+void add_variant_metadata(const FieldSchema& variant_field,
std::set<uint64_t>& column_ids) {
+ if (const auto* metadata = find_child_by_name(variant_field, "metadata")) {
+ add_column_id_range(*metadata, column_ids);
+ }
+}
+
+void add_variant_value(const FieldSchema& variant_field, std::set<uint64_t>&
column_ids) {
+ add_variant_metadata(variant_field, column_ids);
+ if (const auto* value = find_child_by_name(variant_field, "value")) {
+ add_column_id_range(*value, column_ids);
+ }
+}
+
+bool extract_nested_column_ids_by_name(const FieldSchema& field_schema,
+ const
std::vector<std::vector<std::string>>& paths,
+ std::set<uint64_t>& column_ids);
+
+bool extract_variant_nested_column_ids(const FieldSchema& variant_field,
+ const
std::vector<std::vector<std::string>>& paths,
+ std::set<uint64_t>& column_ids) {
+ const auto* typed_value = find_child_by_name(variant_field, "typed_value");
+ bool has_child_columns = false;
+
+ for (const auto& path : paths) {
+ if (path.empty()) {
+ add_column_id_range(variant_field, column_ids);
+ has_child_columns = true;
+ continue;
+ }
+
+ add_variant_metadata(variant_field, column_ids);
+ bool found_typed_path = false;
+ if (typed_value != nullptr) {
+ if (const auto* typed_child = find_child_by_name(*typed_value,
path[0])) {
+ if (path.size() == 1) {
+ add_column_id_range(*typed_child, column_ids);
+ found_typed_path = true;
+ } else {
+ std::vector<std::vector<std::string>> child_paths {
+ std::vector<std::string>(path.begin() + 1,
path.end())};
+ found_typed_path =
extract_nested_column_ids_by_name(*typed_child, child_paths,
+
column_ids);
+ }
+ if (found_typed_path) {
+ column_ids.insert(typed_value->get_column_id());
+ }
+ }
+ }
+
+ if (!found_typed_path) {
+ add_variant_value(variant_field, column_ids);
Review Comment:
This pruning drops the unshredded `value` leaf whenever a matching
`typed_value` path is found. For shredded VARIANT, `value` is the residual
fallback for fields that are not representable in the typed shard on a given
row, so a file can have `typed_value.metric` for numeric rows while rows with
`metric` stored as another type remain only in `value`. With the current
selection, `VariantColumnReader` sees `typed_value.metric` as null and has no
`value` column to decode, so `v['metric']` is returned as missing/null even
though the data exists. Please keep the residual `value` leaf whenever pruning
a VARIANT subpath unless the format provides a guarantee that the typed leaf is
complete for all rows, and add a mixed typed/residual regression case.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]