eldenmoon commented on code in PR #63192:
URL: https://github.com/apache/doris/pull/63192#discussion_r3229421164
##########
be/src/format/table/hive/hive_parquet_nested_column_utils.cpp:
##########
@@ -18,20 +18,172 @@
#include "format/table/hive/hive_parquet_nested_column_utils.h"
#include <algorithm>
+#include <cctype>
#include <memory>
#include <set>
#include <string>
+#include <string_view>
#include <unordered_map>
#include <vector>
#include "format/parquet/schema_desc.h"
#include "format/table/table_schema_change_helper.h"
namespace doris {
+namespace {
+
+void add_column_id_range(const FieldSchema& field_schema, std::set<uint64_t>&
column_ids) {
+ const uint64_t start_id = field_schema.get_column_id();
+ const uint64_t max_column_id = field_schema.get_max_column_id();
+ for (uint64_t id = start_id; id <= max_column_id; ++id) {
+ column_ids.insert(id);
+ }
+}
+
+const FieldSchema* find_child_by_name(const FieldSchema& field_schema,
std::string_view name) {
+ std::string lower_name(name);
+ std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
+ [](unsigned char c) { return
static_cast<char>(std::tolower(c)); });
+ for (const auto& child : field_schema.children) {
+ if (child.name == name || child.lower_case_name == lower_name) {
+ return &child;
+ }
+ }
+ return nullptr;
+}
+
+void add_variant_metadata(const FieldSchema& variant_field,
std::set<uint64_t>& column_ids) {
+ if (const auto* metadata = find_child_by_name(variant_field, "metadata")) {
+ add_column_id_range(*metadata, column_ids);
+ }
+}
+
+void add_variant_value(const FieldSchema& variant_field, std::set<uint64_t>&
column_ids) {
+ add_variant_metadata(variant_field, column_ids);
+ if (const auto* value = find_child_by_name(variant_field, "value")) {
+ add_column_id_range(*value, column_ids);
+ }
+}
+
+bool extract_nested_column_ids_by_name(const FieldSchema& field_schema,
+ const
std::vector<std::vector<std::string>>& paths,
+ std::set<uint64_t>& column_ids);
+
+bool extract_variant_nested_column_ids(const FieldSchema& variant_field,
+ const
std::vector<std::vector<std::string>>& paths,
+ std::set<uint64_t>& column_ids) {
+ const auto* typed_value = find_child_by_name(variant_field, "typed_value");
+ bool has_child_columns = false;
+
+ for (const auto& path : paths) {
+ if (path.empty()) {
+ add_column_id_range(variant_field, column_ids);
+ has_child_columns = true;
+ continue;
+ }
+
+ add_variant_metadata(variant_field, column_ids);
+ bool found_typed_path = false;
+ if (typed_value != nullptr) {
+ if (const auto* typed_child = find_child_by_name(*typed_value,
path[0])) {
+ if (path.size() == 1) {
+ add_column_id_range(*typed_child, column_ids);
+ found_typed_path = true;
+ } else {
+ std::vector<std::vector<std::string>> child_paths {
+ std::vector<std::string>(path.begin() + 1,
path.end())};
+ found_typed_path =
extract_nested_column_ids_by_name(*typed_child, child_paths,
+
column_ids);
+ }
+ if (found_typed_path) {
+ column_ids.insert(typed_value->get_column_id());
+ }
+ }
+ }
+
+ if (!found_typed_path) {
+ add_variant_value(variant_field, column_ids);
Review Comment:
Handled by documenting the Parquet VARIANT shredding invariant and
tightening the regression observable. Per the Parquet Variant Shredding spec,
object field residual fallback is stored at typed_value.<field>.value, while
the top-level value column stores non-shredded object fields and must not
contain shredded field keys. The regression now asserts that v['metric'] reads
v.typed_value.metric.typed_value and v.typed_value.metric.value, does not read
v.value, and does not read sibling v.typed_value.name. Source:
https://parquet.apache.org/docs/file-format/types/variantshredding/
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]