github-actions[bot] commented on code in PR #63192:
URL: https://github.com/apache/doris/pull/63192#discussion_r3252707483
##########
be/src/format/table/hive/hive_parquet_nested_column_utils.cpp:
##########
@@ -18,20 +18,294 @@
#include "format/table/hive/hive_parquet_nested_column_utils.h"
#include <algorithm>
+#include <cctype>
#include <memory>
#include <set>
#include <string>
+#include <string_view>
#include <unordered_map>
#include <vector>
+#include "core/data_type/data_type_nullable.h"
#include "format/parquet/schema_desc.h"
#include "format/table/table_schema_change_helper.h"
namespace doris {
+namespace {
+
+void add_column_id_range(const FieldSchema& field_schema, std::set<uint64_t>&
column_ids) {
+ const uint64_t start_id = field_schema.get_column_id();
+ const uint64_t max_column_id = field_schema.get_max_column_id();
+ for (uint64_t id = start_id; id <= max_column_id; ++id) {
+ column_ids.insert(id);
+ }
+}
+
+const FieldSchema* find_child_by_structural_name(const FieldSchema&
field_schema,
+ std::string_view name) {
+ std::string lower_name(name);
+ std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
+ [](unsigned char c) { return
static_cast<char>(std::tolower(c)); });
+ for (const auto& child : field_schema.children) {
+ if (child.name == name || child.lower_case_name == lower_name) {
+ return &child;
+ }
+ }
+ return nullptr;
+}
+
+const FieldSchema* find_child_by_exact_name(const FieldSchema& field_schema,
+ std::string_view name) {
+ for (const auto& child : field_schema.children) {
+ if (child.name == name) {
+ return &child;
+ }
+ }
+ return nullptr;
+}
+
+void add_variant_metadata(const FieldSchema& variant_field,
std::set<uint64_t>& column_ids) {
+ if (const auto* metadata = find_child_by_structural_name(variant_field,
"metadata")) {
+ add_column_id_range(*metadata, column_ids);
+ }
+}
+
+void add_variant_value(const FieldSchema& variant_field, std::set<uint64_t>&
column_ids) {
+ add_variant_metadata(variant_field, column_ids);
+ if (const auto* value = find_child_by_structural_name(variant_field,
"value")) {
+ add_column_id_range(*value, column_ids);
+ }
+}
+
+struct VariantColumnIdExtractionResult {
+ bool has_child_columns = false;
+ bool needs_metadata = false;
+};
+
+bool is_shredded_variant_field(const FieldSchema& field_schema) {
+ bool has_value = false;
+ const FieldSchema* typed_value = nullptr;
+ for (const auto& child : field_schema.children) {
+ if (child.lower_case_name == "value") {
+ if (child.physical_type != tparquet::Type::BYTE_ARRAY) {
+ return false;
+ }
+ has_value = true;
+ continue;
+ }
+ if (child.lower_case_name == "typed_value") {
+ typed_value = &child;
+ continue;
+ }
+ return false;
+ }
+ if (has_value) {
+ return typed_value != nullptr;
+ }
Review Comment:
The Hive/local pruning helper has the same residual-only gap. For a valid
shredded field group like `typed_value.metric.value` with no
`typed_value.metric.typed_value`, an access such as `v['metric']['x']` reaches
`extract_variant_typed_nested_column_ids(metric, [['x']])`; this predicate
returns false because `typed_value == nullptr`, the generic struct path finds
no child `x`, and the caller falls back to top-level `v.value` instead of
selecting `v.typed_value.metric.value`. That silently prunes the only column
containing the selected field's residual object. Please treat `value`-only
shredded field groups as variant-like and add a pruning/profile assertion for
this case.
##########
be/src/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp:
##########
@@ -18,21 +18,295 @@
#include "format/table/iceberg/iceberg_parquet_nested_column_utils.h"
#include <algorithm>
+#include <cctype>
#include <iostream>
#include <memory>
#include <set>
#include <string>
+#include <string_view>
#include <unordered_map>
#include <vector>
+#include "core/data_type/data_type_nullable.h"
#include "format/parquet/schema_desc.h"
#include "format/table/table_schema_change_helper.h"
namespace doris {
+namespace {
+
+void add_column_id_range(const FieldSchema& field_schema, std::set<uint64_t>&
column_ids) {
+ const uint64_t start_id = field_schema.get_column_id();
+ const uint64_t max_column_id = field_schema.get_max_column_id();
+ for (uint64_t id = start_id; id <= max_column_id; ++id) {
+ column_ids.insert(id);
+ }
+}
+
+const FieldSchema* find_child_by_structural_name(const FieldSchema&
field_schema,
+ std::string_view name) {
+ std::string lower_name(name);
+ std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
+ [](unsigned char c) { return
static_cast<char>(std::tolower(c)); });
+ for (const auto& child : field_schema.children) {
+ if (child.name == name || child.lower_case_name == lower_name) {
+ return &child;
+ }
+ }
+ return nullptr;
+}
+
+const FieldSchema* find_child_by_exact_name(const FieldSchema& field_schema,
+ std::string_view name) {
+ for (const auto& child : field_schema.children) {
+ if (child.name == name) {
+ return &child;
+ }
+ }
+ return nullptr;
+}
+
+void add_variant_metadata(const FieldSchema& variant_field,
std::set<uint64_t>& column_ids) {
+ if (const auto* metadata = find_child_by_structural_name(variant_field,
"metadata")) {
+ add_column_id_range(*metadata, column_ids);
+ }
+}
+
+void add_variant_value(const FieldSchema& variant_field, std::set<uint64_t>&
column_ids) {
+ add_variant_metadata(variant_field, column_ids);
+ if (const auto* value = find_child_by_structural_name(variant_field,
"value")) {
+ add_column_id_range(*value, column_ids);
+ }
+}
+
+struct VariantColumnIdExtractionResult {
+ bool has_child_columns = false;
+ bool needs_metadata = false;
+};
+
+bool is_shredded_variant_field(const FieldSchema& field_schema) {
+ bool has_value = false;
+ const FieldSchema* typed_value = nullptr;
+ for (const auto& child : field_schema.children) {
+ if (child.lower_case_name == "value") {
+ if (child.physical_type != tparquet::Type::BYTE_ARRAY) {
+ return false;
+ }
+ has_value = true;
+ continue;
+ }
+ if (child.lower_case_name == "typed_value") {
+ typed_value = &child;
+ continue;
+ }
+ return false;
+ }
+ if (has_value) {
+ return typed_value != nullptr;
+ }
Review Comment:
The duplicated Iceberg pruning helper also misses value-only field-level
residual groups. In Iceberg Parquet, a shredded field can have
`typed_value.<field>.value` without a nested `typed_value`; deeper accesses
under that field currently fall through to top-level `v.value`, so Iceberg
scans can skip the only residual column for that field. Please apply the same
value-only wrapper recognition and regression/profile coverage here as in the
Hive/local helper.
##########
be/src/format/parquet/vparquet_column_reader.cpp:
##########
@@ -103,6 +119,857 @@ static void fill_array_offset(FieldSchema* field,
ColumnArray::Offsets64& offset
}
}
+static constexpr int64_t UNIX_EPOCH_DAYNR = 719528;
+static constexpr int64_t MICROS_PER_SECOND = 1000000;
+
+static int64_t variant_date_value(const VecDateTimeValue& value) {
+ return value.daynr() - UNIX_EPOCH_DAYNR;
+}
+
+static int64_t variant_date_value(const DateV2Value<DateV2ValueType>& value) {
+ return value.daynr() - UNIX_EPOCH_DAYNR;
+}
+
+static int64_t variant_datetime_value(const VecDateTimeValue& value) {
+ int64_t timestamp = 0;
+ value.unix_timestamp(×tamp, cctz::utc_time_zone());
+ return timestamp * MICROS_PER_SECOND;
+}
+
+static int64_t variant_datetime_value(const DateV2Value<DateTimeV2ValueType>&
value) {
+ int64_t timestamp = 0;
+ value.unix_timestamp(×tamp, cctz::utc_time_zone());
+ return timestamp * MICROS_PER_SECOND + value.microsecond();
+}
+
+static int64_t variant_datetime_value(const TimestampTzValue& value) {
+ int64_t timestamp = 0;
+ value.unix_timestamp(×tamp, cctz::utc_time_zone());
+ return timestamp * MICROS_PER_SECOND + value.microsecond();
+}
+
+static int find_child_idx(const FieldSchema& field, std::string_view name) {
+ for (int i = 0; i < field.children.size(); ++i) {
+ if (field.children[i].lower_case_name == name) {
+ return i;
+ }
+ }
+ return -1;
+}
+
+static bool is_variant_wrapper_typed_value_child(const FieldSchema& field) {
+ auto type = remove_nullable(field.data_type);
+ return type->get_primitive_type() == TYPE_STRUCT ||
type->get_primitive_type() == TYPE_ARRAY;
+}
+
+static bool is_variant_wrapper_field(const FieldSchema& field,
+ bool
allow_scalar_typed_value_only_wrapper) {
+ auto type = remove_nullable(field.data_type);
+ if (type->get_primitive_type() != TYPE_STRUCT &&
type->get_primitive_type() != TYPE_VARIANT) {
+ return false;
+ }
+
+ bool has_metadata = false;
+ bool has_value = false;
+ const FieldSchema* typed_value = nullptr;
+ for (const auto& child : field.children) {
+ if (child.lower_case_name == "metadata") {
+ if (child.physical_type != tparquet::Type::BYTE_ARRAY) {
+ return false;
+ }
+ has_metadata = true;
+ continue;
+ }
+ if (child.lower_case_name == "value") {
+ if (child.physical_type != tparquet::Type::BYTE_ARRAY) {
+ return false;
+ }
+ has_value = true;
+ continue;
+ }
+ if (child.lower_case_name == "typed_value") {
+ typed_value = &child;
+ continue;
+ }
+ return false;
+ }
+ if (has_metadata && has_value) {
+ return type->get_primitive_type() == TYPE_VARIANT || typed_value !=
nullptr;
+ }
+ if (has_value) {
+ return typed_value != nullptr;
Review Comment:
This rejects a valid field-level residual-only shredded VARIANT wrapper. A
shredded object field can be represented as a group containing only `value`
when there is no typed shard for that field, e.g. `v.typed_value.metric {
optional binary value; }`. With `has_value && typed_value == nullptr`,
`is_variant_wrapper_field()` returns false, so
`shredded_field_to_variant_map()` treats the group as an ordinary struct,
appends a user path named `value`, and then tries to materialize the raw
encoded bytes instead of decoding them with the inherited metadata. The result
is either an unsupported/corrupt value or the wrong path (`metric.value`
instead of `metric`). Please recognize `value`-only field-level wrappers when
inherited metadata is available and add coverage for a shredded field whose
residual is stored only in `typed_value.<field>.value`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]