Re: [PR] [feature](iceberg) Support reading Iceberg variant from Parquet [doris]

via GitHub Tue, 12 May 2026 15:37:53 -0700


eldenmoon commented on code in PR #63192:
URL: https://github.com/apache/doris/pull/63192#discussion_r3230284680



##########
be/src/format/table/iceberg/iceberg_parquet_nested_column_utils.cpp:
##########
@@ -18,21 +18,241 @@
 #include "format/table/iceberg/iceberg_parquet_nested_column_utils.h"
 
 #include <algorithm>
+#include <cctype>
 #include <iostream>
 #include <memory>
 #include <set>
 #include <string>
+#include <string_view>
 #include <unordered_map>
 #include <vector>
 
 #include "format/parquet/schema_desc.h"
 #include "format/table/table_schema_change_helper.h"
 
 namespace doris {
+namespace {
+
+void add_column_id_range(const FieldSchema& field_schema, std::set<uint64_t>& 
column_ids) {
+    const uint64_t start_id = field_schema.get_column_id();
+    const uint64_t max_column_id = field_schema.get_max_column_id();
+    for (uint64_t id = start_id; id <= max_column_id; ++id) {
+        column_ids.insert(id);
+    }
+}
+
+const FieldSchema* find_child_by_structural_name(const FieldSchema& 
field_schema,
+                                                 std::string_view name) {
+    std::string lower_name(name);
+    std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
+                   [](unsigned char c) { return 
static_cast<char>(std::tolower(c)); });
+    for (const auto& child : field_schema.children) {
+        if (child.name == name || child.lower_case_name == lower_name) {
+            return &child;
+        }
+    }
+    return nullptr;
+}
+
+const FieldSchema* find_child_by_exact_name(const FieldSchema& field_schema,
+                                            std::string_view name) {
+    for (const auto& child : field_schema.children) {
+        if (child.name == name) {
+            return &child;
+        }
+    }
+    return nullptr;
+}
+
+void add_variant_metadata(const FieldSchema& variant_field, 
std::set<uint64_t>& column_ids) {
+    if (const auto* metadata = find_child_by_structural_name(variant_field, 
"metadata")) {
+        add_column_id_range(*metadata, column_ids);
+    }
+}
+
+void add_variant_value(const FieldSchema& variant_field, std::set<uint64_t>& 
column_ids) {
+    add_variant_metadata(variant_field, column_ids);
+    if (const auto* value = find_child_by_structural_name(variant_field, 
"value")) {
+        add_column_id_range(*value, column_ids);
+    }
+}
+
+bool is_shredded_variant_field(const FieldSchema& field_schema) {

Review Comment:
   Handled in the latest head. The Iceberg helper now mirrors the Hive/local
typed-value-only pruning behavior. The regression test includes a typed-only
nested profile assertion verifying that the scan reads
v.typed_value.nested.typed_value.x and does not read
v.typed_value.nested.value or the top-level v.value.



##########
be/src/format/parquet/vparquet_column_reader.cpp:
##########
@@ -1001,6 +1125,382 @@ Status StructColumnReader::read_column_data(
     return Status::OK();
 }
 
+Status VariantColumnReader::init(io::FileReaderSPtr file, FieldSchema* field,
+                                 const tparquet::RowGroup& row_group, size_t 
max_buf_size,
+                                 std::unordered_map<int, 
tparquet::OffsetIndex>& col_offsets,
+                                 RuntimeState* state, bool in_collection,
+                                 const std::set<uint64_t>& column_ids,
+                                 const std::set<uint64_t>& filter_column_ids) {
+    _field_schema = field;
+    _variant_struct_field = std::make_unique<FieldSchema>(*field);
+
+    DataTypes child_types;
+    Strings child_names;
+    child_types.reserve(field->children.size());
+    child_names.reserve(field->children.size());
+    for (const auto& child : field->children) {
+        child_types.push_back(make_nullable(child.data_type));
+        child_names.push_back(child.name);
+    }
+    _variant_struct_type = std::make_shared<DataTypeStruct>(child_types, 
child_names);
+    if (field->data_type->is_nullable()) {
+        _variant_struct_type = make_nullable(_variant_struct_type);
+    }
+    _variant_struct_field->data_type = _variant_struct_type;
+
+    RETURN_IF_ERROR(ParquetColumnReader::create(file, 
_variant_struct_field.get(), row_group,
+                                                _row_ranges, _ctz, _io_ctx, 
_struct_reader,
+                                                max_buf_size, col_offsets, 
state, in_collection,
+                                                column_ids, 
filter_column_ids));
+    _struct_reader->set_column_in_nested();
+    return Status::OK();
+}
+
+Status VariantColumnReader::_get_binary_field(const Field& field, std::string* 
value,
+                                              bool* present) const {
+    if (field.is_null()) {
+        *present = false;
+        return Status::OK();
+    }
+    *present = true;
+    switch (field.get_type()) {
+    case TYPE_STRING:
+        *value = field.get<TYPE_STRING>();
+        return Status::OK();
+    case TYPE_CHAR:
+        *value = field.get<TYPE_CHAR>();
+        return Status::OK();
+    case TYPE_VARCHAR:
+        *value = field.get<TYPE_VARCHAR>();
+        return Status::OK();
+    case TYPE_VARBINARY: {
+        auto ref = field.get<TYPE_VARBINARY>().to_string_ref();
+        value->assign(ref.data, ref.size);
+        return Status::OK();
+    }
+    default:
+        return Status::Corruption("Parquet VARIANT binary field has unexpected 
Doris type {}",
+                                  field.get_type_name());
+    }
+}
+
+Status VariantColumnReader::_field_to_json(const FieldSchema& field_schema, 
const Field& field,
+                                           std::string* json, bool* present) 
const {
+    if (field.is_null()) {
+        *present = false;
+        return Status::OK();
+    }
+    *present = true;
+    const DataTypePtr& type = remove_nullable(field_schema.data_type);
+    switch (type->get_primitive_type()) {
+    case TYPE_BOOLEAN:
+    case TYPE_TINYINT:
+    case TYPE_SMALLINT:
+    case TYPE_INT:
+    case TYPE_BIGINT:
+    case TYPE_LARGEINT:
+    case TYPE_DECIMALV2:
+    case TYPE_DECIMAL32:
+    case TYPE_DECIMAL64:
+    case TYPE_DECIMAL128I:
+    case TYPE_DECIMAL256:
+        json->append(field.to_debug_string(type->get_scale()));
+        return Status::OK();
+    case TYPE_FLOAT: {
+        const auto value = field.get<TYPE_FLOAT>();
+        json->append(std::isfinite(value) ? 
field.to_debug_string(type->get_scale()) : "null");
+        return Status::OK();
+    }
+    case TYPE_DOUBLE: {
+        const auto value = field.get<TYPE_DOUBLE>();
+        json->append(std::isfinite(value) ? 
field.to_debug_string(type->get_scale()) : "null");
+        return Status::OK();
+    }
+    case TYPE_TIMEV2:
+        json->append(std::to_string(field.get<TYPE_TIMEV2>()));
+        return Status::OK();
+    case TYPE_DATE:
+    case TYPE_DATETIME:

Review Comment:
   Handled in the latest head. Shredded temporal typed values now use Parquet
VARIANT raw numeric semantics: date as epoch days, time as microseconds, and
timestamp/timestamptz as UTC epoch microseconds. The regression test joins
unshredded and shredded temporal files and asserts date/time/timestamp parity;
its profile checks confirm that only the typed temporal leaves are read.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [feature](iceberg) Support reading Iceberg variant from Parquet [doris]

Reply via email to