github-actions[bot] commented on code in PR #63192:
URL: https://github.com/apache/doris/pull/63192#discussion_r3234215558


##########
be/src/format/parquet/vparquet_column_reader.cpp:
##########
@@ -1001,6 +1784,140 @@ Status StructColumnReader::read_column_data(
     return Status::OK();
 }
 
+Status VariantColumnReader::init(io::FileReaderSPtr file, FieldSchema* field,
+                                 const tparquet::RowGroup& row_group, size_t 
max_buf_size,
+                                 std::unordered_map<int, 
tparquet::OffsetIndex>& col_offsets,
+                                 RuntimeState* state, bool in_collection,
+                                 const std::set<uint64_t>& column_ids,
+                                 const std::set<uint64_t>& filter_column_ids) {
+    _field_schema = field;
+    _column_ids = column_ids;
+    _variant_struct_field = std::make_unique<FieldSchema>(*field);
+
+    DataTypes child_types;
+    Strings child_names;
+    child_types.reserve(field->children.size());
+    child_names.reserve(field->children.size());
+    for (const auto& child : field->children) {
+        child_types.push_back(make_nullable(child.data_type));
+        child_names.push_back(child.name);
+    }
+    DataTypePtr variant_struct_type = 
std::make_shared<DataTypeStruct>(child_types, child_names);
+    if (field->data_type->is_nullable()) {
+        variant_struct_type = make_nullable(variant_struct_type);
+    }
+    _variant_struct_field->data_type = variant_struct_type;
+
+    RETURN_IF_ERROR(ParquetColumnReader::create(file, 
_variant_struct_field.get(), row_group,
+                                                _row_ranges, _ctz, _io_ctx, 
_struct_reader,
+                                                max_buf_size, col_offsets, 
state, in_collection,
+                                                column_ids, 
filter_column_ids));
+    _struct_reader->set_column_in_nested();
+    return Status::OK();
+}
+
+Status VariantColumnReader::read_column_data(
+        ColumnPtr& doris_column, const DataTypePtr& type,
+        const std::shared_ptr<TableSchemaChangeHelper::Node>& root_node, 
FilterMap& filter_map,
+        size_t batch_size, size_t* read_rows, bool* eof, bool is_dict_filter,
+        int64_t real_column_size) {
+    (void)root_node;
+    if (remove_nullable(type)->get_primitive_type() != 
PrimitiveType::TYPE_VARIANT) {
+        return Status::Corruption(
+                "Wrong data type for column '{}', expected Variant type, 
actual type: {}.",
+                _field_schema->name, type->get_name());
+    }
+
+    const auto& variant_struct_type = _variant_struct_field->data_type;
+    ColumnPtr struct_column = variant_struct_type->create_column();
+    const size_t old_struct_rows = struct_column->size();
+    auto const_node = TableSchemaChangeHelper::ConstNode::get_instance();
+    RETURN_IF_ERROR(_struct_reader->read_column_data(struct_column, 
variant_struct_type, const_node,
+                                                     filter_map, batch_size, 
read_rows, eof,
+                                                     is_dict_filter, 
real_column_size));
+
+    const size_t new_struct_rows = struct_column->size() - old_struct_rows;
+    if (new_struct_rows == 0) {
+        return Status::OK();
+    }
+
+    MutableColumnPtr variant_column_ptr;
+    NullMap* null_map_ptr = nullptr;
+    auto mutable_column = doris_column->assume_mutable();
+    if (doris_column->is_nullable()) {
+        auto* nullable_column = 
assert_cast<ColumnNullable*>(mutable_column.get());
+        variant_column_ptr = nullable_column->get_nested_column_ptr();
+        null_map_ptr = &nullable_column->get_null_map_data();
+    } else {
+        if (_field_schema->data_type->is_nullable()) {
+            return Status::Corruption("Not nullable column has null values in 
parquet file");
+        }
+        variant_column_ptr = std::move(mutable_column);
+    }
+    auto* variant_column = 
assert_cast<ColumnVariant*>(variant_column_ptr.get());
+
+    const IColumn* variant_struct_source = struct_column.get();
+    const NullMap* struct_null_map = nullptr;
+    if (const auto* nullable_struct = 
check_and_get_column<ColumnNullable>(variant_struct_source)) {
+        struct_null_map = &nullable_struct->get_null_map_data();
+        variant_struct_source = &nullable_struct->get_nested_column();
+    }
+    const auto& variant_struct_column = assert_cast<const 
ColumnStruct&>(*variant_struct_source);
+
+    const int value_idx = find_child_idx(*_field_schema, "value");
+    const int typed_value_idx = find_child_idx(*_field_schema, "typed_value");
+    if (value_idx < 0 && typed_value_idx >= 0 &&
+        can_direct_read_typed_value(_field_schema->children[typed_value_idx], 
false, _column_ids)) {
+        MutableColumnPtr batch_variant_column =

Review Comment:
   This direct typed-only branch can be taken even when pruning selected no 
typed_value leaf. For example, on a typed-only VARIANT file with `metadata` 
plus `typed_value` and no top-level `value`, a query like `v['missing']` makes 
`extract_variant_nested_column_ids()` miss the typed child and fall back to 
`add_variant_value()`, which can only add the root/metadata ids because `value` 
is absent. Then `can_direct_read_typed_value(..., _column_ids)` returns true 
because `has_selected_column(typed_value, column_ids)` is false, 
`append_direct_typed_column_to_batch()` adds no subcolumns, and the following 
`insert_range_from(*batch_variant_column, 1, new_struct_rows)` copies past the 
one default row in the batch. Please either avoid the direct path unless at 
least one typed leaf under `typed_value` is selected, or make the 
empty-selection case append `new_struct_rows` empty/missing VARIANT rows, and 
add coverage for a typed-only file queried with a non-existent/non-shredded 
key. This is dis
 tinct from the earlier full-projection and structural-name threads because 
here the requested path is absent from the typed shard, leaving the fast-path 
batch empty.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to