github-actions[bot] commented on code in PR #63192:
URL: https://github.com/apache/doris/pull/63192#discussion_r3253691117


##########
be/src/format/parquet/vparquet_column_reader.cpp:
##########
@@ -1001,6 +2645,132 @@ Status StructColumnReader::read_column_data(
     return Status::OK();
 }
 
+Status VariantColumnReader::init(io::FileReaderSPtr file, FieldSchema* field,
+                                 const tparquet::RowGroup& row_group, size_t 
max_buf_size,
+                                 std::unordered_map<int, 
tparquet::OffsetIndex>& col_offsets,
+                                 RuntimeState* state, bool in_collection,
+                                 const std::set<uint64_t>& column_ids,
+                                 const std::set<uint64_t>& filter_column_ids) {
+    _field_schema = field;
+    _column_ids = column_ids;
+    _variant_struct_field = std::make_unique<FieldSchema>(*field);
+
+    DataTypePtr variant_struct_type = make_variant_struct_reader_type(*field);
+    _variant_struct_field->data_type = variant_struct_type;
+
+    RETURN_IF_ERROR(ParquetColumnReader::create(file, 
_variant_struct_field.get(), row_group,
+                                                _row_ranges, _ctz, _io_ctx, 
_struct_reader,
+                                                max_buf_size, col_offsets, 
state, in_collection,
+                                                column_ids, 
filter_column_ids));
+    _struct_reader->set_column_in_nested();
+    return Status::OK();
+}
+
+// NOLINTNEXTLINE(readability-function-size): existing variant read path stays 
local to avoid churn.
+Status VariantColumnReader::read_column_data(
+        ColumnPtr& doris_column, const DataTypePtr& type,
+        const std::shared_ptr<TableSchemaChangeHelper::Node>& root_node, 
FilterMap& filter_map,
+        size_t batch_size, size_t* read_rows, bool* eof, bool is_dict_filter,
+        int64_t real_column_size) {
+    (void)root_node;
+    if (remove_nullable(type)->get_primitive_type() != 
PrimitiveType::TYPE_VARIANT) {
+        return Status::Corruption(
+                "Wrong data type for column '{}', expected Variant type, 
actual type: {}.",
+                _field_schema->name, type->get_name());
+    }
+
+    const auto& variant_struct_type = _variant_struct_field->data_type;
+    ColumnPtr struct_column = make_variant_struct_read_column(*_field_schema, 
variant_struct_type);
+    const size_t old_struct_rows = struct_column->size();
+    auto const_node = TableSchemaChangeHelper::ConstNode::get_instance();
+    RETURN_IF_ERROR(_struct_reader->read_column_data(struct_column, 
variant_struct_type, const_node,
+                                                     filter_map, batch_size, 
read_rows, eof,
+                                                     is_dict_filter, 
real_column_size));
+
+    const size_t new_struct_rows = struct_column->size() - old_struct_rows;
+    if (new_struct_rows == 0) {
+        return Status::OK();
+    }
+
+    MutableColumnPtr variant_column_ptr;
+    NullMap* null_map_ptr = nullptr;
+    auto mutable_column = doris_column->assume_mutable();
+    if (doris_column->is_nullable()) {
+        auto* nullable_column = 
assert_cast<ColumnNullable*>(mutable_column.get());
+        variant_column_ptr = nullable_column->get_nested_column_ptr();
+        null_map_ptr = &nullable_column->get_null_map_data();
+    } else {
+        if (_field_schema->data_type->is_nullable()) {
+            return Status::Corruption("Not nullable column has null values in 
parquet file");
+        }
+        variant_column_ptr = std::move(mutable_column);
+    }
+    auto* variant_column = 
assert_cast<ColumnVariant*>(variant_column_ptr.get());
+
+    const IColumn* variant_struct_source = struct_column.get();

Review Comment:
   The direct typed-only path does not materialize a root VARIANT null when the 
selected complex `typed_value` itself is null. For a typed-only schema like 
`metadata, optional group typed_value { optional int64 a; }`, 
`can_use_direct_typed_only_value()` is true, this call records the nullable 
`typed_value` null map only as a parent null, and 
`append_direct_typed_empty_object_markers()` skips the marker for that row. The 
SQL null map below is populated only from the outer VARIANT struct null map, so 
the row is returned as a non-SQL-null VARIANT with no root null marker/empty 
payload instead of the row-wise behavior at lines 2759-2760, which inserts 
`PathInData()` with `Field()` when no value or typed value is present. Please 
make the direct path emit the same root VARIANT null for absent complex 
`typed_value` rows (or fall back to row-wise for this case), and add coverage 
for typed-only optional struct/list/map `typed_value` null rows.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to