github-actions[bot] commented on code in PR #63192:
URL: https://github.com/apache/doris/pull/63192#discussion_r3234215558
##########
be/src/format/parquet/vparquet_column_reader.cpp:
##########
@@ -1001,6 +1784,140 @@ Status StructColumnReader::read_column_data(
return Status::OK();
}
+Status VariantColumnReader::init(io::FileReaderSPtr file, FieldSchema* field,
+ const tparquet::RowGroup& row_group, size_t
max_buf_size,
+ std::unordered_map<int,
tparquet::OffsetIndex>& col_offsets,
+ RuntimeState* state, bool in_collection,
+ const std::set<uint64_t>& column_ids,
+ const std::set<uint64_t>& filter_column_ids) {
+ _field_schema = field;
+ _column_ids = column_ids;
+ _variant_struct_field = std::make_unique<FieldSchema>(*field);
+
+ DataTypes child_types;
+ Strings child_names;
+ child_types.reserve(field->children.size());
+ child_names.reserve(field->children.size());
+ for (const auto& child : field->children) {
+ child_types.push_back(make_nullable(child.data_type));
+ child_names.push_back(child.name);
+ }
+ DataTypePtr variant_struct_type =
std::make_shared<DataTypeStruct>(child_types, child_names);
+ if (field->data_type->is_nullable()) {
+ variant_struct_type = make_nullable(variant_struct_type);
+ }
+ _variant_struct_field->data_type = variant_struct_type;
+
+ RETURN_IF_ERROR(ParquetColumnReader::create(file,
_variant_struct_field.get(), row_group,
+ _row_ranges, _ctz, _io_ctx,
_struct_reader,
+ max_buf_size, col_offsets,
state, in_collection,
+ column_ids,
filter_column_ids));
+ _struct_reader->set_column_in_nested();
+ return Status::OK();
+}
+
+Status VariantColumnReader::read_column_data(
+ ColumnPtr& doris_column, const DataTypePtr& type,
+ const std::shared_ptr<TableSchemaChangeHelper::Node>& root_node,
FilterMap& filter_map,
+ size_t batch_size, size_t* read_rows, bool* eof, bool is_dict_filter,
+ int64_t real_column_size) {
+ (void)root_node;
+ if (remove_nullable(type)->get_primitive_type() !=
PrimitiveType::TYPE_VARIANT) {
+ return Status::Corruption(
+ "Wrong data type for column '{}', expected Variant type,
actual type: {}.",
+ _field_schema->name, type->get_name());
+ }
+
+ const auto& variant_struct_type = _variant_struct_field->data_type;
+ ColumnPtr struct_column = variant_struct_type->create_column();
+ const size_t old_struct_rows = struct_column->size();
+ auto const_node = TableSchemaChangeHelper::ConstNode::get_instance();
+ RETURN_IF_ERROR(_struct_reader->read_column_data(struct_column,
variant_struct_type, const_node,
+ filter_map, batch_size,
read_rows, eof,
+ is_dict_filter,
real_column_size));
+
+ const size_t new_struct_rows = struct_column->size() - old_struct_rows;
+ if (new_struct_rows == 0) {
+ return Status::OK();
+ }
+
+ MutableColumnPtr variant_column_ptr;
+ NullMap* null_map_ptr = nullptr;
+ auto mutable_column = doris_column->assume_mutable();
+ if (doris_column->is_nullable()) {
+ auto* nullable_column =
assert_cast<ColumnNullable*>(mutable_column.get());
+ variant_column_ptr = nullable_column->get_nested_column_ptr();
+ null_map_ptr = &nullable_column->get_null_map_data();
+ } else {
+ if (_field_schema->data_type->is_nullable()) {
+ return Status::Corruption("Not nullable column has null values in
parquet file");
+ }
+ variant_column_ptr = std::move(mutable_column);
+ }
+ auto* variant_column =
assert_cast<ColumnVariant*>(variant_column_ptr.get());
+
+ const IColumn* variant_struct_source = struct_column.get();
+ const NullMap* struct_null_map = nullptr;
+ if (const auto* nullable_struct =
check_and_get_column<ColumnNullable>(variant_struct_source)) {
+ struct_null_map = &nullable_struct->get_null_map_data();
+ variant_struct_source = &nullable_struct->get_nested_column();
+ }
+ const auto& variant_struct_column = assert_cast<const
ColumnStruct&>(*variant_struct_source);
+
+ const int value_idx = find_child_idx(*_field_schema, "value");
+ const int typed_value_idx = find_child_idx(*_field_schema, "typed_value");
+ if (value_idx < 0 && typed_value_idx >= 0 &&
+ can_direct_read_typed_value(_field_schema->children[typed_value_idx],
false, _column_ids)) {
+ MutableColumnPtr batch_variant_column =
Review Comment:
This direct typed-only branch can be taken even when pruning selected no
typed_value leaf. For example, on a typed-only VARIANT file with `metadata`
plus `typed_value` and no top-level `value`, a query like `v['missing']` makes
`extract_variant_nested_column_ids()` miss the typed child and fall back to
`add_variant_value()`, which can only add the root/metadata ids because `value`
is absent. Then `can_direct_read_typed_value(..., _column_ids)` returns true
because `has_selected_column(typed_value, column_ids)` is false,
`append_direct_typed_column_to_batch()` adds no subcolumns, and the following
`insert_range_from(*batch_variant_column, 1, new_struct_rows)` copies past the
one default row in the batch. Please either avoid the direct path unless at
least one typed leaf under `typed_value` is selected, or make the
empty-selection case append `new_struct_rows` empty/missing VARIANT rows, and
add coverage for a typed-only file queried with a non-existent/non-shredded
key. This is dis
tinct from the earlier full-projection and structural-name threads because
here the requested path is absent from the typed shard, leaving the fast-path
batch empty.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]