eldenmoon commented on code in PR #63192:
URL: https://github.com/apache/doris/pull/63192#discussion_r3234320583
##########
be/src/format/parquet/vparquet_column_reader.cpp:
##########
@@ -1001,6 +1784,140 @@ Status StructColumnReader::read_column_data(
return Status::OK();
}
+Status VariantColumnReader::init(io::FileReaderSPtr file, FieldSchema* field,
+ const tparquet::RowGroup& row_group, size_t
max_buf_size,
+ std::unordered_map<int,
tparquet::OffsetIndex>& col_offsets,
+ RuntimeState* state, bool in_collection,
+ const std::set<uint64_t>& column_ids,
+ const std::set<uint64_t>& filter_column_ids) {
+ _field_schema = field;
+ _column_ids = column_ids;
+ _variant_struct_field = std::make_unique<FieldSchema>(*field);
+
+ DataTypes child_types;
+ Strings child_names;
+ child_types.reserve(field->children.size());
+ child_names.reserve(field->children.size());
+ for (const auto& child : field->children) {
+ child_types.push_back(make_nullable(child.data_type));
+ child_names.push_back(child.name);
+ }
+ DataTypePtr variant_struct_type =
std::make_shared<DataTypeStruct>(child_types, child_names);
+ if (field->data_type->is_nullable()) {
+ variant_struct_type = make_nullable(variant_struct_type);
+ }
+ _variant_struct_field->data_type = variant_struct_type;
+
+ RETURN_IF_ERROR(ParquetColumnReader::create(file,
_variant_struct_field.get(), row_group,
+ _row_ranges, _ctz, _io_ctx,
_struct_reader,
+ max_buf_size, col_offsets,
state, in_collection,
+ column_ids,
filter_column_ids));
+ _struct_reader->set_column_in_nested();
+ return Status::OK();
+}
+
+Status VariantColumnReader::read_column_data(
+ ColumnPtr& doris_column, const DataTypePtr& type,
+ const std::shared_ptr<TableSchemaChangeHelper::Node>& root_node,
FilterMap& filter_map,
+ size_t batch_size, size_t* read_rows, bool* eof, bool is_dict_filter,
+ int64_t real_column_size) {
+ (void)root_node;
+ if (remove_nullable(type)->get_primitive_type() !=
PrimitiveType::TYPE_VARIANT) {
+ return Status::Corruption(
+ "Wrong data type for column '{}', expected Variant type,
actual type: {}.",
+ _field_schema->name, type->get_name());
+ }
+
+ const auto& variant_struct_type = _variant_struct_field->data_type;
+ ColumnPtr struct_column = variant_struct_type->create_column();
+ const size_t old_struct_rows = struct_column->size();
+ auto const_node = TableSchemaChangeHelper::ConstNode::get_instance();
+ RETURN_IF_ERROR(_struct_reader->read_column_data(struct_column,
variant_struct_type, const_node,
+ filter_map, batch_size,
read_rows, eof,
+ is_dict_filter,
real_column_size));
+
+ const size_t new_struct_rows = struct_column->size() - old_struct_rows;
+ if (new_struct_rows == 0) {
+ return Status::OK();
+ }
+
+ MutableColumnPtr variant_column_ptr;
+ NullMap* null_map_ptr = nullptr;
+ auto mutable_column = doris_column->assume_mutable();
+ if (doris_column->is_nullable()) {
+ auto* nullable_column =
assert_cast<ColumnNullable*>(mutable_column.get());
+ variant_column_ptr = nullable_column->get_nested_column_ptr();
+ null_map_ptr = &nullable_column->get_null_map_data();
+ } else {
+ if (_field_schema->data_type->is_nullable()) {
+ return Status::Corruption("Not nullable column has null values in
parquet file");
+ }
+ variant_column_ptr = std::move(mutable_column);
+ }
+ auto* variant_column =
assert_cast<ColumnVariant*>(variant_column_ptr.get());
+
+ const IColumn* variant_struct_source = struct_column.get();
+ const NullMap* struct_null_map = nullptr;
+ if (const auto* nullable_struct =
check_and_get_column<ColumnNullable>(variant_struct_source)) {
+ struct_null_map = &nullable_struct->get_null_map_data();
+ variant_struct_source = &nullable_struct->get_nested_column();
+ }
+ const auto& variant_struct_column = assert_cast<const
ColumnStruct&>(*variant_struct_source);
+
+ const int value_idx = find_child_idx(*_field_schema, "value");
+ const int typed_value_idx = find_child_idx(*_field_schema, "typed_value");
+ if (value_idx < 0 && typed_value_idx >= 0 &&
+ can_direct_read_typed_value(_field_schema->children[typed_value_idx],
false, _column_ids)) {
+ MutableColumnPtr batch_variant_column =
Review Comment:
Fixed in current head `7aef30cdbcdebd9735846948a34f710fcb054f2b`. The
typed-only fast path now requires at least one selected direct typed leaf under
`typed_value`; if pruning selected only the root/metadata for a missing key, it
falls back to the row-wise path and appends the correct number of missing/null
VARIANT rows instead of copying past the default batch row. Added
`ParquetVariantReaderTest.DirectTypedOnlyRequiresSelectedTypedLeaf`, plus a
`typed_only_missing_field` regression and a profile assertion that
`v['missing']` reads only `v.metadata` and neither `v.typed_value*` nor
`v.value`. Verified locally
with `./run-be-ut.sh --run
--filter="ParquetVariantReaderTest.*:NestedColumnAccessHelperTest.*"`,
`./build.sh --be`, `./run-regression-test.sh --run --conf
tmp/regression-conf.auto.groovy -d external_table_p0/tvf -s
test_local_tvf_iceberg_variant -forceGenOut`, the same regression without
`-forceGenOut`, `build-support/check-format.sh`, and `git diff --check`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]