github-actions[bot] commented on code in PR #63192:
URL: https://github.com/apache/doris/pull/63192#discussion_r3253219373
##########
be/src/format/parquet/vparquet_column_reader.cpp:
##########
@@ -103,6 +127,1534 @@ static void fill_array_offset(FieldSchema* field,
ColumnArray::Offsets64& offset
}
}
+// Offset subtracted from daynr() so that date values are counted from the Unix epoch.
+static constexpr int64_t UNIX_EPOCH_DAYNR = 719'528;
+// Factor used to scale whole seconds to microseconds.
+static constexpr int64_t MICROS_PER_SECOND = 1'000'000;
+
+// Returns the day count of `value` relative to UNIX_EPOCH_DAYNR.
+static int64_t variant_date_value(const VecDateTimeValue& value) {
+    const int64_t days_since_epoch = value.daynr() - UNIX_EPOCH_DAYNR;
+    return days_since_epoch;
+}
+
+// DateV2 overload: returns the day count of `value` relative to UNIX_EPOCH_DAYNR.
+static int64_t variant_date_value(const DateV2Value<DateV2ValueType>& value) {
+    const int64_t days_since_epoch = value.daynr() - UNIX_EPOCH_DAYNR;
+    return days_since_epoch;
+}
+
+// Converts `value` to microseconds: the whole-second timestamp reported by unix_timestamp() for
+// the UTC time zone, scaled by MICROS_PER_SECOND. This overload adds no sub-second part.
+static int64_t variant_datetime_value(const VecDateTimeValue& value) {
+    int64_t timestamp = 0;
+    // unix_timestamp() writes its result through the pointer argument.
+    value.unix_timestamp(&timestamp, cctz::utc_time_zone());
+    return timestamp * MICROS_PER_SECOND;
+}
+
+// Converts `value` to microseconds: the whole-second timestamp reported by unix_timestamp() for
+// the UTC time zone, scaled by MICROS_PER_SECOND, plus the value's microsecond() component.
+static int64_t variant_datetime_value(const DateV2Value<DateTimeV2ValueType>& value) {
+    int64_t timestamp = 0;
+    // unix_timestamp() writes its result through the pointer argument.
+    value.unix_timestamp(&timestamp, cctz::utc_time_zone());
+    return timestamp * MICROS_PER_SECOND + value.microsecond();
+}
+
+// Converts `value` to microseconds: the whole-second timestamp reported by unix_timestamp() for
+// the UTC time zone, scaled by MICROS_PER_SECOND, plus the value's microsecond() component.
+static int64_t variant_datetime_value(const TimestampTzValue& value) {
+    int64_t timestamp = 0;
+    // unix_timestamp() writes its result through the pointer argument.
+    value.unix_timestamp(&timestamp, cctz::utc_time_zone());
+    return timestamp * MICROS_PER_SECOND + value.microsecond();
+}
+
+// Returns the index of the first child of `field` whose lower_case_name equals `name`, or -1
+// when no child matches.
+static int find_child_idx(const FieldSchema& field, std::string_view name) {
+    // Iterate with an unsigned index so the bound check does not mix signed and unsigned values.
+    const size_t child_count = field.children.size();
+    for (size_t idx = 0; idx < child_count; ++idx) {
+        if (field.children[idx].lower_case_name == name) {
+            return static_cast<int>(idx);
+        }
+    }
+    return -1;
+}
+
+// True when the (nullable-stripped) type of `field` is a struct or an array.
+static bool is_variant_wrapper_typed_value_child(const FieldSchema& field) {
+    const auto primitive = remove_nullable(field.data_type)->get_primitive_type();
+    return primitive == TYPE_STRUCT || primitive == TYPE_ARRAY;
+}
+
+// Decides whether `field` has the shape of a VARIANT wrapper group.
+//
+// The field must be a struct or variant whose children are all named "metadata", "value" or
+// "typed_value"; "metadata" and "value" must be BYTE_ARRAY. Accepted combinations:
+//   - value present: typed_value present, or metadata present on a TYPE_VARIANT field;
+//   - value absent:  typed_value present and either `allow_scalar_typed_value_only_wrapper` is
+//     set or typed_value is itself a struct/array.
+static bool is_variant_wrapper_field(const FieldSchema& field,
+                                     bool allow_scalar_typed_value_only_wrapper) {
+    const auto primitive = remove_nullable(field.data_type)->get_primitive_type();
+    if (primitive != TYPE_STRUCT && primitive != TYPE_VARIANT) {
+        return false;
+    }
+
+    bool saw_metadata = false;
+    bool saw_value = false;
+    const FieldSchema* typed_child = nullptr;
+    for (const auto& child : field.children) {
+        const auto& child_name = child.lower_case_name;
+        if (child_name == "metadata" || child_name == "value") {
+            if (child.physical_type != tparquet::Type::BYTE_ARRAY) {
+                return false;
+            }
+            (child_name == "metadata" ? saw_metadata : saw_value) = true;
+        } else if (child_name == "typed_value") {
+            typed_child = &child;
+        } else {
+            // Any other child name disqualifies the group.
+            return false;
+        }
+    }
+
+    const bool has_typed_value = typed_child != nullptr;
+    if (saw_value) {
+        return has_typed_value || (saw_metadata && primitive == TYPE_VARIANT);
+    }
+    return has_typed_value && (allow_scalar_typed_value_only_wrapper ||
+                               is_variant_wrapper_typed_value_child(*typed_child));
+}
+
+// True when `field` is a struct or variant that has at least one child and every child is a
+// BYTE_ARRAY named "value".
+static bool is_value_only_variant_wrapper_candidate(const FieldSchema& field) {
+    const auto primitive = remove_nullable(field.data_type)->get_primitive_type();
+    if (primitive != TYPE_STRUCT && primitive != TYPE_VARIANT) {
+        return false;
+    }
+    if (field.children.empty()) {
+        return false;
+    }
+    return std::all_of(field.children.begin(), field.children.end(),
+                       [](const FieldSchema& child) {
+                           return child.lower_case_name == "value" &&
+                                  child.physical_type == tparquet::Type::BYTE_ARRAY;
+                       });
+}
+
+// Copies the bytes of a string-like or varbinary `field` into `*value`.
+//
+// `*present` is set to false (and `*value` left untouched) for a null field, and to true
+// otherwise. Returns Corruption when the field holds any other Doris type; `*present` is already
+// true in that case.
+static Status get_binary_field(const Field& field, std::string* value, bool* present) {
+    if (field.is_null()) {
+        *present = false;
+        return Status::OK();
+    }
+    *present = true;
+    switch (field.get_type()) {
+    case TYPE_STRING:
+        *value = field.get<TYPE_STRING>();
+        return Status::OK();
+    case TYPE_CHAR:
+        *value = field.get<TYPE_CHAR>();
+        return Status::OK();
+    case TYPE_VARCHAR:
+        *value = field.get<TYPE_VARCHAR>();
+        return Status::OK();
+    case TYPE_VARBINARY: {
+        auto ref = field.get<TYPE_VARBINARY>().to_string_ref();
+        value->assign(ref.data, ref.size);
+        return Status::OK();
+    }
+    default:
+        return Status::Corruption("Parquet VARIANT binary field has unexpected Doris type {}",
+                                  field.get_type_name());
+    }
+}
+
+// Concatenates two paths. When either side is empty the other one is returned unchanged
+// (an empty prefix yields `suffix`).
+static PathInData append_path(const PathInData& prefix, const PathInData& suffix) {
+    if (prefix.empty() || suffix.empty()) {
+        return prefix.empty() ? suffix : prefix;
+    }
+    PathInDataBuilder joined;
+    joined.append(prefix.get_parts(), false);
+    joined.append(suffix.get_parts(), false);
+    return joined.build();
+}
+
+// Parses `json` into JSONB and stores it in `value` as a scalar TYPE_JSONB field with zero
+// dimensions, precision and scale. Propagates the parse error, leaving `value` untouched.
+static Status make_jsonb_field(std::string_view json, FieldWithDataType* value) {
+    JsonBinaryValue parsed;
+    RETURN_IF_ERROR(parsed.from_json_string(json.data(), json.size()));
+
+    value->base_scalar_type_id = TYPE_JSONB;
+    value->num_dimensions = 0;
+    value->precision = 0;
+    value->scale = 0;
+    value->field = Field::create_field<TYPE_JSONB>(JsonbField(parsed.value(), parsed.size()));
+    return Status::OK();
+}
+
+// Builds the JSON text of an array holding `elements` nulls, e.g. "[null,null]" for 2 and "[]"
+// for 0.
+static std::string make_null_array_json(size_t elements) {
+    std::string json(1, '[');
+    for (size_t remaining = elements; remaining > 0; --remaining) {
+        json += "null";
+        if (remaining > 1) {
+            json += ',';
+        }
+    }
+    json += ']';
+    return json;
+}
+
+// Stores the JSONB encoding of "{}" into `*field`.
+static Status make_empty_object_field(Field* field) {
+    FieldWithDataType empty_object;
+    RETURN_IF_ERROR(make_jsonb_field("{}", &empty_object));
+    *field = std::move(empty_object.field);
+    return Status::OK();
+}
+
+// Parses `json` into a JSONB field and stores it in `values` under `path`, replacing any entry
+// already present for that path.
+static Status insert_jsonb_value(const PathInData& path, std::string_view json,
+                                 VariantMap* values) {
+    FieldWithDataType jsonb_entry;
+    RETURN_IF_ERROR(make_jsonb_field(json, &jsonb_entry));
+    (*values)[path] = std::move(jsonb_entry);
+    return Status::OK();
+}
+
+// Records an empty JSON object ("{}") at `path`; such entries are what is_empty_object_marker()
+// recognises.
+static Status insert_empty_object_marker(const PathInData& path, VariantMap* values) {
+    constexpr std::string_view empty_object_json = "{}";
+    return insert_jsonb_value(path, empty_object_json, values);
+}
+
+// True when `value` holds a JSONB document whose root is an object with zero members.
+// Non-JSONB fields and documents that fail validation yield false.
+static bool is_empty_object_marker(const FieldWithDataType& value) {
+    if (value.field.get_type() != TYPE_JSONB) {
+        return false;
+    }
+    const auto& jsonb = value.field.get<TYPE_JSONB>();
+    const JsonbDocument* document = nullptr;
+    Status st = JsonbDocument::checkAndCreateDocument(jsonb.get_value(), jsonb.get_size(),
+                                                      &document);
+    if (!st.ok()) {
+        return false;
+    }
+    if (document == nullptr) {
+        return false;
+    }
+    const auto* root = document->getValue();
+    if (root == nullptr || !root->isObject()) {
+        return false;
+    }
+    return root->unpack<ObjectVal>()->numElem() == 0;
+}
+
+// Walks the JSON objects nested in `value` and inserts an empty-object marker for every object
+// that has no members. `path` tracks the current location: each member key is appended before
+// descending and popped again afterwards. Non-object values are ignored.
+static Status collect_empty_object_markers(const rapidjson::Value& value, PathInDataBuilder* path,
+                                           VariantMap* values) {
+    if (!value.IsObject()) {
+        return Status::OK();
+    }
+    if (value.MemberCount() == 0) {
+        return insert_empty_object_marker(path->build(), values);
+    }
+    for (auto member = value.MemberBegin(); member != value.MemberEnd(); ++member) {
+        if (!member->value.IsObject()) {
+            continue;
+        }
+        const std::string_view key(member->name.GetString(), member->name.GetStringLength());
+        path->append(key, false);
+        RETURN_IF_ERROR(collect_empty_object_markers(member->value, path, values));
+        path->pop_back();
+    }
+    return Status::OK();
+}
+
+// Adds empty-object markers for every empty object found in `json`, with paths rooted at
+// `prefix`. Text without the substring "{}" is skipped without parsing. Returns Corruption when
+// the text is not valid JSON.
+static Status add_empty_object_markers_from_json(const std::string& json, const PathInData& prefix,
+                                                 VariantMap* values) {
+    const bool may_contain_empty_object = json.find("{}") != std::string::npos;
+    if (!may_contain_empty_object) {
+        return Status::OK();
+    }
+
+    rapidjson::Document parsed;
+    parsed.Parse(json.data(), json.size());
+    if (parsed.HasParseError()) {
+        return Status::Corruption("Invalid Parquet VARIANT decoded JSON");
+    }
+
+    PathInDataBuilder current_path;
+    current_path.append(prefix.get_parts(), false);
+    return collect_empty_object_markers(parsed, &current_path, values);
+}
+
+// Parses `json` through a temporary ColumnVariant and moves the resulting path/value pairs into
+// `values`, each path prefixed with `prefix`. Empty-object markers found in the JSON text are
+// added afterwards.
+static Status parse_json_to_variant_map(const std::string& json, const PathInData& prefix,
+                                        VariantMap* values) {
+    auto scratch_column = ColumnVariant::create(0, false);
+    ParseConfig parse_config;
+    StringRef json_ref(json.data(), json.size());
+    RETURN_IF_CATCH_EXCEPTION(
+            variant_util::parse_json_to_variant(*scratch_column, json_ref, nullptr, parse_config));
+
+    Field parsed_row = (*scratch_column)[0];
+    if (!parsed_row.is_null()) {
+        auto& parsed_entries = parsed_row.get<TYPE_VARIANT>();
+        for (auto& [relative_path, entry] : parsed_entries) {
+            (*values)[append_path(prefix, relative_path)] = std::move(entry);
+        }
+    }
+    return add_empty_object_markers_from_json(json, prefix, values);
+}
+
+// Serializes `values` to text by inserting it as the single row of a temporary ColumnVariant
+// and passing `json` to serialize_one_row_to_string() for that row.
+static Status variant_map_to_json(VariantMap values, std::string* json) {
+    auto scratch_column = ColumnVariant::create(0, false);
+    RETURN_IF_CATCH_EXCEPTION(
+            scratch_column->insert(Field::create_field<TYPE_VARIANT>(std::move(values))));
+    DataTypeSerDe::FormatOptions format_options;
+    scratch_column->serialize_one_row_to_string(0, json, format_options);
+    return Status::OK();
+}
+
+// True when the leading parts of `path` equal all parts of `prefix` (a path is a prefix of
+// itself; an empty prefix matches everything).
+static bool path_has_prefix(const PathInData& path, const PathInData& prefix) {
+    const auto& path_parts = path.get_parts();
+    const auto& wanted_parts = prefix.get_parts();
+    if (path_parts.size() < wanted_parts.size()) {
+        return false;
+    }
+    return std::equal(wanted_parts.begin(), wanted_parts.end(), path_parts.begin(),
+                      [](const auto& wanted, const auto& actual) { return !(actual != wanted); });
+}
+
+// True when `values` contains a path that is strictly longer than `prefix` and starts with it.
+static bool has_descendant_path(const VariantMap& values, const PathInData& prefix) {
+    const size_t prefix_size = prefix.get_parts().size();
+    for (const auto& entry : values) {
+        const auto& candidate = entry.first;
+        if (candidate.get_parts().size() > prefix_size && path_has_prefix(candidate, prefix)) {
+            return true;
+        }
+    }
+    return false;
+}
+
+// Removes from `values` every empty-object marker whose path has a descendant either in
+// `values` itself or in `shadowing_values`.
+static void erase_shadowed_empty_object_markers(VariantMap* values,
+                                                const VariantMap& shadowing_values) {
+    auto it = values->begin();
+    while (it != values->end()) {
+        const bool shadowed = is_empty_object_marker(it->second) &&
+                              (has_descendant_path(*values, it->first) ||
+                               has_descendant_path(shadowing_values, it->first));
+        if (shadowed) {
+            // erase() hands back the iterator to continue from.
+            it = values->erase(it);
+        } else {
+            ++it;
+        }
+    }
+}
+
+// Two-way variant: prunes shadowed markers from `value_values` first (against `typed_values`),
+// then from `typed_values` against the already pruned `value_values`.
+static void erase_shadowed_empty_object_markers(VariantMap* value_values,
+                                                VariantMap* typed_values) {
+    const VariantMap& typed_view = *typed_values;
+    erase_shadowed_empty_object_markers(value_values, typed_view);
+
+    const VariantMap& value_view = *value_values;
+    erase_shadowed_empty_object_markers(typed_values, value_view);
+}
+
+// Verifies that the residual `value` entries and the `typed_value` entries below `prefix` do
+// not describe the same field twice.
+//
+// Only entries whose path starts with `prefix` are considered. Returns Corruption when:
+//   - a residual entry sits exactly at `prefix`, is not a tolerated empty-object marker, and
+//     `typed_values` is non-empty;
+//   - a typed entry sits exactly at `prefix` (and is not a tolerated empty-object marker) while
+//     a deeper residual entry exists;
+//   - a residual and a typed entry share the same first path part below `prefix`, unless both
+//     are empty-object markers at the identical path.
+// An empty-object marker is tolerated when the other map has no descendant of its path.
+static Status check_no_shredded_value_typed_duplicates(const VariantMap& value_values,
+                                                       const VariantMap& typed_values,
+                                                       const PathInData& prefix) {
+    const size_t prefix_size = prefix.get_parts().size();
+    for (const auto& value_entry : value_values) {
+        const auto& value_path = value_entry.first;
+        if (!path_has_prefix(value_path, prefix)) {
+            continue;
+        }
+        if (value_path.get_parts().size() == prefix_size) {
+            // Residual value located exactly at the prefix.
+            if (is_empty_object_marker(value_entry.second) &&
+                !has_descendant_path(typed_values, value_path)) {
+                continue;
+            }
+            if (!typed_values.empty()) {
+                return Status::Corruption(
+                        "Parquet VARIANT residual value conflicts with typed_value at path {}",
+                        value_path.get_path());
+            }
+            continue;
+        }
+        // From here on value_path has at least one part below the prefix.
+        for (const auto& typed_entry : typed_values) {
+            const auto& typed_path = typed_entry.first;
+            if (!path_has_prefix(typed_path, prefix)) {
+                continue;
+            }
+            if (typed_path.get_parts().size() == prefix_size) {
+                if (is_empty_object_marker(typed_entry.second) &&
+                    !has_descendant_path(value_values, typed_path)) {
+                    continue;
+                }
+                return Status::Corruption(
+                        "Parquet VARIANT residual value and typed_value contain duplicate field {}",
+                        value_path.get_parts()[prefix_size].key);
+            }
+            if (value_path.get_parts()[prefix_size] == typed_path.get_parts()[prefix_size]) {
+                if (value_path == typed_path && is_empty_object_marker(value_entry.second) &&
+                    is_empty_object_marker(typed_entry.second)) {
+                    continue;
+                }
+                return Status::Corruption(
+                        "Parquet VARIANT residual value and typed_value contain duplicate field {}",
+                        value_path.get_parts()[prefix_size].key);
+            }
+        }
+    }
+    return Status::OK();
+}
+
+// True when any of the given null maps marks `row` as null.
+static bool has_direct_typed_parent_null(const std::vector<const NullMap*>& null_maps, size_t row) {
+    for (const NullMap* parent_nulls : null_maps) {
+        DCHECK_LT(row, parent_nulls->size());
+        if ((*parent_nulls)[row]) {
+            return true;
+        }
+    }
+    return false;
+}
+
+// Appends rows [start, start + rows) of `column` to the nullable `variant_leaf`.
+//
+// Values are copied from the nested column when `column` is nullable, otherwise from `column`
+// itself. A row is flagged null in the leaf when the source marks it null or when any map in
+// `parent_null_maps` does.
+static void insert_direct_typed_leaf_range(const IColumn& column, size_t start, size_t rows,
+                                           const std::vector<const NullMap*>& parent_null_maps,
+                                           IColumn* variant_leaf) {
+    auto& nullable_leaf = assert_cast<ColumnNullable&>(*variant_leaf);
+    const auto* nullable_source = check_and_get_column<ColumnNullable>(&column);
+    const IColumn& source_values =
+            nullable_source != nullptr ? nullable_source->get_nested_column() : column;
+    const NullMap* source_nulls =
+            nullable_source != nullptr ? &nullable_source->get_null_map_data() : nullptr;
+
+    nullable_leaf.get_nested_column().insert_range_from(source_values, start, rows);
+
+    auto& leaf_nulls = nullable_leaf.get_null_map_data();
+    leaf_nulls.reserve(leaf_nulls.size() + rows);
+    for (size_t row = start; row < start + rows; ++row) {
+        const bool source_is_null = source_nulls != nullptr && (*source_nulls)[row];
+        leaf_nulls.push_back(source_is_null ||
+                             has_direct_typed_parent_null(parent_null_maps, row));
+    }
+}
+
+// True for the time, date, datetime and timestamp-with-time-zone primitive types.
+static bool is_temporal_variant_leaf_type(PrimitiveType type) {
+    return type == TYPE_TIMEV2 || type == TYPE_DATE || type == TYPE_DATETIME ||
+           type == TYPE_DATEV2 || type == TYPE_DATETIMEV2 || type == TYPE_TIMESTAMPTZ;
+}
+
+// Forward declarations: both helpers are defined later in this file but used before that point.
+static bool contains_uuid_typed_value_field(const FieldSchema& field_schema);
+static bool is_uuid_typed_value_field(const FieldSchema& field_schema);
+
+// Type used for a directly read leaf: Int64 for temporal types, otherwise the
+// nullable-stripped input type.
+static DataTypePtr direct_variant_leaf_type(const DataTypePtr& data_type) {
+    DataTypePtr unwrapped = remove_nullable(data_type);
+    if (!is_temporal_variant_leaf_type(unwrapped->get_primitive_type())) {
+        return unwrapped;
+    }
+    return std::make_shared<DataTypeInt64>();
+}
+
+// Schema-aware overload: UUID fields are read as strings; every other field defers to the
+// DataTypePtr overload.
+static DataTypePtr direct_variant_leaf_type(const FieldSchema& field_schema) {
+    if (!is_uuid_typed_value_field(field_schema)) {
+        return direct_variant_leaf_type(field_schema.data_type);
+    }
+    return std::make_shared<DataTypeString>();
+}
+
+// True when `data_type`, or the element type reached by unwrapping nested arrays, is temporal.
+static bool contains_temporal_variant_leaf_type(const DataTypePtr& data_type) {
+    auto current = remove_nullable(data_type);
+    while (true) {
+        const auto primitive = current->get_primitive_type();
+        if (is_temporal_variant_leaf_type(primitive)) {
+            return true;
+        }
+        if (primitive != TYPE_ARRAY) {
+            return false;
+        }
+        // Step into the array element type and test again.
+        current = remove_nullable(
+                assert_cast<const DataTypeArray*>(current.get())->get_nested_type());
+    }
+}
+
+// Reads row `row` of a temporal `column` as an int64: TIMEV2 values are rounded with llround,
+// date types go through variant_date_value(), and datetime/timestamp types go through
+// variant_datetime_value(). Any other `type` trips DORIS_CHECK.
+static int64_t direct_temporal_variant_value(PrimitiveType type, const IColumn& column,
+                                             size_t row) {
+    switch (type) {
+    case TYPE_TIMEV2: {
+        const auto& times = assert_cast<const ColumnTimeV2&>(column).get_data();
+        return static_cast<int64_t>(std::llround(times[row]));
+    }
+    case TYPE_DATE: {
+        const auto& dates = assert_cast<const ColumnDate&>(column).get_data();
+        return variant_date_value(dates[row]);
+    }
+    case TYPE_DATETIME: {
+        const auto& datetimes = assert_cast<const ColumnDateTime&>(column).get_data();
+        return variant_datetime_value(datetimes[row]);
+    }
+    case TYPE_DATEV2: {
+        const auto& dates = assert_cast<const ColumnDateV2&>(column).get_data();
+        return variant_date_value(dates[row]);
+    }
+    case TYPE_DATETIMEV2: {
+        const auto& datetimes = assert_cast<const ColumnDateTimeV2&>(column).get_data();
+        return variant_datetime_value(datetimes[row]);
+    }
+    case TYPE_TIMESTAMPTZ: {
+        const auto& timestamps = assert_cast<const ColumnTimeStampTz&>(column).get_data();
+        return variant_datetime_value(timestamps[row]);
+    }
+    default:
+        DORIS_CHECK(false);
+        return 0;
+    }
+}
+
+// Appends rows [start, start + rows) of a temporal `column` to the nullable Int64
+// `variant_leaf`, converting each row with direct_temporal_variant_value().
+//
+// The conversion runs for every row, null or not. A row is flagged null in the leaf when the
+// source marks it null or when any map in `parent_null_maps` does.
+static void insert_direct_typed_temporal_leaf_range(
+        PrimitiveType type, const IColumn& column, size_t start, size_t rows,
+        const std::vector<const NullMap*>& parent_null_maps, IColumn* variant_leaf) {
+    auto& nullable_leaf = assert_cast<ColumnNullable&>(*variant_leaf);
+    const auto* nullable_source = check_and_get_column<ColumnNullable>(&column);
+    const IColumn& source_values =
+            nullable_source != nullptr ? nullable_source->get_nested_column() : column;
+    const NullMap* source_nulls =
+            nullable_source != nullptr ? &nullable_source->get_null_map_data() : nullptr;
+
+    auto& leaf_values = assert_cast<ColumnInt64&>(nullable_leaf.get_nested_column()).get_data();
+    leaf_values.reserve(leaf_values.size() + rows);
+    auto& leaf_nulls = nullable_leaf.get_null_map_data();
+    leaf_nulls.reserve(leaf_nulls.size() + rows);
+
+    for (size_t row = start; row < start + rows; ++row) {
+        leaf_values.push_back(direct_temporal_variant_value(type, source_values, row));
+        const bool source_is_null = source_nulls != nullptr && (*source_nulls)[row];
+        leaf_nulls.push_back(source_is_null ||
+                             has_direct_typed_parent_null(parent_null_maps, row));
+    }
+}
+
+// Appends rows [start, start + rows) of a UUID `column` to the nullable string `variant_leaf`.
+//
+// Null rows (null in the source or in any map of `parent_null_maps`) get a default string and a
+// null flag. Every other row must hold exactly 16 bytes, which are rendered with
+// parquet::format_variant_uuid(). Returns Corruption on the first row with a different length;
+// rows appended before that row stay in the leaf.
+static Status insert_direct_typed_uuid_leaf_range(
+        const IColumn& column, size_t start, size_t rows,
+        const std::vector<const NullMap*>& parent_null_maps, IColumn* variant_leaf) {
+    auto& nullable_leaf = assert_cast<ColumnNullable&>(*variant_leaf);
+    const IColumn* value_column = &column;
+    const NullMap* leaf_null_map = nullptr;
+    if (const auto* nullable_column = check_and_get_column<ColumnNullable>(&column)) {
+        value_column = &nullable_column->get_nested_column();
+        leaf_null_map = &nullable_column->get_null_map_data();
+    }
+
+    auto& data = assert_cast<ColumnString&>(nullable_leaf.get_nested_column());
+    auto& null_map = nullable_leaf.get_null_map_data();
+    null_map.reserve(null_map.size() + rows);
+    for (size_t i = 0; i < rows; ++i) {
+        const size_t row = start + i;
+        const bool leaf_is_null = leaf_null_map != nullptr && (*leaf_null_map)[row];
+        const bool is_null = leaf_is_null || has_direct_typed_parent_null(parent_null_maps, row);
+        if (is_null) {
+            data.insert_default();
+            null_map.push_back(1);
+            continue;
+        }
+        StringRef bytes = value_column->get_data_at(row);
+        if (bytes.size != 16) {
+            return Status::Corruption("Parquet VARIANT UUID typed_value has invalid length {}",
+                                      bytes.size);
+        }
+        std::string uuid =
+                parquet::format_variant_uuid(reinterpret_cast<const uint8_t*>(bytes.data));
+        data.insert_data(uuid.data(), uuid.size());
+        null_map.push_back(0);
+    }
+    return Status::OK();
+}
+
+// Appends `value` to `*json` in the form VectorBufferWriter::write_json_string() produces,
+// using a temporary string column as the write buffer.
+static void append_json_string(std::string_view value, std::string* json) {
+    auto scratch = ColumnString::create();
+    VectorBufferWriter writer(*scratch);
+    writer.write_json_string(value);
+    writer.commit();
+    const StringRef encoded = scratch->get_data_at(0);
+    json->append(encoded.data, encoded.size);
+}
+
+// True when `column_ids` is empty or contains the column id of `field_schema`.
+static bool is_column_selected(const FieldSchema& field_schema,
+                               const std::set<uint64_t>& column_ids) {
+    if (column_ids.empty()) {
+        return true;
+    }
+    return column_ids.count(field_schema.get_column_id()) != 0;
+}
+
+// True when `field_schema` itself or any field below it is selected by `column_ids`.
+static bool has_selected_column(const FieldSchema& field_schema,
+                                const std::set<uint64_t>& column_ids) {
+    if (is_column_selected(field_schema, column_ids)) {
+        return true;
+    }
+    for (const FieldSchema& child : field_schema.children) {
+        if (has_selected_column(child, column_ids)) {
+            return true;
+        }
+    }
+    return false;
+}
+
+// True when `data_type` can be copied straight into a variant leaf: boolean, integer, decimal,
+// string-like, varbinary and temporal scalars, plus arrays whose element type qualifies and
+// contains no temporal type.
+static bool is_direct_variant_leaf_type(const DataTypePtr& data_type) {
+    const auto& type = remove_nullable(data_type);
+    const auto primitive = type->get_primitive_type();
+    if (primitive == TYPE_ARRAY) {
+        const auto& element_type =
+                assert_cast<const DataTypeArray*>(type.get())->get_nested_type();
+        if (contains_temporal_variant_leaf_type(element_type)) {
+            return false;
+        }
+        return is_direct_variant_leaf_type(element_type);
+    }
+    switch (primitive) {
+    case TYPE_BOOLEAN:
+    case TYPE_TINYINT:
+    case TYPE_SMALLINT:
+    case TYPE_INT:
+    case TYPE_BIGINT:
+    case TYPE_LARGEINT:
+    case TYPE_DECIMALV2:
+    case TYPE_DECIMAL32:
+    case TYPE_DECIMAL64:
+    case TYPE_DECIMAL128I:
+    case TYPE_DECIMAL256:
+    case TYPE_STRING:
+    case TYPE_CHAR:
+    case TYPE_VARCHAR:
+    case TYPE_VARBINARY:
+    case TYPE_TIMEV2:
+    case TYPE_DATE:
+    case TYPE_DATETIME:
+    case TYPE_DATEV2:
+    case TYPE_DATETIMEV2:
+    case TYPE_TIMESTAMPTZ:
+        return true;
+    default:
+        return false;
+    }
+}
+
+// Decides whether the selected part of `field_schema` can be read directly from typed columns.
+//
+//   - A subtree with no selected column is trivially readable.
+//   - A variant wrapper (only recognised when `allow_variant_wrapper` is set) is readable when
+//     its "value" child is absent or unselected and its "typed_value" child is readable.
+//   - A struct is readable when every child is (children may again be wrappers).
+//   - An array containing a UUID field is not readable.
+//   - Anything else is readable when is_direct_variant_leaf_type() accepts its type.
+static bool can_direct_read_typed_value(const FieldSchema& field_schema, bool allow_variant_wrapper,
+                                        const std::set<uint64_t>& column_ids) {
+    if (!has_selected_column(field_schema, column_ids)) {
+        return true;
+    }
+    if (allow_variant_wrapper && is_variant_wrapper_field(field_schema, false)) {
+        const int value_idx = find_child_idx(field_schema, "value");
+        const int typed_value_idx = find_child_idx(field_schema, "typed_value");
+        if (value_idx >= 0 && has_selected_column(field_schema.children[value_idx], column_ids)) {
+            return false;
+        }
+        if (typed_value_idx < 0) {
+            return false;
+        }
+        return can_direct_read_typed_value(field_schema.children[typed_value_idx], false,
+                                           column_ids);
+    }
+
+    const auto& type = remove_nullable(field_schema.data_type);
+    const auto primitive = type->get_primitive_type();
+    if (primitive == TYPE_STRUCT) {
+        for (const FieldSchema& child : field_schema.children) {
+            if (!can_direct_read_typed_value(child, true, column_ids)) {
+                return false;
+            }
+        }
+        return true;
+    }
+    if (primitive == TYPE_ARRAY && contains_uuid_typed_value_field(field_schema)) {
+        return false;
+    }
+    return is_direct_variant_leaf_type(field_schema.data_type);
+}
+
+// True when the selected part of `field_schema` contains at least one leaf that
+// is_direct_variant_leaf_type() accepts.
+//
+// Variant wrappers (only recognised when `allow_variant_wrapper` is set) are looked through
+// via their "typed_value" child, structs via any child, and arrays containing a UUID field
+// never count.
+static bool has_selected_direct_typed_leaf(const FieldSchema& field_schema,
+                                           bool allow_variant_wrapper,
+                                           const std::set<uint64_t>& column_ids) {
+    if (!has_selected_column(field_schema, column_ids)) {
+        return false;
+    }
+    if (allow_variant_wrapper && is_variant_wrapper_field(field_schema, false)) {
+        const int typed_value_idx = find_child_idx(field_schema, "typed_value");
+        DCHECK_GE(typed_value_idx, 0);
+        const FieldSchema& typed_child = field_schema.children[typed_value_idx];
+        return has_selected_direct_typed_leaf(typed_child, false, column_ids);
+    }
+
+    const auto& type = remove_nullable(field_schema.data_type);
+    const auto primitive = type->get_primitive_type();
+    if (primitive == TYPE_STRUCT) {
+        for (const FieldSchema& child : field_schema.children) {
+            if (has_selected_direct_typed_leaf(child, true, column_ids)) {
+                return true;
+            }
+        }
+        return false;
+    }
+    if (primitive == TYPE_ARRAY && contains_uuid_typed_value_field(field_schema)) {
+        return false;
+    }
+    return is_direct_variant_leaf_type(field_schema.data_type);
+}
+
+// True when `variant_field` can be served from its "typed_value" child alone: the "value" child
+// is absent or unselected, and "typed_value" exists, contains a selected direct leaf, and is
+// directly readable.
+static bool can_use_direct_typed_only_value(const FieldSchema& variant_field,
+                                            const std::set<uint64_t>& column_ids) {
+    const int value_idx = find_child_idx(variant_field, "value");
+    const int typed_value_idx = find_child_idx(variant_field, "typed_value");
+    if (value_idx >= 0 && has_selected_column(variant_field.children[value_idx], column_ids)) {
+        return false;
+    }
+    if (typed_value_idx < 0) {
+        return false;
+    }
+    const FieldSchema& typed_child = variant_field.children[typed_value_idx];
+    if (!has_selected_direct_typed_leaf(typed_child, false, column_ids)) {
+        return false;
+    }
+    return can_direct_read_typed_value(typed_child, false, column_ids);
+}
+
+// Builds a struct type with one member per child of `field`: each member keeps the child's
+// name and uses the nullable form of the child's type.
+static DataTypePtr make_variant_struct_reader_type(const FieldSchema& field) {
+    const size_t member_count = field.children.size();
+    DataTypes member_types;
+    Strings member_names;
+    member_types.reserve(member_count);
+    member_names.reserve(member_count);
+    for (const auto& child : field.children) {
+        member_names.push_back(child.name);
+        member_types.push_back(make_nullable(child.data_type));
+    }
+    return std::make_shared<DataTypeStruct>(member_types, member_names);
+}
+
+// Creates an empty column of `variant_struct_type`, wrapped as nullable when the type of
+// `field` is nullable.
+static ColumnPtr make_variant_struct_read_column(const FieldSchema& field,
+                                                 const DataTypePtr& variant_struct_type) {
+    const DataTypePtr column_type = field.data_type->is_nullable()
+                                            ? make_nullable(variant_struct_type)
+                                            : variant_struct_type;
+    return column_type->create_column();
+}
+
+// Fills base_scalar_type_id and num_dimensions of `value` from the information that
+// variant_util::get_field_info() derives from value->field.
+static void fill_variant_field_info(FieldWithDataType* value) {
+    FieldInfo derived;
+    variant_util::get_field_info(value->field, &derived);
+    // num_dimensions is narrowed to uint8_t below.
+    DCHECK_LE(derived.num_dimensions, std::numeric_limits<uint8_t>::max());
+    value->num_dimensions = static_cast<uint8_t>(derived.num_dimensions);
+    value->base_scalar_type_id = derived.scalar_type_id;
+}
+
+// Copies precision and scale into `value` when the innermost element type of `data_type`
+// (after unwrapping nullables and arrays) is a decimal; otherwise leaves `value` untouched.
+static void fill_variant_leaf_type_info(const DataTypePtr& data_type, FieldWithDataType* value) {
+    auto innermost = remove_nullable(data_type);
+    for (; innermost->get_primitive_type() == TYPE_ARRAY;
+         innermost = remove_nullable(
+                 assert_cast<const DataTypeArray*>(innermost.get())->get_nested_type())) {
+    }
+    if (!is_decimal(innermost->get_primitive_type())) {
+        return;
+    }
+    value->precision = innermost->get_precision();
+    value->scale = innermost->get_scale();
+}
+
+// Stores a floating point `field` into `value` and fills its scalar type information.
+// Returns NotSupported, leaving `value` untouched, when the number is NaN or infinite.
+template <PrimitiveType Primitive>
+static Status fill_floating_point_variant_field(const Field& field, FieldWithDataType* value) {
+    const auto typed_value = field.get<Primitive>();
+    if (!std::isfinite(typed_value)) {
+        return Status::NotSupported(
+                "Parquet VARIANT non-finite floating point typed_value is not supported");
+    }
+    value->field = field;
+    fill_variant_field_info(value);
+    return Status::OK();
+}
+
+// True when the Parquet schema element of `field_schema` has a logical type set and that
+// logical type is UUID.
+static bool is_uuid_typed_value_field(const FieldSchema& field_schema) {
+    const auto& schema_element = field_schema.parquet_schema;
+    if (!schema_element.__isset.logicalType) {
+        return false;
+    }
+    return schema_element.logicalType.__isset.UUID;
+}
+
+// True when `field_schema` or any field below it is a UUID field.
+static bool contains_uuid_typed_value_field(const FieldSchema& field_schema) {
+    if (is_uuid_typed_value_field(field_schema)) {
+        return true;
+    }
+    for (const FieldSchema& child : field_schema.children) {
+        if (contains_uuid_typed_value_field(child)) {
+            return true;
+        }
+    }
+    return false;
+}
+
+// Renders the 16 raw bytes held by a string-like or varbinary `field` into `*uuid` using
+// parquet::format_variant_uuid().
+//
+// Returns Corruption when the field holds another Doris type or its payload is not exactly 16
+// bytes long.
+static Status uuid_field_to_string(const Field& field, std::string* uuid) {
+    StringRef bytes;
+    switch (field.get_type()) {
+    case TYPE_STRING:
+        bytes = StringRef(field.get<TYPE_STRING>());
+        break;
+    case TYPE_CHAR:
+        bytes = StringRef(field.get<TYPE_CHAR>());
+        break;
+    case TYPE_VARCHAR:
+        bytes = StringRef(field.get<TYPE_VARCHAR>());
+        break;
+    case TYPE_VARBINARY:
+        bytes = field.get<TYPE_VARBINARY>().to_string_ref();
+        break;
+    default:
+        return Status::Corruption("Parquet VARIANT UUID typed_value has unexpected Doris type {}",
+                                  field.get_type_name());
+    }
+    if (bytes.size != 16) {
+        return Status::Corruption("Parquet VARIANT UUID typed_value has invalid length {}",
+                                  bytes.size);
+    }
+    *uuid = parquet::format_variant_uuid(reinterpret_cast<const uint8_t*>(bytes.data));
+    return Status::OK();
+}
+
+// Converts a UUID `field` to its string form and stores it in `value` as a TYPE_STRING scalar.
+// On a conversion error `value` is left untouched.
+static Status fill_uuid_variant_field(const Field& field, FieldWithDataType* value) {
+    std::string formatted_uuid;
+    RETURN_IF_ERROR(uuid_field_to_string(field, &formatted_uuid));
+    value->base_scalar_type_id = TYPE_STRING;
+    value->field = Field::create_field<TYPE_STRING>(std::move(formatted_uuid));
+    return Status::OK();
+}
+
+// Counts how many array levels wrap the innermost element type of `data_type` (0 for a
+// non-array type). Nullable wrappers are ignored at every level.
+static uint8_t direct_array_dimensions(const DataTypePtr& data_type) {
+    uint8_t depth = 0;
+    for (auto current = remove_nullable(data_type);
+         current->get_primitive_type() == TYPE_ARRAY;
+         current = remove_nullable(
+                 assert_cast<const DataTypeArray*>(current.get())->get_nested_type())) {
+        ++depth;
+    }
+    return depth;
+}
+
+// Recursively converts a (possibly nested) array of raw UUID values into the same array shape
+// holding UUID strings.
+//
+// A null `field` becomes a default-constructed Field. For an array schema every element is
+// converted against the first child schema. For a non-array schema the field itself must be a
+// UUID field. Returns Corruption when an array schema has no child or a leaf schema is not UUID;
+// errors from the UUID conversion are propagated.
+static Status convert_uuid_array_value(const FieldSchema& field_schema, const Field& field,
+                                       Field* converted) {
+    if (field.is_null()) {
+        *converted = Field();
+        return Status::OK();
+    }
+
+    const auto& type = remove_nullable(field_schema.data_type);
+    if (type->get_primitive_type() == TYPE_ARRAY) {
+        if (field_schema.children.empty()) {
+            return Status::Corruption(
+                    "Parquet VARIANT UUID array typed_value has no element schema");
+        }
+        Array converted_elements;
+        const auto& elements = field.get<TYPE_ARRAY>();
+        converted_elements.reserve(elements.size());
+        for (const auto& element : elements) {
+            Field converted_element;
+            RETURN_IF_ERROR(convert_uuid_array_value(field_schema.children[0], element,
+                                                     &converted_element));
+            converted_elements.push_back(std::move(converted_element));
+        }
+        *converted = Field::create_field<TYPE_ARRAY>(std::move(converted_elements));
+        return Status::OK();
+    }
+
+    if (!is_uuid_typed_value_field(field_schema)) {
+        return Status::Corruption("Parquet VARIANT UUID array has non-UUID element schema");
+    }
+    FieldWithDataType value;
+    RETURN_IF_ERROR(fill_uuid_variant_field(field, &value));
+    *converted = std::move(value.field);
+    return Status::OK();
+}
+
+// Converts a UUID array `field` into `value` as an array of strings.
+//
+// `*present` is false for a null field and true otherwise (it is set before the conversion, so
+// it stays true when the conversion fails). On success the scalar type is TYPE_STRING and the
+// dimension count is taken from the schema type.
+static Status fill_uuid_array_variant_field(const FieldSchema& field_schema, const Field& field,
+                                            FieldWithDataType* value, bool* present) {
+    const bool field_is_null = field.is_null();
+    *present = !field_is_null;
+    if (field_is_null) {
+        return Status::OK();
+    }
+    RETURN_IF_ERROR(convert_uuid_array_value(field_schema, field, &value->field));
+    value->num_dimensions = direct_array_dimensions(field_schema.data_type);
+    value->base_scalar_type_id = TYPE_STRING;
+    return Status::OK();
+}
+
+static Status field_to_variant_field(const FieldSchema& field_schema, const
Field& field,
+ FieldWithDataType* value, bool* present) {
+ if (field.is_null()) {
+ *present = false;
+ return Status::OK();
+ }
+ *present = true;
+ if (is_uuid_typed_value_field(field_schema)) {
+ return fill_uuid_variant_field(field, value);
+ }
+ const DataTypePtr& type = remove_nullable(field_schema.data_type);
+ switch (type->get_primitive_type()) {
+ case TYPE_BOOLEAN:
+ case TYPE_TINYINT:
+ case TYPE_SMALLINT:
+ case TYPE_INT:
+ case TYPE_BIGINT:
+ case TYPE_LARGEINT:
+ case TYPE_DECIMALV2:
+ case TYPE_DECIMAL32:
+ case TYPE_DECIMAL64:
+ case TYPE_DECIMAL128I:
+ case TYPE_DECIMAL256:
+ case TYPE_STRING:
+ case TYPE_CHAR:
+ case TYPE_VARCHAR:
+ case TYPE_VARBINARY:
+ case TYPE_ARRAY:
+ value->field = field;
+ fill_variant_field_info(value);
+ fill_variant_leaf_type_info(type, value);
Review Comment:
This direct typed-array branch derives `base_scalar_type_id` and dimensions
from the row value before applying schema metadata. For a valid typed leaf such
as `typed_value: array<decimal(18,2)>`, rows containing `[]` or `[null]` have
no non-null element, so `variant_util::get_field_info()` leaves the scalar type
as `INVALID_TYPE`/`Nothing`; `fill_variant_leaf_type_info()` only fills
precision/scale and never restores the array element type or dimensions from
`field_schema.data_type`. Those rows are then inserted with different/invalid
VARIANT type metadata from non-empty rows, losing the Parquet typed-array
fidelity. This is distinct from the existing decimal-array JSON round-trip
thread because it happens in the direct typed leaf path even without
serializing through JSON. Please fill the array scalar type/dimensions from the
typed schema when field-derived info is invalid, and add coverage for empty and
all-null typed decimal arrays.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]