This is an automated email from the ASF dual-hosted git repository.
eldenmoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 98119b95dfb [fix](variant) return raw string for element_at on scalar-string variant (#64103)
98119b95dfb is described below
commit 98119b95dfbcd62556961b9b41337a8c4d777681
Author: Chenyang Sun <[email protected]>
AuthorDate: Mon Jun 8 10:15:31 2026 +0800
[fix](variant) return raw string for element_at on scalar-string variant (#64103)
---
be/src/exprs/function/function_variant_element.cpp | 9 +++++
.../function/function_variant_element_test.cpp | 38 ++++++++++++++++++++++
.../data/variant_p0/sql/select_from_value.out | 2 +-
.../suites/variant_p0/element_function.groovy | 28 ++++++++++++++++
4 files changed, 76 insertions(+), 1 deletion(-)
diff --git a/be/src/exprs/function/function_variant_element.cpp b/be/src/exprs/function/function_variant_element.cpp
index 4736342f4ee..c2d984885de 100644
--- a/be/src/exprs/function/function_variant_element.cpp
+++ b/be/src/exprs/function/function_variant_element.cpp
@@ -388,6 +388,15 @@ private:
}
break;
}
+ case simdjson::ondemand::json_type::string: {
+ // Extract the raw (unescaped) string value rather than its JSON
+ // representation. simdjson::to_json_string would keep the surrounding
+ // double quotes (e.g. "2026-05-20"), which leaks into the result and
+ // makes scalar-string variants inconsistent with structured ones.
+ std::string_view value_str = value.get_string().value();
+ column->insert_data(value_str.data(), value_str.length());
+ break;
+ }
default: {
auto value_str = simdjson::to_json_string(value).value();
column->insert_data(value_str.data(), value_str.length());
diff --git a/be/test/exprs/function/function_variant_element_test.cpp b/be/test/exprs/function/function_variant_element_test.cpp
index 1a8d6985167..7db931af14b 100644
--- a/be/test/exprs/function/function_variant_element_test.cpp
+++ b/be/test/exprs/function/function_variant_element_test.cpp
@@ -61,4 +61,42 @@ TEST(function_variant_element_test, extract_from_sparse_column) {
EXPECT_EQ(result_string, "{\"age\":\"John\",\"name\":\"John\"}");
}
+// CIR-20498: extracting a string property from a scalar-string-root variant
+// (the shape produced by `cast(text as variant)`) must return the raw string,
+// not its JSON token with surrounding double quotes.
+TEST(function_variant_element_test, extract_string_from_scalar_root) {
+ auto variant_column = ColumnVariant::create(0 /*max_subcolumns_count*/, false);
+ auto root_column = ColumnString::create();
+ std::string doc = R"({"wsn":"SRFSPXFDVY","uploadTimeValue":"2026-05-20 18:40:02","n":49.98})";
+ root_column->insert_data(doc.data(), doc.size());
+ variant_column->create_root(std::make_shared<DataTypeString>(), std::move(root_column));
+ variant_column->set_num_rows(1);
+ ASSERT_TRUE(variant_column->is_scalar_variant());
+
+ DataTypeSerDe::FormatOptions options;
+ auto tz = cctz::utc_time_zone();
+ options.timezone = &tz;
+
+ auto extract = [&](const std::string& key) {
+ ColumnPtr index_inner = ColumnString::create();
+ assert_cast<ColumnString*>(index_inner->assert_mutable().get())
+ ->insert_data(key.data(), key.size());
+ ColumnPtr index_column = ColumnConst::create(index_inner, 1);
+ ColumnPtr result;
+ auto status =
+ FunctionVariantElement::get_element_column(*variant_column, index_column, &result);
+ EXPECT_TRUE(status.ok());
+ std::string out;
+ assert_cast<const ColumnVariant&>(*result.get())
+ .serialize_one_row_to_string(0, &out, options);
+ return out;
+ };
+
+ // string values: no surrounding quotes
+ EXPECT_EQ(extract("wsn"), "SRFSPXFDVY");
+ EXPECT_EQ(extract("uploadTimeValue"), "2026-05-20 18:40:02");
+ // non-string scalars keep their JSON representation
+ EXPECT_EQ(extract("n"), "49.98");
+}
+
} // namespace doris
diff --git a/regression-test/data/variant_p0/sql/select_from_value.out b/regression-test/data/variant_p0/sql/select_from_value.out
index ef562a658e9..1fe3c49651b 100644
--- a/regression-test/data/variant_p0/sql/select_from_value.out
+++ b/regression-test/data/variant_p0/sql/select_from_value.out
@@ -1,4 +1,4 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !select_from_value --
-"b"
+b
diff --git a/regression-test/suites/variant_p0/element_function.groovy b/regression-test/suites/variant_p0/element_function.groovy
index 7b5e55ea53b..fb183db9690 100644
--- a/regression-test/suites/variant_p0/element_function.groovy
+++ b/regression-test/suites/variant_p0/element_function.groovy
@@ -29,4 +29,32 @@ suite("regression_test_variant_element_at", "p0") {
sql """insert into element_fn_test values (1, '{"arr1" : [1, 2, 3]}',
'{"arr2" : [4, 5, 6]}')"""
qt_sql """select array_first((x,y) -> (x - y) < 0, cast(v['arr1'] as
array<int>), cast(v1['arr2'] as array<int>)) from element_fn_test"""
+
+ // CIR-20498: extracting a string property from a scalar-string variant
+ // (e.g. `cast(text as variant)['key']`) must not leak the surrounding JSON
+ // double quotes. The root of such a variant is a raw JSON string, so the
+ // extraction goes through the simdjson document path; a string value must be
+ // returned unescaped, consistently with the structured-subcolumn path.
+ def scalar = sql """select cast('{"wsn":"SRFSPXFDVY","uploadTimeValue":"2026-05-20 18:40:02"}' as variant)['wsn']"""
+ assertEquals("SRFSPXFDVY", scalar[0][0])
+
+ def sub = sql """select substring(cast('{"uploadTimeValue":"2026-05-20 18:40:02"}' as variant)['uploadTimeValue'], 1, 10)"""
+ assertEquals("2026-05-20", sub[0][0])
+
+ // values containing escaped characters must be unescaped, not kept as raw JSON tokens
+ def escaped = sql """select cast('{"k":"a\\\\"b"}' as variant)['k']"""
+ assertEquals("a\"b", escaped[0][0])
+
+ // non-string scalars keep their existing JSON representation
+ def num = sql """select cast('{"n":49.98}' as variant)['n']"""
+ assertEquals("49.98", num[0][0])
+
+ // array / object values must keep their JSON text representation (no unquoting):
+ // only the top-level string scalar is unquoted; quotes nested inside JSON are
+ // part of the value and must be preserved.
+ def arr = sql """select cast('{"a":[1,2,3]}' as variant)['a']"""
+ assertEquals("[1,2,3]", arr[0][0])
+
+ def obj = sql """select cast('{"o":{"name":"john"}}' as variant)['o']"""
+ assertEquals('{"name":"john"}', obj[0][0])
}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]