This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 2e447160fd4 [fix](split_by_string) Fix split by string core on column
string (#28030) (#28365)
2e447160fd4 is described below
commit 2e447160fd405c3ed0829f12409185d882864c90
Author: zhiqiang <[email protected]>
AuthorDate: Thu Dec 14 13:21:27 2023 +0800
[fix](split_by_string) Fix split by string core on column string (#28030)
(#28365)
---
be/src/vec/functions/function_string.h | 102 +++++++++++++++++----
.../string_functions/test_split_by_string.out | 20 ++++
.../string_functions/test_split_by_string.groovy | 60 ++++++++++++
3 files changed, 165 insertions(+), 17 deletions(-)
diff --git a/be/src/vec/functions/function_string.h
b/be/src/vec/functions/function_string.h
index 37a21a3ea5b..e48fbd263ec 100644
--- a/be/src/vec/functions/function_string.h
+++ b/be/src/vec/functions/function_string.h
@@ -1803,6 +1803,7 @@ public:
const auto& [right_column, right_const] =
unpack_if_const(block.get_by_position(arguments[1]).column);
+ DataTypePtr right_column_type =
block.get_by_position(arguments[1]).type;
DataTypePtr src_column_type = block.get_by_position(arguments[0]).type;
auto dest_column_ptr =
ColumnArray::create(make_nullable(src_column_type)->create_column(),
ColumnArray::ColumnOffsets::create());
@@ -1818,27 +1819,42 @@ public:
dest_nested_column = dest_nullable_col->get_nested_column_ptr();
dest_nested_null_map =
&dest_nullable_col->get_null_map_column().get_data();
- if (auto col_left =
check_and_get_column<ColumnString>(src_column.get())) {
- if (auto col_right =
check_and_get_column<ColumnString>(right_column.get())) {
- if (right_const) {
- _execute_constant(*col_left, col_right->get_data_at(0),
*dest_nested_column,
- dest_offsets, dest_nested_null_map);
- } else {
- _execute_vector(*col_left, *col_right,
*dest_nested_column, dest_offsets,
- dest_nested_null_map);
- }
+ auto col_left = check_and_get_column<ColumnString>(src_column.get());
+ if (!col_left) {
+ return Status::InternalError("Left operator of function {} can not
be {}", get_name(),
+ src_column_type->get_name());
+ }
- block.replace_by_position(result, std::move(dest_column_ptr));
- return Status::OK();
- }
+ auto col_right =
check_and_get_column<ColumnString>(right_column.get());
+ if (!col_right) {
+ return Status::InternalError("Right operator of function {} can
not be {}", get_name(),
+ right_column_type->get_name());
+ }
+
+ // split_by_string(ColumnString, "xxx")
+ if (right_const) {
+ _execute_constant_delimiter(*col_left, col_right->get_data_at(0),
*dest_nested_column,
+ dest_offsets, dest_nested_null_map);
+ } else if (left_const) {
+ // split_by_string("xxx", ColumnString)
+ _execute_constant_src_string(col_left->get_data_at(0), *col_right,
*dest_nested_column,
+ dest_offsets, dest_nested_null_map);
+ } else {
+ // split_by_string(ColumnString, ColumnString)
+ _execute_vector(*col_left, *col_right, *dest_nested_column,
dest_offsets,
+ dest_nested_null_map);
}
- return Status::RuntimeError("unimplements function {}", get_name());
+
+ block.replace_by_position(result, std::move(dest_column_ptr));
+
+ return Status::OK();
}
private:
- void _execute_constant(const ColumnString& src_column_string, const
StringRef& delimiter_ref,
- IColumn& dest_nested_column,
ColumnArray::Offsets64& dest_offsets,
- NullMapType* dest_nested_null_map) {
+ void _execute_constant_delimiter(const ColumnString& src_column_string,
+ const StringRef& delimiter_ref, IColumn&
dest_nested_column,
+ ColumnArray::Offsets64& dest_offsets,
+ NullMapType* dest_nested_null_map) const {
ColumnString& dest_column_string =
reinterpret_cast<ColumnString&>(dest_nested_column);
ColumnString::Chars& column_string_chars =
dest_column_string.get_chars();
ColumnString::Offsets& column_string_offsets =
dest_column_string.get_offsets();
@@ -1958,7 +1974,59 @@ private:
}
}
- size_t split_str(size_t& pos, const StringRef str_ref, StringRef
delimiter_ref) {
+ void _execute_constant_src_string(const StringRef& str_ref, const
ColumnString& delimiter_col,
+ IColumn& dest_nested_column,
+ ColumnArray::Offsets64& dest_offsets,
+ NullMapType* dest_nested_null_map) const
{
+ ColumnString& dest_column_string =
reinterpret_cast<ColumnString&>(dest_nested_column);
+ ColumnString::Chars& column_string_chars =
dest_column_string.get_chars();
+ ColumnString::Offsets& column_string_offsets =
dest_column_string.get_offsets();
+ column_string_chars.reserve(0);
+
+ ColumnArray::Offset64 string_pos = 0;
+ ColumnArray::Offset64 dest_pos = 0;
+ const ColumnArray::Offset64 delimiter_offsets_size =
delimiter_col.get_offsets().size();
+
+ for (size_t i = 0; i < delimiter_offsets_size; ++i) {
+ const StringRef delimiter_ref = delimiter_col.get_data_at(i);
+
+ if (delimiter_ref.size == 0) {
+ for (size_t str_pos = 0; str_pos < str_ref.size;) {
+ const size_t str_offset = str_pos;
+ const size_t old_size = column_string_chars.size();
+ str_pos++;
+ const size_t new_size = old_size + 1;
+ column_string_chars.resize(new_size);
+ memcpy(column_string_chars.data() + old_size, str_ref.data
+ str_offset, 1);
+ (*dest_nested_null_map).push_back(false);
+ string_pos++;
+ dest_pos++;
+ column_string_offsets.push_back(string_pos);
+ }
+ } else {
+ for (size_t str_pos = 0; str_pos <= str_ref.size;) {
+ const size_t str_offset = str_pos;
+ const size_t old_size = column_string_chars.size();
+ const size_t split_part_size = split_str(str_pos, str_ref,
delimiter_ref);
+ str_pos += delimiter_ref.size;
+ const size_t new_size = old_size + split_part_size;
+ column_string_chars.resize(new_size);
+ if (split_part_size > 0) {
+ memcpy_small_allow_read_write_overflow15(
+ column_string_chars.data() + old_size,
str_ref.data + str_offset,
+ split_part_size);
+ }
+ (*dest_nested_null_map).push_back(false);
+ string_pos += split_part_size;
+ dest_pos++;
+ column_string_offsets.push_back(string_pos);
+ }
+ }
+ dest_offsets.push_back(dest_pos);
+ }
+ }
+
+ size_t split_str(size_t& pos, const StringRef str_ref, StringRef
delimiter_ref) const {
size_t old_size = pos;
size_t str_size = str_ref.size;
while (pos < str_size &&
diff --git
a/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out
b/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out
index 00d9ad99781..c46fa2bd27e 100644
---
a/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out
+++
b/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out
@@ -87,3 +87,23 @@
9 a,b,c, , ["a", "b", "c", ""]
10 \N , \N
+-- !sql_1 --
+1 ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"]
+2 ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"]
+
+-- !sql_2 --
+3 ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"]
+4 ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"]
+
+-- !sql_3 --
+1 [] [] [] []
+2 [] [] [] []
+3 ["a", ",", "b", ",", "c"] ["a", ",", "b", ",", "c"] ["a",
",", "b", ",", "c"] ["a", ",", "b", ",", "c"]
+4 ["a", ",", "b", ",", "c"] ["a", ",", "b", ",", "c"] ["a",
",", "b", ",", "c"] ["a", ",", "b", ",", "c"]
+
+-- !sql_4 --
+1 [] [] [] []
+2 [] [] [] []
+3 [""] [""] [""] [""]
+4 [""] [""] [""] [""]
+
diff --git
a/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy
b/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy
index d3f05885181..2ec70e36124 100644
---
a/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy
+++
b/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy
@@ -102,4 +102,64 @@ suite("test_split_by_string") {
qt_sql "SELECT *, split_by_string(v1, v2) FROM ${tableName2} ORDER BY k1"
+
+ // The case where both operands are string columns is covered by the test above.
+ sql """DROP TABLE IF EXISTS test_split_by_string_2"""
+ sql """
+ CREATE TABLE IF NOT EXISTS test_split_by_string_2 (
+ `rid` INT NULL,
+ `str` TEXT NULL,
+ `vc` VARCHAR(5) NULL,
+ `chr` CHAR(5) NULL,
+ `txt` TEXT NULL
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`rid`)
+ DISTRIBUTED BY HASH(`rid`) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1",
+ "storage_format" = "V2"
+ )
+ """
+ sql """ INSERT INTO test_split_by_string_2
+ VALUES (1, "", "", "", ""),
+ (2, "", "", "", ""),
+ (3, "a,b,c", "a,b,c", "a,b,c", "a,b,c"),
+ (4, "a,b,c", "a,b,c", "a,b,c", "a,b,c")
+ """
+ // Left operator is const, right operator is column string
+ qt_sql_1 """
+ SELECT rid,
+ split_by_string("abc", str),
+ split_by_string("abc", vc),
+ split_by_string("abc", chr),
+ split_by_string("abc", txt)
+ FROM test_split_by_string_2 WHERE rid=1 OR rid=2 ORDER BY rid;
+ """
+ // Left operator is column string, right operator is const
+ qt_sql_2 """
+ SELECT rid,
+ split_by_string(str, ","),
+ split_by_string(vc, ","),
+ split_by_string(chr, ","),
+ split_by_string(txt, ",")
+ FROM test_split_by_string_2 WHERE rid=3 OR rid=4 ORDER BY rid;
+ """
+
+ // Empty string
+ qt_sql_3 """
+ SELECT rid,
+ split_by_string(str, ""),
+ split_by_string(vc, ""),
+ split_by_string(chr, ""),
+ split_by_string(txt, "")
+ FROM test_split_by_string_2 ORDER BY rid;
+ """
+ qt_sql_4 """
+ SELECT rid,
+ split_by_string("", str),
+ split_by_string("", vc),
+ split_by_string("", chr),
+ split_by_string("", txt)
+ FROM test_split_by_string_2 ORDER BY rid;
+ """
}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]