This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.1 by this push:
new 2d14d48ee7f [feature](function) support is_valid_utf8 function
(#62515) (#64781)
2d14d48ee7f is described below
commit 2d14d48ee7fb3139286e5870988eb97e2f1a5eec
Author: Mryange <[email protected]>
AuthorDate: Thu Jun 25 18:15:49 2026 +0800
[feature](function) support is_valid_utf8 function (#62515) (#64781)
Add `is_valid_utf8(s)` / `isValidUTF8(s)` function that returns `true`
if the input is valid UTF-8, `false` otherwise. Also adds
`is_valid_utf8()` method to `ColumnStr` for column-level UTF-8
validation.
(cherry picked from commit 08b6ecd5fdc524745b86ff1a86e8c979da93afed)
### What problem does this PR solve?
Issue Number: close #xxx
Related PR: #xxx
Problem Summary:
### Release note
None
### Check List (For Author)
- Test <!-- At least one of them must be included. -->
- [ ] Regression test
- [ ] Unit Test
- [ ] Manual test (add detailed scripts or steps below)
- [ ] No need to test or manual test. Explain why:
- [ ] This is a refactor/code format and no logic has been changed.
- [ ] Previous test can cover this change.
- [ ] No code files have been changed.
- [ ] Other reason <!-- Add your reason? -->
- Behavior changed:
- [ ] No.
- [ ] Yes. <!-- Explain the behavior change -->
- Does this need documentation?
- [ ] No.
- [ ] Yes. <!-- Add document PR link here. eg:
https://github.com/apache/doris-website/pull/1214 -->
### Check List (For Reviewer who merge this PR)
- [ ] Confirm the release note
- [ ] Confirm test cases
- [ ] Confirm document
- [ ] Add branch pick label <!-- Add branch pick label that this PR
should merge into -->
---
be/src/core/column/column_string.cpp | 15 ++++
be/src/core/column/column_string.h | 4 +
be/src/exprs/function/function_string.cpp | 27 +++++++
be/test/core/column/column_string_test.cpp | 85 ++++++++++++++++++++++
be/test/exprs/function/function_string_test.cpp | 30 ++++++++
.../doris/catalog/BuiltinScalarFunctions.java | 2 +
.../expressions/functions/scalar/IsValidUtf8.java | 76 +++++++++++++++++++
.../expressions/visitor/ScalarFunctionVisitor.java | 5 ++
.../string_functions/test_is_valid_utf8.out | 59 +++++++++++++++
.../string_functions/test_is_valid_utf8.groovy | 80 ++++++++++++++++++++
10 files changed, 383 insertions(+)
diff --git a/be/src/core/column/column_string.cpp
b/be/src/core/column/column_string.cpp
index 9906f84c12b..b222030c293 100644
--- a/be/src/core/column/column_string.cpp
+++ b/be/src/core/column/column_string.cpp
@@ -36,6 +36,7 @@
#include "util/simd/bits.h"
#include "util/simd/vstring_function.h"
#include "util/unaligned.h"
+#include "util/utf8_check.h"
namespace doris {
#include "common/compile_check_begin.h"
@@ -694,6 +695,20 @@ bool ColumnStr<T>::is_ascii() const {
return simd::VStringFunctions::is_ascii(StringRef(chars.data(),
chars.size()));
}
+template <typename T>
+bool ColumnStr<T>::is_valid_utf8() const { // True only if every row passes validate_utf8; a column with no rows yields true.
+ const auto num_rows = offsets.size();
+ const char* data = reinterpret_cast<const char*>(chars.data());
+ for (size_t i = 0; i < num_rows; ++i) { // Validate row by row, not the concatenated buffer: bytes split across two rows must not combine into one valid sequence.
+ auto str_offset = offset_at(i);
+ auto str_size = size_at(i);
+ if (!validate_utf8(data + str_offset, str_size)) {
+ return false; // Stop at the first row that fails validation.
+ }
+ }
+ return true;
+}
+
template class ColumnStr<uint32_t>;
template class ColumnStr<uint64_t>;
} // namespace doris
diff --git a/be/src/core/column/column_string.h
b/be/src/core/column/column_string.h
index 608ccb7bc5b..5b748ae18e9 100644
--- a/be/src/core/column/column_string.h
+++ b/be/src/core/column/column_string.h
@@ -52,6 +52,9 @@ class Arena;
class ColumnSorter;
/** Column for String values.
+ * Note: In string functions, we assume that ColumnStr contains valid UTF-8
encoded data.
+ * However, ColumnStr is not guaranteed to always hold valid UTF-8, since it
is also used
+ * as a serialization container where the content may be arbitrary binary
data.
*/
template <typename T>
class ColumnStr final : public COWHelper<IColumn, ColumnStr<T>> {
@@ -514,6 +517,7 @@ public:
}
bool is_ascii() const;
+ bool is_valid_utf8() const;
Chars& get_chars() { return chars; }
const Chars& get_chars() const { return chars; }
diff --git a/be/src/exprs/function/function_string.cpp
b/be/src/exprs/function/function_string.cpp
index adb90591e15..b080f22622e 100644
--- a/be/src/exprs/function/function_string.cpp
+++ b/be/src/exprs/function/function_string.cpp
@@ -44,6 +44,7 @@
#include "exprs/function/string_hex_util.h"
#include "util/string_search.hpp"
#include "util/url_coding.h"
+#include "util/utf8_check.h"
namespace doris {
#include "common/compile_check_begin.h"
@@ -226,6 +227,29 @@ struct StringUtf8LengthImpl {
}
};
+struct NameIsValidUTF8 { // Name tag passed as the second template argument of FunctionIsValidUTF8.
+ static constexpr auto name = "is_valid_utf8";
+};
+
+struct IsValidUTF8Impl { // Implementation plugged into FunctionUnaryToType: String input, UInt8 result column.
+ using ReturnType = DataTypeUInt8;
+ static constexpr auto PrimitiveTypeImpl = PrimitiveType::TYPE_STRING;
+ using Type = String;
+ using ReturnColumnType = ColumnUInt8;
+
+ static Status vector(const ColumnString::Chars& data, const
ColumnString::Offsets& offsets,
+ PaddedPODArray<UInt8>& res) { // Resizes res to one entry per row; res[i] is 1 when row i passes validate_utf8, else 0. Always returns Status::OK().
+ auto size = offsets.size();
+ res.resize(size);
+ for (size_t i = 0; i < size; ++i) {
+ const char* raw_str = reinterpret_cast<const
char*>(&data[offsets[i - 1]]); // NOTE(review): for i == 0 this reads offsets[-1]; presumably the offsets container guarantees a zero there — confirm.
+ size_t str_size = offsets[i] - offsets[i - 1]; // Row length is the difference between consecutive end offsets.
+ res[i] = validate_utf8(raw_str, str_size) ? 1 : 0;
+ }
+ return Status::OK();
+ }
+};
+
struct NameStartsWith {
static constexpr auto name = "starts_with";
};
@@ -1305,6 +1329,7 @@ using FunctionStringLength =
FunctionUnaryToType<StringLengthImpl, NameStringLen
using FunctionCrc32 = FunctionUnaryToType<Crc32Impl, NameCrc32>;
using FunctionStringUTF8Length = FunctionUnaryToType<StringUtf8LengthImpl,
NameStringUtf8Length>;
using FunctionStringSpace = FunctionUnaryToType<StringSpace, NameStringSpace>;
+using FunctionIsValidUTF8 = FunctionUnaryToType<IsValidUTF8Impl,
NameIsValidUTF8>;
using FunctionStringStartsWith =
FunctionBinaryToType<DataTypeString, DataTypeString,
StringStartsWithImpl, NameStartsWith>;
using FunctionStringEndsWith =
@@ -1411,7 +1436,9 @@ void register_function_string(SimpleFunctionFactory&
factory) {
factory.register_function<FunctionSubReplace<SubReplaceThreeImpl>>();
factory.register_function<FunctionSubReplace<SubReplaceFourImpl>>();
factory.register_function<FunctionOverlay>();
+ factory.register_function<FunctionIsValidUTF8>();
+ factory.register_alias(FunctionIsValidUTF8::name, "isValidUTF8");
factory.register_alias(FunctionToLower::name, "lcase");
factory.register_alias(FunctionToUpper::name, "ucase");
factory.register_alias(FunctionStringUTF8Length::name, "character_length");
diff --git a/be/test/core/column/column_string_test.cpp
b/be/test/core/column/column_string_test.cpp
index 511ee87dff0..cf807f88615 100644
--- a/be/test/core/column/column_string_test.cpp
+++ b/be/test/core/column/column_string_test.cpp
@@ -1403,4 +1403,89 @@ TEST_F(ColumnStringTest, is_ascii) {
}
}
+TEST_F(ColumnStringTest, is_valid_utf8) { // Exercises ColumnString::is_valid_utf8 on valid, invalid, empty and cross-row inputs.
+ // all ASCII strings are valid UTF-8
+ {
+ auto column = ColumnString::create();
+ column->insert_data("hello", 5);
+ column->insert_data("world", 5);
+ column->insert_data("123!@#", 6);
+ EXPECT_TRUE(column->is_valid_utf8());
+ }
+ // empty column is valid
+ {
+ auto column = ColumnString::create();
+ EXPECT_TRUE(column->is_valid_utf8());
+ }
+ // empty strings are valid UTF-8
+ {
+ auto column = ColumnString::create();
+ column->insert_data("", 0);
+ column->insert_data("", 0);
+ EXPECT_TRUE(column->is_valid_utf8());
+ }
+ // multi-byte UTF-8 characters
+ {
+ auto column = ColumnString::create();
+ column->insert_data("Hello, 世界", strlen("Hello, 世界"));
+ column->insert_data("こんにちは", strlen("こんにちは"));
+ column->insert_data("😀", strlen("😀"));
+ EXPECT_TRUE(column->is_valid_utf8());
+ }
+ // invalid: lone continuation byte 0x80
+ {
+ auto column = ColumnString::create();
+ const char data[] = {'\x80'};
+ column->insert_data(data, 1);
+ EXPECT_FALSE(column->is_valid_utf8());
+ }
+ // invalid: bad 2-byte sequence 0xC3 0x28
+ {
+ auto column = ColumnString::create();
+ const char data[] = {'\xc3', '\x28'};
+ column->insert_data(data, 2);
+ EXPECT_FALSE(column->is_valid_utf8());
+ }
+ // invalid: overlong encoding 0xC0 0xAF
+ {
+ auto column = ColumnString::create();
+ const char data[] = {'\xc0', '\xaf'};
+ column->insert_data(data, 2);
+ EXPECT_FALSE(column->is_valid_utf8());
+ }
+ // invalid: 0xFE byte
+ {
+ auto column = ColumnString::create();
+ const char data[] = {'\xfe'};
+ column->insert_data(data, 1);
+ EXPECT_FALSE(column->is_valid_utf8());
+ }
+ // invalid: truncated 3-byte sequence 0xE4 0xB8
+ {
+ auto column = ColumnString::create();
+ const char data[] = {'\xe4', '\xb8'};
+ column->insert_data(data, 2);
+ EXPECT_FALSE(column->is_valid_utf8());
+ }
+ // mixed: one invalid byte makes the whole column invalid
+ {
+ auto column = ColumnString::create();
+ column->insert_data("hello", 5);
+ const char bad[] = {'\xff'};
+ column->insert_data(bad, 1);
+ column->insert_data("world", 5);
+ EXPECT_FALSE(column->is_valid_utf8());
+ }
+ // cross-row concatenation: "\xE4" + "\xB8\x96" form valid UTF-8 (世) when
+ // concatenated, but each row is invalid individually. Must validate
per-row.
+ {
+ auto column = ColumnString::create();
+ const char row1[] = {'\xe4'};
+ const char row2[] = {'\xb8', '\x96'};
+ column->insert_data(row1, 1);
+ column->insert_data(row2, 2);
+ EXPECT_FALSE(column->is_valid_utf8());
+ }
+}
+
} // namespace doris
\ No newline at end of file
diff --git a/be/test/exprs/function/function_string_test.cpp
b/be/test/exprs/function/function_string_test.cpp
index 90456da258a..4aa9c2e0416 100644
--- a/be/test/exprs/function/function_string_test.cpp
+++ b/be/test/exprs/function/function_string_test.cpp
@@ -994,6 +994,36 @@ TEST(function_string_test, function_ascii_test) {
check_function_all_arg_comb<DataTypeInt32, true>(func_name, input_types,
data_set);
}
+TEST(function_string_test, function_is_valid_utf8_test) { // Runs "is_valid_utf8" over a VARCHAR data set: expected 1 for valid rows, 0 for invalid rows, Null for Null.
+ std::string func_name = "is_valid_utf8";
+
+ InputTypeSet input_types = {PrimitiveType::TYPE_VARCHAR};
+
+ DataSet data_set = {
+ // valid UTF-8 strings
+ {{std::string("hello")}, std::uint8_t(1)},
+ {{std::string("")}, std::uint8_t(1)},
+ {{std::string("Hello, 世界")}, std::uint8_t(1)},
+ {{std::string("こんにちは")}, std::uint8_t(1)},
+ {{std::string("123!@#")}, std::uint8_t(1)},
+ {{std::string("\xc3\xb1")}, std::uint8_t(1)}, // ñ
+ {{std::string("\xe2\x82\xac")}, std::uint8_t(1)}, // €
+ {{std::string("\xf0\x9f\x98\x80")}, std::uint8_t(1)}, // 😀
+ // invalid UTF-8 strings
+ {{std::string("\x80")}, std::uint8_t(0)}, // invalid
leading byte
+ {{std::string("\xc3\x28")}, std::uint8_t(0)}, // invalid
2-byte sequence
+ {{std::string("\xe2\x28\xa1")}, std::uint8_t(0)}, // invalid
3-byte sequence
+ {{std::string("\xf0\x28\x8c\xbc")}, std::uint8_t(0)}, // invalid
4-byte sequence
+ {{std::string("\xfe")}, std::uint8_t(0)}, // invalid
byte 0xFE
+ {{std::string("\xff")}, std::uint8_t(0)}, // invalid
byte 0xFF
+ {{std::string("abc\xc0\xaf")}, std::uint8_t(0)}, // overlong
encoding
+ // NULL
+ {{Null()}, Null()},
+ };
+
+ check_function_all_arg_comb<DataTypeUInt8, true>(func_name, input_types,
data_set);
+}
+
TEST(function_string_test, function_char_length_test) {
std::string func_name = "char_length";
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
index 21b25db7661..1f8b9c7b19a 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
@@ -279,6 +279,7 @@ import
org.apache.doris.nereids.trees.expressions.functions.scalar.IsIpv4Mapped;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.IsIpv4String;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.IsIpv6String;
import org.apache.doris.nereids.trees.expressions.functions.scalar.IsNan;
+import org.apache.doris.nereids.trees.expressions.functions.scalar.IsValidUtf8;
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonArray;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.JsonArrayIgnoreNull;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.JsonContains;
@@ -841,6 +842,7 @@ public class BuiltinScalarFunctions implements
FunctionHelper {
scalar(IsIpv6String.class, "is_ipv6_string", "is_ipv6"),
scalar(IsIpAddressInRange.class, "is_ip_address_in_range"),
scalar(IsNan.class, "isnan"),
+ scalar(IsValidUtf8.class, "is_valid_utf8", "isValidUTF8"),
scalar(IsInf.class, "isinf"),
scalar(Ipv4CIDRToRange.class, "ipv4_cidr_to_range"),
scalar(Ipv6CIDRToRange.class, "ipv6_cidr_to_range"),
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/IsValidUtf8.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/IsValidUtf8.java
new file mode 100644
index 00000000000..0c045182785
--- /dev/null
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/IsValidUtf8.java
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.functions.scalar;
+
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.nereids.trees.expressions.Expression;
+import
org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
+import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
+import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression;
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.BooleanType;
+import org.apache.doris.nereids.types.StringType;
+import org.apache.doris.nereids.types.VarcharType;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * ScalarFunction 'is_valid_utf8': returns BOOLEAN true if the VARCHAR/STRING argument is valid UTF-8.
+ */
+public class IsValidUtf8 extends ScalarFunction
+ implements UnaryExpression, ExplicitlyCastableSignature,
PropagateNullable {
+
+ public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
+
FunctionSignature.ret(BooleanType.INSTANCE).args(VarcharType.SYSTEM_DEFAULT),
+
FunctionSignature.ret(BooleanType.INSTANCE).args(StringType.INSTANCE)
+ );
+
+ /**
+ * constructor with 1 argument: the string expression to validate.
+ */
+ public IsValidUtf8(Expression arg) {
+ super("is_valid_utf8", arg);
+ }
+
+ /** constructor for withChildren and reuse signature */
+ private IsValidUtf8(ScalarFunctionParams functionParams) {
+ super(functionParams);
+ }
+
+ /**
+ * withChildren: rebuilds this function over exactly one replacement child.
+ */
+ @Override
+ public IsValidUtf8 withChildren(List<Expression> children) {
+ Preconditions.checkArgument(children.size() == 1);
+ return new IsValidUtf8(getFunctionParams(children));
+ }
+
+ @Override
+ public List<FunctionSignature> getSignatures() {
+ return SIGNATURES;
+ }
+
+ @Override
+ public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+ return visitor.visitIsValidUtf8(this, context);
+ }
+}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
index 737f815561b..4b827b35dcc 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
@@ -298,6 +298,7 @@ import
org.apache.doris.nereids.trees.expressions.functions.scalar.IsIpv4Mapped;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.IsIpv4String;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.IsIpv6String;
import org.apache.doris.nereids.trees.expressions.functions.scalar.IsNan;
+import org.apache.doris.nereids.trees.expressions.functions.scalar.IsValidUtf8;
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonArray;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.JsonArrayIgnoreNull;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.JsonContains;
@@ -1702,6 +1703,10 @@ public interface ScalarFunctionVisitor<R, C> {
return visitScalarFunction(isNan, context);
}
+ default R visitIsValidUtf8(IsValidUtf8 isValidUtf8, C context) { // Default: delegate to the generic scalar-function visit.
+ return visitScalarFunction(isValidUtf8, context);
+ }
+
default R visitIsInf(IsInf isInf, C context) {
return visitScalarFunction(isInf, context);
}
diff --git
a/regression-test/data/query_p0/sql_functions/string_functions/test_is_valid_utf8.out
b/regression-test/data/query_p0/sql_functions/string_functions/test_is_valid_utf8.out
new file mode 100644
index 00000000000..4d1ba0a2de8
--- /dev/null
+++
b/regression-test/data/query_p0/sql_functions/string_functions/test_is_valid_utf8.out
@@ -0,0 +1,59 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !valid_1 --
+true
+
+-- !valid_2 --
+true
+
+-- !valid_3 --
+true
+
+-- !valid_4 --
+true
+
+-- !valid_5 --
+true
+
+-- !null_1 --
+\N
+
+-- !invalid_1 --
+false
+
+-- !invalid_2 --
+false
+
+-- !invalid_3 --
+false
+
+-- !invalid_4 --
+false
+
+-- !invalid_5 --
+false
+
+-- !invalid_6 --
+false
+
+-- !invalid_7 --
+false
+
+-- !invalid_8 --
+false
+
+-- !alias_1 --
+true
+
+-- !alias_2 --
+true
+
+-- !alias_3 --
+false
+
+-- !table_1 --
+1 true
+2 true
+3 true
+4 \N
+5 false
+6 false
diff --git
a/regression-test/suites/query_p0/sql_functions/string_functions/test_is_valid_utf8.groovy
b/regression-test/suites/query_p0/sql_functions/string_functions/test_is_valid_utf8.groovy
new file mode 100644
index 00000000000..2883ab41976
--- /dev/null
+++
b/regression-test/suites/query_p0/sql_functions/string_functions/test_is_valid_utf8.groovy
@@ -0,0 +1,80 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_is_valid_utf8") { // Regression suite for is_valid_utf8 / isValidUTF8: literals, NULL, unhex-built invalid bytes, table data and constant folding.
+ // basic valid UTF-8 strings
+ qt_valid_1 "SELECT is_valid_utf8('hello');"
+ qt_valid_2 "SELECT is_valid_utf8('');"
+ qt_valid_3 "SELECT is_valid_utf8('Hello, 世界');"
+ qt_valid_4 "SELECT is_valid_utf8('こんにちは');"
+ qt_valid_5 "SELECT is_valid_utf8('123!@#');"
+
+ // NULL handling
+ qt_null_1 "SELECT is_valid_utf8(NULL);"
+
+ // invalid UTF-8 strings constructed via unhex
+ // 0x80: lone continuation byte
+ qt_invalid_1 "SELECT is_valid_utf8(unhex('80'));"
+ // 0xC3 0x28: invalid 2-byte sequence (second byte not continuation)
+ qt_invalid_2 "SELECT is_valid_utf8(unhex('C328'));"
+ // 0xE2 0x28 0xA1: invalid 3-byte sequence (second byte not continuation)
+ qt_invalid_3 "SELECT is_valid_utf8(unhex('E228A1'));"
+ // 0xF0 0x28 0x8C 0xBC: invalid 4-byte sequence (second byte not
continuation)
+ qt_invalid_4 "SELECT is_valid_utf8(unhex('F0288CBC'));"
+ // 0xFE: not valid in UTF-8
+ qt_invalid_5 "SELECT is_valid_utf8(unhex('FE'));"
+ // 0xFF: not valid in UTF-8
+ qt_invalid_6 "SELECT is_valid_utf8(unhex('FF'));"
+ // overlong encoding of '/' (U+002F): 0xC0 0xAF
+ qt_invalid_7 "SELECT is_valid_utf8(unhex('C0AF'));"
+ // truncated 3-byte sequence: 0xE4 0xB8
+ qt_invalid_8 "SELECT is_valid_utf8(unhex('E4B8'));"
+
+ // alias isValidUTF8
+ qt_alias_1 "SELECT isValidUTF8('hello');"
+ qt_alias_2 "SELECT isValidUTF8('');"
+ // alias with invalid bytes
+ qt_alias_3 "SELECT isValidUTF8(unhex('80'));"
+
+ // test with table data (including invalid UTF-8 via unhex)
+ sql "DROP TABLE IF EXISTS test_is_valid_utf8_tbl"
+ sql """
+ CREATE TABLE test_is_valid_utf8_tbl (
+ id INT,
+ val VARCHAR(200)
+ ) DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num" = "1");
+ """
+
+ sql """
+ INSERT INTO test_is_valid_utf8_tbl VALUES
+ (1, 'hello'),
+ (2, ''),
+ (3, 'Hello, 世界'),
+ (4, NULL);
+ """
+ sql "INSERT INTO test_is_valid_utf8_tbl VALUES (5, unhex('C0AF'));"
+ sql "INSERT INTO test_is_valid_utf8_tbl VALUES (6, unhex('FF'));"
+
+ order_qt_table_1 "SELECT id, is_valid_utf8(val) FROM
test_is_valid_utf8_tbl ORDER BY id;"
+
+ // test fold const
+ testFoldConst("SELECT is_valid_utf8('hello');")
+ testFoldConst("SELECT is_valid_utf8('');")
+ testFoldConst("SELECT is_valid_utf8(NULL);")
+ testFoldConst("SELECT isValidUTF8('hello');")
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]