This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.1 by this push:
     new 2d14d48ee7f [feature](function) support is_valid_utf8 function 
(#62515) (#64781)
2d14d48ee7f is described below

commit 2d14d48ee7fb3139286e5870988eb97e2f1a5eec
Author: Mryange <[email protected]>
AuthorDate: Thu Jun 25 18:15:49 2026 +0800

    [feature](function) support is_valid_utf8 function (#62515) (#64781)
    
    Add `is_valid_utf8(s)` / `isValidUTF8(s)` function that returns `true`
    if the input is valid UTF-8, `false` otherwise. Also adds
    `is_valid_utf8()` method to `ColumnStr` for column-level UTF-8
    validation.
    
    (cherry picked from commit 08b6ecd5fdc524745b86ff1a86e8c979da93afed)
    
    ### What problem does this PR solve?
    
    Issue Number: close #xxx
    
    Related PR: #xxx
    
    Problem Summary:
    
    ### Release note
    
    None
    
    ### Check List (For Author)
    
    - Test <!-- At least one of them must be included. -->
        - [ ] Regression test
        - [ ] Unit Test
        - [ ] Manual test (add detailed scripts or steps below)
        - [ ] No need to test or manual test. Explain why:
    - [ ] This is a refactor/code format and no logic has been changed.
            - [ ] Previous test can cover this change.
            - [ ] No code files have been changed.
            - [ ] Other reason <!-- Add your reason?  -->
    
    - Behavior changed:
        - [ ] No.
        - [ ] Yes. <!-- Explain the behavior change -->
    
    - Does this need documentation?
        - [ ] No.
    - [ ] Yes. <!-- Add document PR link here. eg:
    https://github.com/apache/doris-website/pull/1214 -->
    
    ### Check List (For Reviewer who merge this PR)
    
    - [ ] Confirm the release note
    - [ ] Confirm test cases
    - [ ] Confirm document
    - [ ] Add branch pick label <!-- Add branch pick label that this PR
    should merge into -->
---
 be/src/core/column/column_string.cpp               | 15 ++++
 be/src/core/column/column_string.h                 |  4 +
 be/src/exprs/function/function_string.cpp          | 27 +++++++
 be/test/core/column/column_string_test.cpp         | 85 ++++++++++++++++++++++
 be/test/exprs/function/function_string_test.cpp    | 30 ++++++++
 .../doris/catalog/BuiltinScalarFunctions.java      |  2 +
 .../expressions/functions/scalar/IsValidUtf8.java  | 76 +++++++++++++++++++
 .../expressions/visitor/ScalarFunctionVisitor.java |  5 ++
 .../string_functions/test_is_valid_utf8.out        | 59 +++++++++++++++
 .../string_functions/test_is_valid_utf8.groovy     | 80 ++++++++++++++++++++
 10 files changed, 383 insertions(+)

diff --git a/be/src/core/column/column_string.cpp 
b/be/src/core/column/column_string.cpp
index 9906f84c12b..b222030c293 100644
--- a/be/src/core/column/column_string.cpp
+++ b/be/src/core/column/column_string.cpp
@@ -36,6 +36,7 @@
 #include "util/simd/bits.h"
 #include "util/simd/vstring_function.h"
 #include "util/unaligned.h"
+#include "util/utf8_check.h"
 namespace doris {
 #include "common/compile_check_begin.h"
 
@@ -694,6 +695,20 @@ bool ColumnStr<T>::is_ascii() const {
     return simd::VStringFunctions::is_ascii(StringRef(chars.data(), 
chars.size()));
 }
 
+template <typename T>
+bool ColumnStr<T>::is_valid_utf8() const { // true only if every row passes validate_utf8 on its own
+    const auto num_rows = offsets.size(); // row count is taken from the offsets array
+    const char* data = reinterpret_cast<const char*>(chars.data());
+    for (size_t i = 0; i < num_rows; ++i) { // check each row separately, not the concatenated buffer
+        auto str_offset = offset_at(i);
+        auto str_size = size_at(i);
+        if (!validate_utf8(data + str_offset, str_size)) {
+            return false; // first invalid row decides the result
+        }
+    }
+    return true; // also reached when the column has no rows
+}
+
 template class ColumnStr<uint32_t>;
 template class ColumnStr<uint64_t>;
 } // namespace doris
diff --git a/be/src/core/column/column_string.h 
b/be/src/core/column/column_string.h
index 608ccb7bc5b..5b748ae18e9 100644
--- a/be/src/core/column/column_string.h
+++ b/be/src/core/column/column_string.h
@@ -52,6 +52,9 @@ class Arena;
 class ColumnSorter;
 
 /** Column for String values.
+  * Note: In string functions, we assume that ColumnStr contains valid UTF-8 
encoded data.
+  * However, ColumnStr is not guaranteed to always hold valid UTF-8, since it 
is also used
+  * as a serialization container where the content may be arbitrary binary 
data.
   */
 template <typename T>
 class ColumnStr final : public COWHelper<IColumn, ColumnStr<T>> {
@@ -514,6 +517,7 @@ public:
     }
 
     bool is_ascii() const;
+    bool is_valid_utf8() const;
 
     Chars& get_chars() { return chars; }
     const Chars& get_chars() const { return chars; }
diff --git a/be/src/exprs/function/function_string.cpp 
b/be/src/exprs/function/function_string.cpp
index adb90591e15..b080f22622e 100644
--- a/be/src/exprs/function/function_string.cpp
+++ b/be/src/exprs/function/function_string.cpp
@@ -44,6 +44,7 @@
 #include "exprs/function/string_hex_util.h"
 #include "util/string_search.hpp"
 #include "util/url_coding.h"
+#include "util/utf8_check.h"
 
 namespace doris {
 #include "common/compile_check_begin.h"
@@ -226,6 +227,29 @@ struct StringUtf8LengthImpl {
     }
 };
 
+struct NameIsValidUTF8 { // name tag for the is_valid_utf8 function
+    static constexpr auto name = "is_valid_utf8";
+};
+
+struct IsValidUTF8Impl { // per-row UTF-8 validity check producing a UInt8 column
+    using ReturnType = DataTypeUInt8;
+    static constexpr auto PrimitiveTypeImpl = PrimitiveType::TYPE_STRING;
+    using Type = String;
+    using ReturnColumnType = ColumnUInt8;
+
+    static Status vector(const ColumnString::Chars& data, const 
ColumnString::Offsets& offsets,
+                         PaddedPODArray<UInt8>& res) {
+        auto size = offsets.size();
+        res.resize(size); // one result slot per input row
+        for (size_t i = 0; i < size; ++i) {
+            const char* raw_str = reinterpret_cast<const 
char*>(&data[offsets[i - 1]]);
+            size_t str_size = offsets[i] - offsets[i - 1]; // NOTE(review): at i == 0 this reads offsets[-1]; presumably 0 via PaddedPODArray left padding -- confirm
+            res[i] = validate_utf8(raw_str, str_size) ? 1 : 0; // 1 = valid, 0 = invalid
+        }
+        return Status::OK();
+    }
+};
+
 struct NameStartsWith {
     static constexpr auto name = "starts_with";
 };
@@ -1305,6 +1329,7 @@ using FunctionStringLength = 
FunctionUnaryToType<StringLengthImpl, NameStringLen
 using FunctionCrc32 = FunctionUnaryToType<Crc32Impl, NameCrc32>;
 using FunctionStringUTF8Length = FunctionUnaryToType<StringUtf8LengthImpl, 
NameStringUtf8Length>;
 using FunctionStringSpace = FunctionUnaryToType<StringSpace, NameStringSpace>;
+using FunctionIsValidUTF8 = FunctionUnaryToType<IsValidUTF8Impl, 
NameIsValidUTF8>;
 using FunctionStringStartsWith =
         FunctionBinaryToType<DataTypeString, DataTypeString, 
StringStartsWithImpl, NameStartsWith>;
 using FunctionStringEndsWith =
@@ -1411,7 +1436,9 @@ void register_function_string(SimpleFunctionFactory& 
factory) {
     factory.register_function<FunctionSubReplace<SubReplaceThreeImpl>>();
     factory.register_function<FunctionSubReplace<SubReplaceFourImpl>>();
     factory.register_function<FunctionOverlay>();
+    factory.register_function<FunctionIsValidUTF8>();
 
+    factory.register_alias(FunctionIsValidUTF8::name, "isValidUTF8");
     factory.register_alias(FunctionToLower::name, "lcase");
     factory.register_alias(FunctionToUpper::name, "ucase");
     factory.register_alias(FunctionStringUTF8Length::name, "character_length");
diff --git a/be/test/core/column/column_string_test.cpp 
b/be/test/core/column/column_string_test.cpp
index 511ee87dff0..cf807f88615 100644
--- a/be/test/core/column/column_string_test.cpp
+++ b/be/test/core/column/column_string_test.cpp
@@ -1403,4 +1403,89 @@ TEST_F(ColumnStringTest, is_ascii) {
     }
 }
 
+TEST_F(ColumnStringTest, is_valid_utf8) { // column-level check: expects true only when every row is valid
+    // all ASCII strings are valid UTF-8
+    {
+        auto column = ColumnString::create();
+        column->insert_data("hello", 5);
+        column->insert_data("world", 5);
+        column->insert_data("123!@#", 6);
+        EXPECT_TRUE(column->is_valid_utf8());
+    }
+    // empty column is valid
+    {
+        auto column = ColumnString::create();
+        EXPECT_TRUE(column->is_valid_utf8());
+    }
+    // empty strings are valid UTF-8
+    {
+        auto column = ColumnString::create();
+        column->insert_data("", 0);
+        column->insert_data("", 0);
+        EXPECT_TRUE(column->is_valid_utf8());
+    }
+    // multi-byte UTF-8 characters
+    {
+        auto column = ColumnString::create();
+        column->insert_data("Hello, 世界", strlen("Hello, 世界")); // strlen yields the byte length, not the character count
+        column->insert_data("こんにちは", strlen("こんにちは"));
+        column->insert_data("😀", strlen("😀"));
+        EXPECT_TRUE(column->is_valid_utf8());
+    }
+    // invalid: lone continuation byte 0x80
+    {
+        auto column = ColumnString::create();
+        const char data[] = {'\x80'};
+        column->insert_data(data, 1);
+        EXPECT_FALSE(column->is_valid_utf8());
+    }
+    // invalid: bad 2-byte sequence 0xC3 0x28
+    {
+        auto column = ColumnString::create();
+        const char data[] = {'\xc3', '\x28'};
+        column->insert_data(data, 2);
+        EXPECT_FALSE(column->is_valid_utf8());
+    }
+    // invalid: overlong encoding 0xC0 0xAF
+    {
+        auto column = ColumnString::create();
+        const char data[] = {'\xc0', '\xaf'};
+        column->insert_data(data, 2);
+        EXPECT_FALSE(column->is_valid_utf8());
+    }
+    // invalid: 0xFE byte
+    {
+        auto column = ColumnString::create();
+        const char data[] = {'\xfe'};
+        column->insert_data(data, 1);
+        EXPECT_FALSE(column->is_valid_utf8());
+    }
+    // invalid: truncated 3-byte sequence 0xE4 0xB8
+    {
+        auto column = ColumnString::create();
+        const char data[] = {'\xe4', '\xb8'};
+        column->insert_data(data, 2);
+        EXPECT_FALSE(column->is_valid_utf8());
+    }
+    // mixed: one invalid byte makes the whole column invalid
+    {
+        auto column = ColumnString::create();
+        column->insert_data("hello", 5);
+        const char bad[] = {'\xff'};
+        column->insert_data(bad, 1);
+        column->insert_data("world", 5);
+        EXPECT_FALSE(column->is_valid_utf8());
+    }
+    // cross-row concatenation: "\xE4" + "\xB8\x96" form valid UTF-8 (世) when
+    // concatenated, but each row is invalid individually. Must validate 
per-row.
+    {
+        auto column = ColumnString::create();
+        const char row1[] = {'\xe4'};
+        const char row2[] = {'\xb8', '\x96'};
+        column->insert_data(row1, 1);
+        column->insert_data(row2, 2);
+        EXPECT_FALSE(column->is_valid_utf8());
+    }
+}
+
 } // namespace doris
\ No newline at end of file
diff --git a/be/test/exprs/function/function_string_test.cpp 
b/be/test/exprs/function/function_string_test.cpp
index 90456da258a..4aa9c2e0416 100644
--- a/be/test/exprs/function/function_string_test.cpp
+++ b/be/test/exprs/function/function_string_test.cpp
@@ -994,6 +994,36 @@ TEST(function_string_test, function_ascii_test) {
     check_function_all_arg_comb<DataTypeInt32, true>(func_name, input_types, 
data_set);
 }
 
+TEST(function_string_test, function_is_valid_utf8_test) { // expected values: 1 = valid, 0 = invalid, NULL in -> NULL out
+    std::string func_name = "is_valid_utf8";
+
+    InputTypeSet input_types = {PrimitiveType::TYPE_VARCHAR}; // single VARCHAR argument
+
+    DataSet data_set = {
+            // valid UTF-8 strings
+            {{std::string("hello")}, std::uint8_t(1)},
+            {{std::string("")}, std::uint8_t(1)},
+            {{std::string("Hello, 世界")}, std::uint8_t(1)},
+            {{std::string("こんにちは")}, std::uint8_t(1)},
+            {{std::string("123!@#")}, std::uint8_t(1)},
+            {{std::string("\xc3\xb1")}, std::uint8_t(1)},         // ñ
+            {{std::string("\xe2\x82\xac")}, std::uint8_t(1)},     // €
+            {{std::string("\xf0\x9f\x98\x80")}, std::uint8_t(1)}, // 😀
+            // invalid UTF-8 strings
+            {{std::string("\x80")}, std::uint8_t(0)},             // invalid 
leading byte
+            {{std::string("\xc3\x28")}, std::uint8_t(0)},         // invalid 
2-byte sequence
+            {{std::string("\xe2\x28\xa1")}, std::uint8_t(0)},     // invalid 
3-byte sequence
+            {{std::string("\xf0\x28\x8c\xbc")}, std::uint8_t(0)}, // invalid 
4-byte sequence
+            {{std::string("\xfe")}, std::uint8_t(0)},             // invalid 
byte 0xFE
+            {{std::string("\xff")}, std::uint8_t(0)},             // invalid 
byte 0xFF
+            {{std::string("abc\xc0\xaf")}, std::uint8_t(0)},      // overlong 
encoding
+            // NULL
+            {{Null()}, Null()},
+    };
+
+    check_function_all_arg_comb<DataTypeUInt8, true>(func_name, input_types, 
data_set);
+}
+
 TEST(function_string_test, function_char_length_test) {
     std::string func_name = "char_length";
 
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
index 21b25db7661..1f8b9c7b19a 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
@@ -279,6 +279,7 @@ import 
org.apache.doris.nereids.trees.expressions.functions.scalar.IsIpv4Mapped;
 import 
org.apache.doris.nereids.trees.expressions.functions.scalar.IsIpv4String;
 import 
org.apache.doris.nereids.trees.expressions.functions.scalar.IsIpv6String;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.IsNan;
+import org.apache.doris.nereids.trees.expressions.functions.scalar.IsValidUtf8;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonArray;
 import 
org.apache.doris.nereids.trees.expressions.functions.scalar.JsonArrayIgnoreNull;
 import 
org.apache.doris.nereids.trees.expressions.functions.scalar.JsonContains;
@@ -841,6 +842,7 @@ public class BuiltinScalarFunctions implements 
FunctionHelper {
             scalar(IsIpv6String.class, "is_ipv6_string", "is_ipv6"),
             scalar(IsIpAddressInRange.class, "is_ip_address_in_range"),
             scalar(IsNan.class, "isnan"),
+            scalar(IsValidUtf8.class, "is_valid_utf8", "isValidUTF8"),
             scalar(IsInf.class, "isinf"),
             scalar(Ipv4CIDRToRange.class, "ipv4_cidr_to_range"),
             scalar(Ipv6CIDRToRange.class, "ipv6_cidr_to_range"),
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/IsValidUtf8.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/IsValidUtf8.java
new file mode 100644
index 00000000000..0c045182785
--- /dev/null
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/IsValidUtf8.java
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.functions.scalar;
+
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.nereids.trees.expressions.Expression;
+import 
org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
+import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
+import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression;
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.BooleanType;
+import org.apache.doris.nereids.types.StringType;
+import org.apache.doris.nereids.types.VarcharType;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * ScalarFunction 'is_valid_utf8': one VARCHAR or STRING argument, BOOLEAN result.
+ */
+public class IsValidUtf8 extends ScalarFunction
+        implements UnaryExpression, ExplicitlyCastableSignature, 
PropagateNullable {
+
+    public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
+            
FunctionSignature.ret(BooleanType.INSTANCE).args(VarcharType.SYSTEM_DEFAULT),
+            
FunctionSignature.ret(BooleanType.INSTANCE).args(StringType.INSTANCE)
+    );
+
+    /**
+     * Constructor with 1 argument: the expression passed to is_valid_utf8.
+     */
+    public IsValidUtf8(Expression arg) {
+        super("is_valid_utf8", arg);
+    }
+
+    /** constructor for withChildren and reuse signature */
+    private IsValidUtf8(ScalarFunctionParams functionParams) {
+        super(functionParams);
+    }
+
+    /**
+     * Returns a new IsValidUtf8 built from the given children; exactly one child is required.
+     */
+    @Override
+    public IsValidUtf8 withChildren(List<Expression> children) {
+        Preconditions.checkArgument(children.size() == 1);
+        return new IsValidUtf8(getFunctionParams(children));
+    }
+
+    @Override
+    public List<FunctionSignature> getSignatures() {
+        return SIGNATURES;
+    }
+
+    @Override
+    public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+        return visitor.visitIsValidUtf8(this, context); // dispatch to the visitor hook for this function
+    }
+}
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
index 737f815561b..4b827b35dcc 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
@@ -298,6 +298,7 @@ import 
org.apache.doris.nereids.trees.expressions.functions.scalar.IsIpv4Mapped;
 import 
org.apache.doris.nereids.trees.expressions.functions.scalar.IsIpv4String;
 import 
org.apache.doris.nereids.trees.expressions.functions.scalar.IsIpv6String;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.IsNan;
+import org.apache.doris.nereids.trees.expressions.functions.scalar.IsValidUtf8;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonArray;
 import 
org.apache.doris.nereids.trees.expressions.functions.scalar.JsonArrayIgnoreNull;
 import 
org.apache.doris.nereids.trees.expressions.functions.scalar.JsonContains;
@@ -1702,6 +1703,10 @@ public interface ScalarFunctionVisitor<R, C> {
         return visitScalarFunction(isNan, context);
     }
 
+    default R visitIsValidUtf8(IsValidUtf8 isValidUtf8, C context) {
+        return visitScalarFunction(isValidUtf8, context); // default: delegate to the generic scalar-function handler
+    }
+
     default R visitIsInf(IsInf isInf, C context) {
         return visitScalarFunction(isInf, context);
     }
diff --git 
a/regression-test/data/query_p0/sql_functions/string_functions/test_is_valid_utf8.out
 
b/regression-test/data/query_p0/sql_functions/string_functions/test_is_valid_utf8.out
new file mode 100644
index 00000000000..4d1ba0a2de8
--- /dev/null
+++ 
b/regression-test/data/query_p0/sql_functions/string_functions/test_is_valid_utf8.out
@@ -0,0 +1,59 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !valid_1 --
+true
+
+-- !valid_2 --
+true
+
+-- !valid_3 --
+true
+
+-- !valid_4 --
+true
+
+-- !valid_5 --
+true
+
+-- !null_1 --
+\N
+
+-- !invalid_1 --
+false
+
+-- !invalid_2 --
+false
+
+-- !invalid_3 --
+false
+
+-- !invalid_4 --
+false
+
+-- !invalid_5 --
+false
+
+-- !invalid_6 --
+false
+
+-- !invalid_7 --
+false
+
+-- !invalid_8 --
+false
+
+-- !alias_1 --
+true
+
+-- !alias_2 --
+true
+
+-- !alias_3 --
+false
+
+-- !table_1 --
+1      true
+2      true
+3      true
+4      \N
+5      false
+6      false
diff --git 
a/regression-test/suites/query_p0/sql_functions/string_functions/test_is_valid_utf8.groovy
 
b/regression-test/suites/query_p0/sql_functions/string_functions/test_is_valid_utf8.groovy
new file mode 100644
index 00000000000..2883ab41976
--- /dev/null
+++ 
b/regression-test/suites/query_p0/sql_functions/string_functions/test_is_valid_utf8.groovy
@@ -0,0 +1,80 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_is_valid_utf8") { // covers literals, NULL, invalid bytes via unhex, the isValidUTF8 alias, table data and constant folding
+    // basic valid UTF-8 strings
+    qt_valid_1 "SELECT is_valid_utf8('hello');"
+    qt_valid_2 "SELECT is_valid_utf8('');"
+    qt_valid_3 "SELECT is_valid_utf8('Hello, 世界');"
+    qt_valid_4 "SELECT is_valid_utf8('こんにちは');"
+    qt_valid_5 "SELECT is_valid_utf8('123!@#');"
+
+    // NULL handling
+    qt_null_1 "SELECT is_valid_utf8(NULL);"
+
+    // invalid UTF-8 strings constructed via unhex
+    // 0x80: lone continuation byte
+    qt_invalid_1 "SELECT is_valid_utf8(unhex('80'));"
+    // 0xC3 0x28: invalid 2-byte sequence (second byte not continuation)
+    qt_invalid_2 "SELECT is_valid_utf8(unhex('C328'));"
+    // 0xE2 0x28 0xA1: invalid 3-byte sequence (second byte not continuation)
+    qt_invalid_3 "SELECT is_valid_utf8(unhex('E228A1'));"
+    // 0xF0 0x28 0x8C 0xBC: invalid 4-byte sequence (second byte not 
continuation)
+    qt_invalid_4 "SELECT is_valid_utf8(unhex('F0288CBC'));"
+    // 0xFE: not valid in UTF-8
+    qt_invalid_5 "SELECT is_valid_utf8(unhex('FE'));"
+    // 0xFF: not valid in UTF-8
+    qt_invalid_6 "SELECT is_valid_utf8(unhex('FF'));"
+    // overlong encoding of '/' (U+002F): 0xC0 0xAF
+    qt_invalid_7 "SELECT is_valid_utf8(unhex('C0AF'));"
+    // truncated 3-byte sequence: 0xE4 0xB8
+    qt_invalid_8 "SELECT is_valid_utf8(unhex('E4B8'));"
+
+    // alias isValidUTF8
+    qt_alias_1 "SELECT isValidUTF8('hello');"
+    qt_alias_2 "SELECT isValidUTF8('');"
+    // alias with invalid bytes
+    qt_alias_3 "SELECT isValidUTF8(unhex('80'));"
+
+    // test with table data (including invalid UTF-8 via unhex)
+    sql "DROP TABLE IF EXISTS test_is_valid_utf8_tbl"
+    sql """
+        CREATE TABLE test_is_valid_utf8_tbl (
+            id INT,
+            val VARCHAR(200)
+        ) DISTRIBUTED BY HASH(id) BUCKETS 1
+        PROPERTIES ("replication_num" = "1");
+    """
+
+    sql """
+        INSERT INTO test_is_valid_utf8_tbl VALUES
+        (1, 'hello'),
+        (2, ''),
+        (3, 'Hello, 世界'),
+        (4, NULL);
+    """
+    sql "INSERT INTO test_is_valid_utf8_tbl VALUES (5, unhex('C0AF'));" // row 5: overlong encoding bytes
+    sql "INSERT INTO test_is_valid_utf8_tbl VALUES (6, unhex('FF'));" // row 6: single 0xFF byte
+
+    order_qt_table_1 "SELECT id, is_valid_utf8(val) FROM 
test_is_valid_utf8_tbl ORDER BY id;"
+
+    // test fold const
+    testFoldConst("SELECT is_valid_utf8('hello');")
+    testFoldConst("SELECT is_valid_utf8('');")
+    testFoldConst("SELECT is_valid_utf8(NULL);")
+    testFoldConst("SELECT isValidUTF8('hello');")
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to