github-actions[bot] commented on code in PR #60892:
URL: https://github.com/apache/doris/pull/60892#discussion_r2931632741


##########
be/src/vec/functions/function_string.h:
##########
@@ -2163,44 +2158,143 @@ class FunctionSplitByString : public IFunction {
         }
     }
 
-    void split_empty_delimiter(const StringRef& str_ref, ColumnString::Chars& 
column_string_chars,
-                               ColumnString::Offsets& column_string_offsets,
-                               ColumnArray::Offset64& string_pos,
-                               ColumnArray::Offset64& dest_pos) const {
+    static void split_empty_delimiter(const StringRef& str_ref,
+                                      ColumnString::Chars& column_string_chars,
+                                      ColumnString::Offsets& 
column_string_offsets,
+                                      ColumnArray::Offset64& string_pos,
+                                      ColumnArray::Offset64& dest_pos, Int32 
limit_value) {
         const size_t old_size = column_string_chars.size();
         const size_t new_size = old_size + str_ref.size;
         column_string_chars.resize(new_size);
         memcpy(column_string_chars.data() + old_size, str_ref.data, 
str_ref.size);
-        if (simd::VStringFunctions::is_ascii(str_ref)) {
-            const auto size = str_ref.size;
-
-            const auto nested_old_size = column_string_offsets.size();
-            const auto nested_new_size = nested_old_size + size;
-            column_string_offsets.resize(nested_new_size);
-            std::iota(column_string_offsets.data() + nested_old_size,
-                      column_string_offsets.data() + nested_new_size, 
string_pos + 1);
-
-            string_pos += size;
-            dest_pos += size;
-            // The above code is equivalent to the code in the following 
comment.
-            // for (size_t i = 0; i < str_ref.size; i++) {
-            //     string_pos++;
-            //     column_string_offsets.push_back(string_pos);
-            //     (*dest_nested_null_map).push_back(false);
-            //     dest_pos++;
-            // }
+
+        if (limit_value > 0) {
+            // With limit: split character by character up to limit-1, then 
remainder
+            Int32 split_count = 0;
+            size_t i = 0;
+            if (simd::VStringFunctions::is_ascii(str_ref)) {
+                for (; i < str_ref.size; i++) {
+                    if (split_count == limit_value - 1) {
+                        // remainder
+                        string_pos += str_ref.size - i;
+                        column_string_offsets.push_back(string_pos);
+                        dest_pos++;
+                        return;
+                    }
+                    string_pos++;
+                    column_string_offsets.push_back(string_pos);
+                    dest_pos++;
+                    split_count++;
+                }
+            } else {
+                for (size_t utf8_char_len = 0; i < str_ref.size; i += 
utf8_char_len) {
+                    utf8_char_len = UTF8_BYTE_LENGTH[(unsigned 
char)str_ref.data[i]];
+                    if (split_count == limit_value - 1) {
+                        // remainder
+                        string_pos += str_ref.size - i;
+                        column_string_offsets.push_back(string_pos);
+                        dest_pos++;
+                        return;
+                    }
+                    string_pos += utf8_char_len;
+                    column_string_offsets.push_back(string_pos);
+                    dest_pos++;
+                    split_count++;
+                }
+            }
         } else {
-            for (size_t i = 0, utf8_char_len = 0; i < str_ref.size; i += 
utf8_char_len) {
-                utf8_char_len = UTF8_BYTE_LENGTH[(unsigned 
char)str_ref.data[i]];
+            // No limit: original behavior
+            if (simd::VStringFunctions::is_ascii(str_ref)) {
+                const auto size = str_ref.size;
+
+                const auto nested_old_size = column_string_offsets.size();
+                const auto nested_new_size = nested_old_size + size;
+                column_string_offsets.resize(nested_new_size);
+                std::iota(column_string_offsets.data() + nested_old_size,
+                          column_string_offsets.data() + nested_new_size, 
string_pos + 1);
+
+                string_pos += size;
+                dest_pos += size;
+            } else {
+                for (size_t i = 0, utf8_char_len = 0; i < str_ref.size; i += 
utf8_char_len) {
+                    utf8_char_len = UTF8_BYTE_LENGTH[(unsigned 
char)str_ref.data[i]];
 
-                string_pos += utf8_char_len;
-                column_string_offsets.push_back(string_pos);
-                dest_pos++;
+                    string_pos += utf8_char_len;
+                    column_string_offsets.push_back(string_pos);
+                    dest_pos++;
+                }
             }
         }
     }
 };
 
+struct SplitByStringTwoArgImpl {
+    static DataTypes get_variadic_argument_types() {
+        return {std::make_shared<DataTypeString>(), 
std::make_shared<DataTypeString>()};
+    }
+
+    static Status execute_impl(FunctionContext* /*context*/, Block& block,
+                               const ColumnNumbers& arguments, uint32_t result,
+                               size_t input_rows_count) {
+        DCHECK_EQ(arguments.size(), 2);
+        return SplitByStringExecutor::execute_core(block, arguments, result, 
input_rows_count, -1);
+    }
+};
+
+struct SplitByStringThreeArgImpl {
+    static DataTypes get_variadic_argument_types() {
+        return {std::make_shared<DataTypeString>(), 
std::make_shared<DataTypeString>(),
+                std::make_shared<DataTypeInt32>()};
+    }
+
+    static Status execute_impl(FunctionContext* /*context*/, Block& block,
+                               const ColumnNumbers& arguments, uint32_t result,
+                               size_t input_rows_count) {
+        DCHECK_EQ(arguments.size(), 3);
+        const auto& [limit_column, limit_is_const] =
+                unpack_if_const(block.get_by_position(arguments[2]).column);
+        DCHECK(limit_is_const) << "limit argument of split_by_string must be a 
constant";
+        auto limit_value = assert_cast<const 
ColumnInt32&>(*limit_column).get_element(0);

Review Comment:
   **Issue [Medium]**: `DCHECK(limit_is_const)` disappears in RELEASE builds. 
If this invariant is ever violated in production, the code would silently read 
`get_element(0)` from a non-const column, using the first row's value as the 
limit for all rows — producing **silently wrong results** rather than crashing.
   
   While the FE enforces this constraint via `checkLegalityAfterRewrite()`, 
defensive checks in BE should not rely solely on FE validation. Per project 
standards, `DCHECK` should not be used for invariants that, when violated, 
would cause silent data corruption.
   
   Suggestion: Replace with `DORIS_CHECK` or add a proper error return:
   ```cpp
   if (!limit_is_const) {
       return Status::InvalidArgument("limit argument of split_by_string must 
be a constant");
   }
   ```
   
   Note: The rest of the file uses `DCHECK` consistently (file-wide technical 
debt), but this particular invariant violation would cause silent wrong results 
rather than just suboptimal behavior, making it higher priority to protect.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to