This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 570771343b GH-49438: [C++][Gandiva] Optimize LPAD/RPAD functions 
(#49439)
570771343b is described below

commit 570771343b2666e801e5d289e33cca22243a6bf1
Author: Dmitry Chirkov <[email protected]>
AuthorDate: Thu Mar 12 21:40:29 2026 -0700

    GH-49438: [C++][Gandiva] Optimize LPAD/RPAD functions (#49439)
    
    ### Rationale for this change
    The `lpad_utf8_int32_utf8` and `rpad_utf8_int32_utf8` functions have 
performance inefficiency and a potential memory safety issue:
    1. **Performance**: Single-byte fills iterate character-by-character when 
`memset` would suffice. Multi-byte fills use O(n) iterations instead of O(log 
n) with a doubling strategy.
    2. **Memory safety**: When the fill string is longer than the padding space 
needed, the code could write more bytes than allocated. Fixed preventatively.
    
    ### What changes are included in this PR?
    1. **Memory safety fix**: Use `std::min(fill_text_len, total_fill_bytes)` 
for the initial copy to prevent overflow
    2. **Fast path**: Add single-byte fill optimization using `memset`
    3. **General path**: Replace character-by-character loop with doubling 
strategy for multi-byte fills
    4. **Tests**: Add comprehensive tests for the new code paths
    
    ### Are these changes tested?
    Yes. Added tests covering:
    - Large UTF-8 fill characters (4-byte emoji, 3-byte Chinese)
    - Single-byte fill boundaries (1 char and 65536 char padding)
    - Content verification for fill patterns
    - Doubling strategy boundaries
    - Partial fill scenarios (fill text longer than padding needed)
    
    ### Are there any user-facing changes?
    No.
    * GitHub Issue: #49438
    
    Authored-by: Dmitry Chirkov <[email protected]>
    Signed-off-by: Sutou Kouhei <[email protected]>
---
 cpp/src/gandiva/precompiled/string_ops.cc      | 140 +++++++++++--------
 cpp/src/gandiva/precompiled/string_ops_test.cc | 186 +++++++++++++++++++++++++
 2 files changed, 266 insertions(+), 60 deletions(-)

diff --git a/cpp/src/gandiva/precompiled/string_ops.cc 
b/cpp/src/gandiva/precompiled/string_ops.cc
index 0b787f461c..54243085f7 100644
--- a/cpp/src/gandiva/precompiled/string_ops.cc
+++ b/cpp/src/gandiva/precompiled/string_ops.cc
@@ -1966,6 +1966,23 @@ gdv_int32 evaluate_return_char_length(gdv_int32 
text_len, gdv_int32 actual_text_
   return return_char_length;
 }
 
+// Fill a buffer with repeated fill_text using O(log n) doubling strategy
+static FORCE_INLINE void fill_buffer_with_pattern(gdv_binary dest,
+                                                  gdv_int32 total_fill_bytes,
+                                                  const char* fill_text,
+                                                  gdv_int32 fill_text_len) {
+  gdv_int32 initial_copy = std::min(fill_text_len, total_fill_bytes);
+  memcpy(dest, fill_text, initial_copy);
+  gdv_int32 written = initial_copy;
+  while (written * 2 <= total_fill_bytes) {
+    memcpy(dest + written, dest, written);
+    written *= 2;
+  }
+  if (written < total_fill_bytes) {
+    memcpy(dest + written, dest, total_fill_bytes - written);
+  }
+}
+
 FORCE_INLINE
 const char* lpad_utf8_int32_utf8(gdv_int64 context, const char* text, 
gdv_int32 text_len,
                                  gdv_int32 return_length, const char* 
fill_text,
@@ -1988,48 +2005,49 @@ const char* lpad_utf8_int32_utf8(gdv_int64 context, 
const char* text, gdv_int32
     // fill into text but "fill_text" is empty, then return text directly.
     *out_len = text_len;
     return text;
-  } else if (return_length < actual_text_len) {
+  }
+  if (return_length < actual_text_len) {
     // case where it truncates the result on return length.
     *out_len = utf8_byte_pos(context, text, text_len, return_length);
     return text;
-  } else {
-    // case (return_length > actual_text_len)
-    // case where it needs to copy "fill_text" on the string left. The total 
number
-    // of chars to copy is given by (return_length -  actual_text_len)
-    gdv_int32 return_char_length = evaluate_return_char_length(
-        text_len, actual_text_len, return_length, fill_text, fill_text_len);
-    char* ret = reinterpret_cast<gdv_binary>(
-        gdv_fn_context_arena_malloc(context, return_char_length));
+  }
+
+  gdv_int32 chars_to_pad = return_length - actual_text_len;
+
+  // FAST PATH: Single-byte fill (most common - space padding)
+  if (fill_text_len == 1) {
+    gdv_int32 out_len_bytes = chars_to_pad + text_len;
+    gdv_binary ret =
+        reinterpret_cast<gdv_binary>(gdv_fn_context_arena_malloc(context, 
out_len_bytes));
     if (ret == nullptr) {
       gdv_fn_context_set_error_msg(context,
                                    "Could not allocate memory for output 
string");
       *out_len = 0;
       return "";
     }
-    // try to fulfill the return string with the "fill_text" continuously
-    int32_t copied_chars_count = 0;
-    int32_t copied_chars_position = 0;
-    while (copied_chars_count < return_length - actual_text_len) {
-      int32_t char_len;
-      int32_t fill_index;
-      // for each char, evaluate its length to consider it when mem copying
-      for (fill_index = 0; fill_index < fill_text_len; fill_index += char_len) 
{
-        if (copied_chars_count >= return_length - actual_text_len) {
-          break;
-        }
-        char_len = utf8_char_length(fill_text[fill_index]);
-        // ignore invalid char on the fill text, considering it as size 1
-        if (char_len == 0) char_len += 1;
-        copied_chars_count++;
-      }
-      memcpy(ret + copied_chars_position, fill_text, fill_index);
-      copied_chars_position += fill_index;
-    }
-    // after fulfilling the text, copy the main string
-    memcpy(ret + copied_chars_position, text, text_len);
-    *out_len = copied_chars_position + text_len;
+    memset(ret, fill_text[0], chars_to_pad);
+    memcpy(ret + chars_to_pad, text, text_len);
+    *out_len = out_len_bytes;
     return ret;
   }
+
+  // GENERAL PATH: Multi-byte fill - use evaluate_return_char_length for 
buffer size
+  gdv_int32 return_char_length = evaluate_return_char_length(
+      text_len, actual_text_len, return_length, fill_text, fill_text_len);
+  gdv_binary ret = reinterpret_cast<gdv_binary>(
+      gdv_fn_context_arena_malloc(context, return_char_length));
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
+    *out_len = 0;
+    return "";
+  }
+
+  // Fill padding region using doubling strategy, then append text
+  gdv_int32 total_fill_bytes = return_char_length - text_len;
+  fill_buffer_with_pattern(ret, total_fill_bytes, fill_text, fill_text_len);
+  memcpy(ret + total_fill_bytes, text, text_len);
+  *out_len = return_char_length;
+  return ret;
 }
 
 FORCE_INLINE
@@ -2054,47 +2072,49 @@ const char* rpad_utf8_int32_utf8(gdv_int64 context, 
const char* text, gdv_int32
     // fill into text but "fill_text" is empty, then return text directly.
     *out_len = text_len;
     return text;
-  } else if (return_length < actual_text_len) {
+  }
+  if (return_length < actual_text_len) {
     // case where it truncates the result on return length.
     *out_len = utf8_byte_pos(context, text, text_len, return_length);
     return text;
-  } else {
-    // case (return_length > actual_text_len)
-    // case where it needs to copy "fill_text" on the string right
-    gdv_int32 return_char_length = evaluate_return_char_length(
-        text_len, actual_text_len, return_length, fill_text, fill_text_len);
-    char* ret = reinterpret_cast<gdv_binary>(
-        gdv_fn_context_arena_malloc(context, return_char_length));
+  }
+
+  gdv_int32 chars_to_pad = return_length - actual_text_len;
+
+  // FAST PATH: Single-byte fill (most common - space padding)
+  if (fill_text_len == 1) {
+    gdv_int32 out_len_bytes = chars_to_pad + text_len;
+    gdv_binary ret =
+        reinterpret_cast<gdv_binary>(gdv_fn_context_arena_malloc(context, 
out_len_bytes));
     if (ret == nullptr) {
       gdv_fn_context_set_error_msg(context,
                                    "Could not allocate memory for output 
string");
       *out_len = 0;
       return "";
     }
-    // fulfill the initial text copying the main input string
     memcpy(ret, text, text_len);
-    // try to fulfill the return string with the "fill_text" continuously
-    int32_t copied_chars_count = 0;
-    int32_t copied_chars_position = 0;
-    while (actual_text_len + copied_chars_count < return_length) {
-      int32_t char_len;
-      int32_t fill_length;
-      // for each char, evaluate its length to consider it when mem copying
-      for (fill_length = 0; fill_length < fill_text_len; fill_length += 
char_len) {
-        if (actual_text_len + copied_chars_count >= return_length) {
-          break;
-        }
-        char_len = utf8_char_length(fill_text[fill_length]);
-        // ignore invalid char on the fill text, considering it as size 1
-        if (char_len == 0) char_len += 1;
-        copied_chars_count++;
-      }
-      memcpy(ret + text_len + copied_chars_position, fill_text, fill_length);
-      copied_chars_position += fill_length;
-    }
-    *out_len = copied_chars_position + text_len;
+    memset(ret + text_len, fill_text[0], chars_to_pad);
+    *out_len = out_len_bytes;
     return ret;
   }
+
+  // GENERAL PATH: Multi-byte fill - use evaluate_return_char_length for 
buffer size
+  gdv_int32 return_char_length = evaluate_return_char_length(
+      text_len, actual_text_len, return_length, fill_text, fill_text_len);
+  gdv_binary ret = reinterpret_cast<gdv_binary>(
+      gdv_fn_context_arena_malloc(context, return_char_length));
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
+    *out_len = 0;
+    return "";
+  }
+
+  // Copy text first, then fill padding region using doubling strategy
+  memcpy(ret, text, text_len);
+  gdv_int32 total_fill_bytes = return_char_length - text_len;
+  fill_buffer_with_pattern(ret + text_len, total_fill_bytes, fill_text, 
fill_text_len);
+  *out_len = return_char_length;
+  return ret;
 }
 
 FORCE_INLINE
diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc 
b/cpp/src/gandiva/precompiled/string_ops_test.cc
index e0248667e3..51174c1113 100644
--- a/cpp/src/gandiva/precompiled/string_ops_test.cc
+++ b/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -1318,6 +1318,99 @@ TEST(TestStringOps, TestLpadString) {
 
   out_str = lpad_utf8_int32(ctx_ptr, "TestString", 10, -1, &out_len);
   EXPECT_EQ(std::string(out_str, out_len), "");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "x", 1, 65536, "😀", 4, &out_len);
+  EXPECT_EQ(out_len, 65535 * 4 + 1);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_str[out_len - 1], 'x');
+  EXPECT_EQ(std::string_view(out_str, 4), "😀");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "A", 1, 65536, "哈", 3, &out_len);
+  EXPECT_EQ(out_len, 65535 * 3 + 1);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_str[out_len - 1], 'A');
+  EXPECT_EQ(std::string_view(out_str, 3), "哈");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 2, ".", 1, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), ".X");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "Z", 1, 65536, "@", 1, &out_len);
+  EXPECT_EQ(out_len, 65536);
+  for (int i = 0; i < 100; i++) {
+    EXPECT_EQ(out_str[i], '@') << "Mismatch at position " << i;
+  }
+  EXPECT_EQ(out_str[out_len - 1], 'Z');
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "END", 3, 11, "ab", 2, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "ababababEND");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "END", 3, 10, "abc", 3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "abcabcaEND");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 5, "αβ", 4, &out_len);
+  EXPECT_EQ(out_len, 9);
+  EXPECT_EQ(std::string(out_str, out_len), "αβαβX");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "Y", 1, 4, "中文", 6, &out_len);
+  EXPECT_EQ(out_len, 10);
+  EXPECT_EQ(std::string(out_str, out_len), "中文中Y");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 4, "abc", 3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "abcX");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 7, "abc", 3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "abcabcX");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 13, "abc", 3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "abcabcabcabcX");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 10, "abc", 3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "abcabcabcX");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "E", 1, 129, "ab", 2, &out_len);
+  EXPECT_EQ(out_len, 129);
+  EXPECT_EQ(out_str[0], 'a');
+  EXPECT_EQ(out_str[1], 'b');
+  EXPECT_EQ(out_str[126], 'a');
+  EXPECT_EQ(out_str[127], 'b');
+  EXPECT_EQ(out_str[128], 'E');
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "E", 1, 127, "ab", 2, &out_len);
+  EXPECT_EQ(out_len, 127);
+  EXPECT_EQ(out_str[0], 'a');
+  EXPECT_EQ(out_str[125], 'b');
+  EXPECT_EQ(out_str[126], 'E');
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "X", 1, 2, "abc", 3, &out_len);
+  EXPECT_EQ(out_len, 2);
+  EXPECT_EQ(std::string(out_str, out_len), "aX");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "Y", 1, 3, "abcde", 5, &out_len);
+  EXPECT_EQ(out_len, 3);
+  EXPECT_EQ(std::string(out_str, out_len), "abY");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "Z", 1, 2, "αβ", 4, &out_len);
+  EXPECT_EQ(out_len, 3);
+  EXPECT_EQ(std::string(out_str, out_len), "αZ");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "A", 1, 2, "中文字", 9, &out_len);
+  EXPECT_EQ(out_len, 4);
+  EXPECT_EQ(std::string(out_str, out_len), "中A");
+
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, "B", 1, 3, "中文字", 9, &out_len);
+  EXPECT_EQ(out_len, 7);
+  EXPECT_EQ(std::string(out_str, out_len), "中文B");
+
+  std::string large_text(5000, 'X');
+  std::string large_fill;
+  for (int i = 0; i < 50; ++i) {
+    large_fill += "α";
+  }
+  out_str = lpad_utf8_int32_utf8(ctx_ptr, large_text.c_str(), 5000, 5001,
+                                 large_fill.c_str(), 100, &out_len);
+  EXPECT_EQ(out_len, 5002);
+  EXPECT_EQ(std::string(out_str, 2), "α");
+  EXPECT_EQ(std::string(out_str + 2, 5000), large_text);
 }
 
 TEST(TestStringOps, TestRpadString) {
@@ -1396,6 +1489,99 @@ TEST(TestStringOps, TestRpadString) {
 
   out_str = rpad_utf8_int32(ctx_ptr, "TestString", 10, -1, &out_len);
   EXPECT_EQ(std::string(out_str, out_len), "");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "x", 1, 65536, "😀", 4, &out_len);
+  EXPECT_EQ(out_len, 1 + 65535 * 4);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_str[0], 'x');
+  EXPECT_EQ(std::string_view(out_str + out_len - 4, 4), "😀");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "A", 1, 65536, "哈", 3, &out_len);
+  EXPECT_EQ(out_len, 1 + 65535 * 3);
+  EXPECT_FALSE(ctx.has_error());
+  EXPECT_EQ(out_str[0], 'A');
+  EXPECT_EQ(std::string_view(out_str + out_len - 3, 3), "哈");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 2, ".", 1, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "X.");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "Z", 1, 65536, "@", 1, &out_len);
+  EXPECT_EQ(out_len, 65536);
+  EXPECT_EQ(out_str[0], 'Z');
+  for (int i = 1; i < 100; i++) {
+    EXPECT_EQ(out_str[i], '@') << "Mismatch at position " << i;
+  }
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "BEG", 3, 11, "ab", 2, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "BEGabababab");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "BEG", 3, 10, "abc", 3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "BEGabcabca");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 5, "αβ", 4, &out_len);
+  EXPECT_EQ(out_len, 9);
+  EXPECT_EQ(std::string(out_str, out_len), "Xαβαβ");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "Y", 1, 4, "中文", 6, &out_len);
+  EXPECT_EQ(out_len, 10);
+  EXPECT_EQ(std::string(out_str, out_len), "Y中文中");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 4, "abc", 3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Xabc");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 7, "abc", 3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Xabcabc");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 13, "abc", 3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Xabcabcabcabc");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 10, "abc", 3, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Xabcabcabc");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "S", 1, 129, "ab", 2, &out_len);
+  EXPECT_EQ(out_len, 129);
+  EXPECT_EQ(out_str[0], 'S');
+  EXPECT_EQ(out_str[1], 'a');
+  EXPECT_EQ(out_str[2], 'b');
+  EXPECT_EQ(out_str[127], 'a');
+  EXPECT_EQ(out_str[128], 'b');
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "S", 1, 127, "ab", 2, &out_len);
+  EXPECT_EQ(out_len, 127);
+  EXPECT_EQ(out_str[0], 'S');
+  EXPECT_EQ(out_str[125], 'a');
+  EXPECT_EQ(out_str[126], 'b');
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "X", 1, 2, "abc", 3, &out_len);
+  EXPECT_EQ(out_len, 2);
+  EXPECT_EQ(std::string(out_str, out_len), "Xa");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "Y", 1, 3, "abcde", 5, &out_len);
+  EXPECT_EQ(out_len, 3);
+  EXPECT_EQ(std::string(out_str, out_len), "Yab");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "Z", 1, 2, "αβ", 4, &out_len);
+  EXPECT_EQ(out_len, 3);
+  EXPECT_EQ(std::string(out_str, out_len), "Zα");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "A", 1, 2, "中文字", 9, &out_len);
+  EXPECT_EQ(out_len, 4);
+  EXPECT_EQ(std::string(out_str, out_len), "A中");
+
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, "B", 1, 3, "中文字", 9, &out_len);
+  EXPECT_EQ(out_len, 7);
+  EXPECT_EQ(std::string(out_str, out_len), "B中文");
+
+  std::string large_text(5000, 'X');
+  std::string large_fill;
+  for (int i = 0; i < 50; ++i) {
+    large_fill += "α";
+  }
+  out_str = rpad_utf8_int32_utf8(ctx_ptr, large_text.c_str(), 5000, 5001,
+                                 large_fill.c_str(), 100, &out_len);
+  EXPECT_EQ(out_len, 5002);
+  EXPECT_EQ(std::string(out_str, 5000), large_text);
+  EXPECT_EQ(std::string(out_str + 5000, 2), "α");
 }
 
 TEST(TestStringOps, TestRtrim) {

Reply via email to