Re: [PR] GH-49753: [C++][Gandiva] Fix overflow in string functions [arrow]

via GitHub Sun, 10 May 2026 14:10:11 -0700


Copilot commented on code in PR #49813:
URL: https://github.com/apache/arrow/pull/49813#discussion_r3215526041



##########
cpp/src/gandiva/precompiled/string_ops_test.cc:
##########
@@ -1165,6 +1165,18 @@ TEST(TestStringOps, TestQuote) {
   out_str = quote_utf8(ctx_ptr, "'''''''''", 9, &out_len);
   EXPECT_EQ(std::string(out_str, out_len), "'\\'\\'\\'\\'\\'\\'\\'\\'\\''");
   EXPECT_FALSE(ctx.has_error());
+
+  int32_t bad_in_len = std::numeric_limits<int32_t>::max() / 2 + 1;
+  out_str = quote_utf8(ctx_ptr, "YYZ", bad_in_len, &out_len);
+  EXPECT_EQ(ctx.get_error(), "Memory allocation size too large.");
+  EXPECT_EQ(out_len, 0);
+  EXPECT_STREQ(out_str, "");
+
+  bad_in_len = std::numeric_limits<int32_t>::max() / 2 + 20;
+  out_str = quote_utf8(ctx_ptr, "ABCDE", bad_in_len, &out_len);

Review Comment:
   `ExecutionContext::set_error_msg` records only the *first* error. Since 
`ctx.Reset()` isn't called between these two overflow cases, the second 
`EXPECT_EQ(ctx.get_error(), ...)` will still pass even if the second 
`quote_utf8` call fails to set an error. Reset the context before the second 
call (or assert `ctx.has_error()` immediately after each call) so both cases 
are independently validated.
   



##########
cpp/src/gandiva/precompiled/string_ops.cc:
##########
@@ -2824,13 +2777,31 @@ const char* elt_int32_utf8_utf8_utf8_utf8_utf8(
 FORCE_INLINE
 const char* to_hex_binary(int64_t context, const char* text, int32_t text_len,
                           int32_t* out_len) {
-  if (text_len == 0) {
+  if (ARROW_PREDICT_FALSE(text_len < 0)) {
+    gdv_fn_context_set_error_msg(context, "Text length invalid(negative).");
     *out_len = 0;
     return "";
   }
 
-  auto ret =
-      reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, text_len * 
2 + 1));
+  int32_t double_len = 0;
+  // Check multiply overflow for text_len
+  if (ARROW_PREDICT_FALSE(
+          arrow::internal::MultiplyWithOverflow(2, text_len, &double_len))) {
+    gdv_fn_context_set_error_msg(context, "Memory allocation size too large.");
+    *out_len = 0;
+    return "";
+  }
+
+  int32_t alloc_length = 0;
+  // Check add overflow for text_len
+  if (ARROW_PREDICT_FALSE(
+          arrow::internal::AddWithOverflow(1, double_len, &alloc_length))) {
+    gdv_fn_context_set_error_msg(context, "Memory allocation size too large");

Review Comment:
   Same as above: the `AddWithOverflow` branch uses "Memory allocation size too 
large" (no period) while the multiply branch uses "Memory allocation size too 
large.". Standardizing the message avoids inconsistent behavior depending on 
which overflow triggers.
   



##########
cpp/src/gandiva/precompiled/string_ops.cc:
##########
@@ -1924,9 +1924,27 @@ const char* quote_utf8(gdv_int64 context, const char* 
in, gdv_int32 in_len,
     *out_len = 0;
     return "";
   }
+
+  int32_t double_len = 0;
+  // Test multiply overflow for in_len
+  if (ARROW_PREDICT_FALSE(
+          arrow::internal::MultiplyWithOverflow(2, in_len, &double_len))) {
+    gdv_fn_context_set_error_msg(context, "Memory allocation size too large.");
+    *out_len = 0;
+    return "";
+  }
+
+  int32_t alloc_length = 0;
+  // Test add overflow for in_len
+  if (ARROW_PREDICT_FALSE(
+          arrow::internal::AddWithOverflow(2, double_len, &alloc_length))) {
+    gdv_fn_context_set_error_msg(context, "Memory allocation size too large");

Review Comment:
   The overflow error message is inconsistent: `MultiplyWithOverflow` uses 
"Memory allocation size too large." (with period) while the `AddWithOverflow` 
branch uses "Memory allocation size too large" (no period). This makes error 
handling/tests brittle depending on which overflow triggers. Please standardize 
the message across both branches.
   



##########
cpp/src/gandiva/precompiled/string_ops.cc:
##########
@@ -2444,182 +2462,173 @@ void concat_word(char* out_buf, int* out_idx, const 
char* in_buf, int in_len,
   *out_idx += in_len;
 }
 
-FORCE_INLINE
-const char* concat_ws_utf8_utf8(int64_t context, const char* separator,
-                                int32_t separator_len, bool separator_validity,
-                                const char* word1, int32_t word1_len, bool 
word1_validity,
-                                const char* word2, int32_t word2_len, bool 
word2_validity,
-                                bool* out_valid, int32_t* out_len) {
-  *out_len = 0;
-  int numValidInput = 0;
-  // If separator is null, always return null
-  if (!separator_validity) {
-    *out_len = 0;
-    *out_valid = false;
-    return "";
-  }
+// Helper structure to maintain state during safe length accumulation
+struct SafeLengthState {
+  int32_t total_len = 0;
+  int32_t num_valid = 0;
+  bool overflow = false;
+};
 
-  if (word1_validity) {
-    *out_len += word1_len;
-    numValidInput++;
+// Helper to safely add a word length
+static inline bool safe_accumulate_word(SafeLengthState& state, int32_t 
word_len,
+                                        bool word_validity) {
+  if (not word_validity) return true;
+
+  if (word_len < 0) {
+    return false;
   }
-  if (word2_validity) {
-    *out_len += word2_len;
-    numValidInput++;
+
+  int32_t temp = 0;
+  if (ARROW_PREDICT_FALSE(
+          arrow::internal::AddWithOverflow(state.total_len, word_len, &temp))) 
{
+    state.overflow = true;
+    return false;
   }
+  state.total_len = temp;
+  state.num_valid++;
+  return true;
+}
 
-  *out_len += separator_len * (numValidInput > 1 ? numValidInput - 1 : 0);
-  if (*out_len == 0) {
-    *out_valid = true;
-    return "";
+// Helper to safely add separators based on number of valid words
+static inline bool safe_add_separators(SafeLengthState* state, int32_t 
separator_len) {
+  if (state->num_valid <= 1) return true;
+
+  int32_t sep_total = 0;
+  int32_t temp = 0;
+
+  if (ARROW_PREDICT_FALSE(arrow::internal::MultiplyWithOverflow(
+          separator_len, state->num_valid - 1, &sep_total))) {
+    state->overflow = true;
+    return false;
   }
 
-  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
*out_len));
-  if (out == nullptr) {
-    gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
-    *out_len = 0;
-    *out_valid = false;
-    return "";
+  if (ARROW_PREDICT_FALSE(
+          arrow::internal::AddWithOverflow(state->total_len, sep_total, 
&temp))) {
+    state->overflow = true;
+    return false;
   }
 
-  char* tmp = out;
-  int out_idx = 0;
-  bool seenAnyValidInput = false;
+  state->total_len = temp;
+  return true;
+}
 
-  concat_word(tmp, &out_idx, word1, word1_len, word1_validity, separator, 
separator_len,
-              &seenAnyValidInput);
-  concat_word(tmp, &out_idx, word2, word2_len, word2_validity, separator, 
separator_len,
-              &seenAnyValidInput);
+// Helper to handle overflow failure (sets output parameters and returns empty 
string)
+static inline const char* handle_overflow_failure(bool* out_valid, int32_t* 
out_len) {
+  *out_len = 0;
+  *out_valid = false;
+  return "";
+}
 
+// Helper to handle empty result (all words invalid)
+static inline const char* handle_empty_result(bool* out_valid, int32_t* 
out_len) {
+  *out_len = 0;
   *out_valid = true;
-  *out_len = out_idx;
-  return out;
+  return "";
 }
 
-FORCE_INLINE
-const char* concat_ws_utf8_utf8_utf8(
-    int64_t context, const char* separator, int32_t separator_len,
-    bool separator_validity, const char* word1, int32_t word1_len, bool 
word1_validity,
-    const char* word2, int32_t word2_len, bool word2_validity, const char* 
word3,
-    int32_t word3_len, bool word3_validity, bool* out_valid, int32_t* out_len) 
{
+struct WordArg {
+  const char* data;
+  int32_t len;
+  bool valid;
+};
+
+static inline const char* concat_ws_impl(int64_t context, const char* 
separator,
+                                         int32_t separator_len, bool 
separator_validity,
+                                         bool* out_valid, int32_t* out_len,
+                                         std::initializer_list<WordArg> words) 
{
   *out_len = 0;
-  int numValidInput = 0;
-  // If separator is null, always return null
-  if (!separator_validity) {
-    *out_len = 0;
+
+  // Separator validity check
+  if (not separator_validity) {
     *out_valid = false;
     return "";
   }
-
-  if (word1_validity) {
-    *out_len += word1_len;
-    numValidInput++;
-  }
-  if (word2_validity) {
-    *out_len += word2_len;
-    numValidInput++;
+  if (separator_len < 0) {
+    *out_valid = false;
+    return "";
   }
-  if (word3_validity) {
-    *out_len += word3_len;
-    numValidInput++;
+
+  SafeLengthState state;
+
+  // Accumulate all word lengths safely
+  for (const WordArg& w : words) {
+    if (not safe_accumulate_word(state, w.len, w.valid)) {
+      *out_len = 0;
+      *out_valid = false;
+      return "";
+    }

Review Comment:
   `safe_accumulate_word`/`safe_add_separators` detect negative lengths and 
arithmetic overflow, but `concat_ws_impl` returns an invalid result 
(`*out_valid = false`) without setting an error message in the execution 
context. Since other string functions in this TU set a context error on invalid 
negative lengths/overflow, consider setting an explicit error message here too 
(e.g., distinguish negative length vs overflow) to make failures diagnosable 
and consistent.



##########
cpp/src/gandiva/precompiled/string_ops.cc:
##########
@@ -2824,13 +2777,31 @@ const char* elt_int32_utf8_utf8_utf8_utf8_utf8(
 FORCE_INLINE
 const char* to_hex_binary(int64_t context, const char* text, int32_t text_len,
                           int32_t* out_len) {
-  if (text_len == 0) {
+  if (ARROW_PREDICT_FALSE(text_len < 0)) {
+    gdv_fn_context_set_error_msg(context, "Text length invalid(negative).");
     *out_len = 0;
     return "";
   }
 

Review Comment:
   `to_hex_binary` no longer special-cases `text_len == 0`, so it allocates a 
1-byte buffer and returns it without writing a NUL terminator. Since callers 
may treat the returned pointer as a C-string when `*out_len == 0`, this can 
expose uninitialized memory reads / non-terminated strings. Consider restoring 
the `text_len == 0` early-return (returning ""), or at least setting `ret[0] = 
'\0'` before returning.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] GH-49753: [C++][Gandiva] Fix overflow in string functions [arrow]

Reply via email to