Copilot commented on code in PR #49813:
URL: https://github.com/apache/arrow/pull/49813#discussion_r3123127208


##########
cpp/src/gandiva/precompiled/string_ops_test.cc:
##########
@@ -1165,6 +1165,11 @@ TEST(TestStringOps, TestQuote) {
   out_str = quote_utf8(ctx_ptr, "'''''''''", 9, &out_len);
   EXPECT_EQ(std::string(out_str, out_len), "'\\'\\'\\'\\'\\'\\'\\'\\'\\''");
   EXPECT_FALSE(ctx.has_error());
+
+  int32_t bad_in_len = std::numeric_limits<int32_t>::max() / 2 + 20;
+  out_str = quote_utf8(ctx_ptr, "ABCDE", bad_in_len, &out_len);
+  EXPECT_EQ(out_len, 0);
+  EXPECT_EQ(out_str, "");

Review Comment:
   These assertions compare `const char*` pointers (`EXPECT_EQ(out_str, "")`) 
rather than string contents, which is brittle (string literal addresses are not 
guaranteed to match across compilation units). Use `EXPECT_STREQ(out_str, "")` 
or compare `std::string(out_str, out_len)` instead. Also consider asserting 
that the context error is set (and resetting it) since the new overflow path 
sets an error message.
   ```suggestion
     EXPECT_STREQ(out_str, "");
     EXPECT_TRUE(ctx.has_error());
   ```



##########
cpp/src/gandiva/precompiled/string_ops.cc:
##########
@@ -1924,9 +1924,19 @@ const char* quote_utf8(gdv_int64 context, const char* 
in, gdv_int32 in_len,
     *out_len = 0;
     return "";
   }
+
+  int32_t alloc_length = 0;
+  // Check overflow: 2 * in_len
+  if (ARROW_PREDICT_FALSE(
+          arrow::internal::MultiplyWithOverflow(2, in_len, &alloc_length))) {
+    gdv_fn_context_set_error_msg(context, "Would overflow maximum output 
size");
+    *out_len = 0;
+    return "";
+  }
+
   // try to allocate double size output string (worst case)
   auto out =
-      reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, (in_len * 
2) + 2));
+      reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
alloc_length + 2));
   if (out == nullptr) {

Review Comment:
   `quote_utf8` checks overflow for `2 * in_len`, but the allocation size is 
`alloc_length + 2`. For boundary values (e.g., `in_len == INT32_MAX/2`), `2 * 
in_len` fits in `int32_t` while `+ 2` overflows, which can lead to a 
negative/incorrect allocation size being passed to 
`gdv_fn_context_arena_malloc`. Please include the `+ 2` in the overflow-safe 
size computation (or separately check that `alloc_length <= INT32_MAX - 2`).



##########
cpp/src/gandiva/gdv_string_function_stubs.cc:
##########
@@ -686,10 +732,11 @@ const char* translate_utf8_utf8_utf8(int64_t context, 
const char* in, int32_t in
         }
       }
     }
-  } else {  // If there are no multibytes in the input, work with std::strings
+  } else {
+    // If there are no multibytes in the input, work with std::strings

Review Comment:
   The comment in the `has_multi_byte` branch is inverted: this `else` executes 
when multibyte UTF-8 is present, but the comment says "If there are no 
multibytes". Please correct the comment to match the control flow to avoid 
confusion during future maintenance.
   ```suggestion
       // If there are multibytes in the input, work with std::strings
   ```



##########
cpp/src/gandiva/precompiled/string_ops.cc:
##########
@@ -2829,8 +2839,17 @@ const char* to_hex_binary(int64_t context, const char* 
text, int32_t text_len,
     return "";
   }
 
+  int32_t alloc_length = 0;
+  // Check overflow: 2 * text_len
+  if (ARROW_PREDICT_FALSE(
+          arrow::internal::MultiplyWithOverflow(2, text_len, &alloc_length))) {
+    gdv_fn_context_set_error_msg(context, "Would overflow maximum output 
size");
+    *out_len = 0;
+    return "";
+  }
+
   auto ret =
-      reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, text_len * 
2 + 1));
+      reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
alloc_length + 1));
 

Review Comment:
   `to_hex_binary` only special-cases `text_len == 0`. If `text_len` is 
negative, the overflow check may not trigger and `alloc_length` can become 
negative, which then gets passed to `gdv_fn_context_arena_malloc` (takes an 
`int32_t size`) and can result in invalid/huge allocations or crashes. Treat 
negative lengths as invalid (or at least as empty) by changing the guard to 
`text_len <= 0` (and ideally setting an error for negative input, consistent 
with other functions).



##########
cpp/src/gandiva/precompiled/string_ops_test.cc:
##########
@@ -2498,6 +2503,11 @@ TEST(TestStringOps, TestToHex) {
   output = std::string(out_str, out_len);
   EXPECT_EQ(out_len, 2 * in_len);
   EXPECT_EQ(output, "090A090A090A090A0A0A092061206C657474405D6572");
+
+  int32_t bad_in_len = std::numeric_limits<int32_t>::max() / 2 + 20;
+  out_str = to_hex_binary(ctx_ptr, binary_string, bad_in_len, &out_len);
+  EXPECT_EQ(out_len, 0);
+  EXPECT_EQ(out_str, "");

Review Comment:
   These assertions compare `const char*` pointers (`EXPECT_EQ(out_str, "")`) 
rather than string contents, which can be non-deterministic. Use 
`EXPECT_STREQ(out_str, "")` or compare `std::string(out_str, out_len)` instead, 
and consider asserting/resetting the context error since the overflow path sets 
an error message.
   ```suggestion
     EXPECT_STREQ(out_str, "");
     EXPECT_TRUE(ctx.has_error());
     ctx.Reset();
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to