rkavanap commented on a change in pull request #11049:
URL: https://github.com/apache/arrow/pull/11049#discussion_r741133878
##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1762,6 +1762,41 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context,
const char* text,
out_len);
}
+// Returns the quoted string (Includes escape character for any single quotes)
+// E.g. DONT -> 'DONT'
+// DON'T -> 'DON\'T'
+FORCE_INLINE
+const char* quote_utf8(gdv_int64 context, const char* in, gdv_int32 in_len,
+ gdv_int32* out_len) {
+ if (in_len <= 0) {
+ *out_len = 0;
+ return "";
+ }
+ // try to allocate double size output string (worst case)
+ auto out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context,
in_len * 2));
Review comment:
Shouldn't this be in_len * 2 + 2 for the worst case of the loop (every input character escaped, plus the two enclosing quotes)?
##########
File path: cpp/src/gandiva/precompiled/string_ops_test.cc
##########
@@ -912,6 +912,33 @@ TEST(TestStringOps, TestReverse) {
ctx.Reset();
}
+TEST(TestStringOps, TestQuote) {
+ gandiva::ExecutionContext ctx;
+ uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+ gdv_int32 out_len = 0;
+ const char* out_str;
+
+ out_str = quote_utf8(ctx_ptr, "dont", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "\'dont\'");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = quote_utf8(ctx_ptr, "abc", 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "\'abc\'");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = quote_utf8(ctx_ptr, "don't", 5, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "\'don\\'t\'");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = quote_utf8(ctx_ptr, "", 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = quote_utf8(ctx_ptr, "'", 1, &out_len);
Review comment:
This is a good test. Maybe add another one like it with more quotes, e.g.
"''''''''''''''''''''''"?
##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1762,6 +1762,41 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context,
const char* text,
out_len);
}
+// Returns the quoted string (Includes escape character for any single quotes)
+// E.g. DONT -> 'DONT'
+// DON'T -> 'DON\'T'
+FORCE_INLINE
+const char* quote_utf8(gdv_int64 context, const char* in, gdv_int32 in_len,
+ gdv_int32* out_len) {
+ if (in_len <= 0) {
+ *out_len = 0;
+ return "";
+ }
+ // try to allocate double size output string (worst case)
+ auto out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context,
in_len * 2));
+ if (out == nullptr) {
+ gdv_fn_context_set_error_msg(context, "Could not allocate memory for
output string");
+ *out_len = 0;
+ return "";
+ }
+ // The output string should start with a single quote
+ out[0] = '\'';
+ gdv_int32 counter = 1;
+ for (int i = 0; i < in_len; i++) {
+ if (memcmp(in + i, "'", 1) == 0) {
+ out[counter] = '\\';
+ counter++;
+ out[counter] = '\'';
+ } else {
+ out[counter] = in[i];
+ }
+ counter++;
+ }
+ out[counter] = '\'';
Review comment:
Won't this overflow when the input consists entirely of quotes? Say we have 2
quotes and in_len is 2: we allocate only 4 bytes, but this logic needs 6
bytes for the output '\'\'' (opening quote, two escaped quotes, closing quote).
##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1762,6 +1762,41 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context,
const char* text,
out_len);
}
+// Returns the quoted string (Includes escape character for any single quotes)
+// E.g. DONT -> 'DONT'
+// DON'T -> 'DON\'T'
+FORCE_INLINE
+const char* quote_utf8(gdv_int64 context, const char* in, gdv_int32 in_len,
+ gdv_int32* out_len) {
+ if (in_len <= 0) {
+ *out_len = 0;
+ return "";
+ }
+ // try to allocate double size output string (worst case)
+ auto out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context,
in_len * 2));
Review comment:
Shouldn't this be in_len * 2 + 2 for the worst case of the loop? Or does
in_len include the null terminator of the string?
##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1762,6 +1762,41 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context,
const char* text,
out_len);
}
+// Returns the quoted string (Includes escape character for any single quotes)
+// E.g. DONT -> 'DONT'
+// DON'T -> 'DON\'T'
+FORCE_INLINE
+const char* quote_utf8(gdv_int64 context, const char* in, gdv_int32 in_len,
+ gdv_int32* out_len) {
+ if (in_len <= 0) {
+ *out_len = 0;
+ return "";
+ }
+ // try to allocate double size output string (worst case)
+ auto out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context,
in_len * 2));
+ if (out == nullptr) {
+ gdv_fn_context_set_error_msg(context, "Could not allocate memory for
output string");
+ *out_len = 0;
+ return "";
+ }
+ // The output string should start with a single quote
+ out[0] = '\'';
+ gdv_int32 counter = 1;
+ for (int i = 0; i < in_len; i++) {
+ if (memcmp(in + i, "'", 1) == 0) {
+ out[counter] = '\\';
+ counter++;
+ out[counter] = '\'';
+ } else {
+ out[counter] = in[i];
+ }
+ counter++;
+ }
+ out[counter] = '\'';
Review comment:
Won't this overflow when the input consists entirely of quotes? Say we have 2
quotes and in_len is 2: we allocate only 4 bytes, but this logic needs 6
bytes for the output '\'\'' (opening quote, two escaped quotes, closing quote).
##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1762,6 +1762,41 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context,
const char* text,
out_len);
}
+// Returns the quoted string (Includes escape character for any single quotes)
+// E.g. DONT -> 'DONT'
+// DON'T -> 'DON\'T'
+FORCE_INLINE
+const char* quote_utf8(gdv_int64 context, const char* in, gdv_int32 in_len,
+ gdv_int32* out_len) {
+ if (in_len <= 0) {
+ *out_len = 0;
+ return "";
+ }
+ // try to allocate double size output string (worst case)
+ auto out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context,
in_len * 2));
Review comment:
Shouldn't this be in_len * 2 + 2 for the worst case of the loop? Or does
in_len include the null terminator of the string? Actually, I see, at least in
the unit test, that in_len does not include the null terminator of the string.
##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1762,6 +1762,41 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context,
const char* text,
out_len);
}
+// Returns the quoted string (Includes escape character for any single quotes)
+// E.g. DONT -> 'DONT'
+// DON'T -> 'DON\'T'
+FORCE_INLINE
+const char* quote_utf8(gdv_int64 context, const char* in, gdv_int32 in_len,
+ gdv_int32* out_len) {
+ if (in_len <= 0) {
+ *out_len = 0;
+ return "";
+ }
+ // try to allocate double size output string (worst case)
+ auto out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context,
in_len * 2));
+ if (out == nullptr) {
+ gdv_fn_context_set_error_msg(context, "Could not allocate memory for
output string");
+ *out_len = 0;
+ return "";
+ }
+ // The output string should start with a single quote
+ out[0] = '\'';
+ gdv_int32 counter = 1;
+ for (int i = 0; i < in_len; i++) {
+ if (memcmp(in + i, "'", 1) == 0) {
+ out[counter] = '\\';
+ counter++;
+ out[counter] = '\'';
+ } else {
+ out[counter] = in[i];
+ }
+ counter++;
+ }
+ out[counter] = '\'';
Review comment:
Won't this overflow for a quote-only string? Say we have a string consisting
of a single quote, so in_len is 1: then out has length 2, but counter could reach 3.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]