vvellanki commented on a change in pull request #11551:
URL: https://github.com/apache/arrow/pull/11551#discussion_r740942484



##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -794,6 +795,83 @@ const char* gdv_fn_initcap_utf8(int64_t context, const 
char* data, int32_t data_
   *out_len = out_idx;
   return out;
 }
+
+GANDIVA_EXPORT
+const char* gdv_fn_mask_first_n(int64_t context, const char* data, int32_t 
data_len,
+                                int32_t n_to_mask, int32_t* out_len) {
+  if (data_len <= 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  std::string str(data, data_len);

Review comment:
       This looks to be doing a copy of the input data. Why do we need to copy 
the input?

##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -794,6 +795,83 @@ const char* gdv_fn_initcap_utf8(int64_t context, const 
char* data, int32_t data_
   *out_len = out_idx;
   return out;
 }
+
+GANDIVA_EXPORT
+const char* gdv_fn_mask_first_n(int64_t context, const char* data, int32_t 
data_len,
+                                int32_t n_to_mask, int32_t* out_len) {
+  if (data_len <= 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  std::string str(data, data_len);

Review comment:
       I would implement this as follows:
   1) Allocate memory for the output
   2) memcpy input to output n_to_mask number of bytes
   3) Do the masking
   
   This implementation has 2 memcpy's of the incoming data

##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -794,6 +795,83 @@ const char* gdv_fn_initcap_utf8(int64_t context, const 
char* data, int32_t data_
   *out_len = out_idx;
   return out;
 }
+
+GANDIVA_EXPORT
+const char* gdv_fn_mask_first_n(int64_t context, const char* data, int32_t 
data_len,
+                                int32_t n_to_mask, int32_t* out_len) {
+  if (data_len <= 0) {
+    *out_len = 0;
+    return "";

Review comment:
       Should this return null instead?

##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -794,6 +795,83 @@ const char* gdv_fn_initcap_utf8(int64_t context, const 
char* data, int32_t data_
   *out_len = out_idx;
   return out;
 }
+
+GANDIVA_EXPORT
+const char* gdv_fn_mask_first_n(int64_t context, const char* data, int32_t 
data_len,
+                                int32_t n_to_mask, int32_t* out_len) {
+  if (data_len <= 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  std::string str(data, data_len);
+  int32_t counter = 0;
+  for (char& c : str) {
+    if (n_to_mask > 0 && counter < n_to_mask && isalnum(c)) {
+      if (isdigit(c)) {
+        c = 'n';
+      } else {
+        if (isupper(c)) {
+          c = 'X';
+        } else {
+          c = 'x';
+        }
+      }
+      counter++;
+    }
+  }
+
+  *out_len = static_cast<int32_t>(str.size());
+
+  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
*out_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
+    *out_len = 0;
+    return "";
+  }
+
+  memcpy(out, str.c_str(), *out_len);
+  return out;
+}
+
+GANDIVA_EXPORT
+const char* gdv_fn_mask_last_n(int64_t context, const char* data, int32_t 
data_len,

Review comment:
       Please change this to be similar to mask_first_n
   
   Use memcpy for the last_n; and the array for the digits that need masking

##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -794,6 +795,83 @@ const char* gdv_fn_initcap_utf8(int64_t context, const 
char* data, int32_t data_
   *out_len = out_idx;
   return out;
 }
+
+GANDIVA_EXPORT
+const char* gdv_fn_mask_first_n(int64_t context, const char* data, int32_t 
data_len,
+                                int32_t n_to_mask, int32_t* out_len) {
+  if (data_len <= 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  std::string str(data, data_len);
+  int32_t counter = 0;
+  for (char& c : str) {

Review comment:
       How about implementing this as follows:
   * Create an array of size 256 - mapping digits to 'n'; upper case to 'X'; 
and lower case to 'x'; and the remaining to null
   * Take the char value, index into the array
   * If you get a null, write input value into output
   * else write the value from the array into output
   
   This has only one if condition in the for loop and will be very fast

##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -794,6 +795,83 @@ const char* gdv_fn_initcap_utf8(int64_t context, const 
char* data, int32_t data_
   *out_len = out_idx;
   return out;
 }
+
+GANDIVA_EXPORT
+const char* gdv_fn_mask_first_n(int64_t context, const char* data, int32_t 
data_len,
+                                int32_t n_to_mask, int32_t* out_len) {
+  if (data_len <= 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  std::string str(data, data_len);
+  int32_t counter = 0;
+  for (char& c : str) {
+    if (n_to_mask > 0 && counter < n_to_mask && isalnum(c)) {
+      if (isdigit(c)) {
+        c = 'n';
+      } else {
+        if (isupper(c)) {
+          c = 'X';
+        } else {
+          c = 'x';
+        }
+      }
+      counter++;
+    }
+  }
+
+  *out_len = static_cast<int32_t>(str.size());
+
+  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
*out_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
+    *out_len = 0;
+    return "";
+  }
+
+  memcpy(out, str.c_str(), *out_len);
+  return out;
+}
+
+GANDIVA_EXPORT
+const char* gdv_fn_mask_last_n(int64_t context, const char* data, int32_t 
data_len,
+                               int32_t n_to_mask, int32_t* out_len) {
+  if (data_len <= 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  std::string str(data, data_len);
+  std::reverse(str.begin(), str.end());

Review comment:
       This is quite bad... why do we need to reverse? This is going to be poor 
in performance

##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -1599,5 +1677,31 @@ void ExportedStubFunctions::AddMappings(Engine* engine) 
const {
   engine->AddGlobalMappingForFunc("gdv_fn_initcap_utf8",
                                   types->i8_ptr_type() /*return_type*/, args,
                                   
reinterpret_cast<void*>(gdv_fn_initcap_utf8));
+
+  // gdv_fn_mask_first_n
+  args = {

Review comment:
        Call this args_to_mask_fns. You dont have to reinitialise this in line 
1695

##########
File path: cpp/src/gandiva/gdv_function_stubs_test.cc
##########
@@ -766,4 +766,40 @@ TEST(TestGdvFnStubs, TestCastVarbinaryFloat8) {
   ctx.Reset();
 }
 
+TEST(TestGdvFnStubs, TestMaskFirstN) {

Review comment:
       Did you check what Dremio does if mask_first_n() is passed a negative 
number as an argument for the number of digits to mask? Does the planner handle 
that? or should the implementation of the function handle this case?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to