vvellanki commented on a change in pull request #11551:
URL: https://github.com/apache/arrow/pull/11551#discussion_r762833466
##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -832,6 +850,211 @@ const char* gdv_fn_initcap_utf8(int64_t context, const
char* data, int32_t data_
*out_len = out_idx;
return out;
}
+
+GANDIVA_EXPORT
+const char* gdv_mask_first_n_utf8_int32(int64_t context, const char* data,
+ int32_t data_len, int32_t n_to_mask,
+ int32_t* out_len) {
+ if (data_len <= 0) {
+ *out_len = 0;
+ return nullptr;
+ }
+
+ if (n_to_mask > data_len) {
+ n_to_mask = data_len;
+ }
+
+ *out_len = data_len;
+
+ if (n_to_mask <= 0) {
+ return data;
+ }
+
+ char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context,
*out_len));
+ if (out == nullptr) {
+ gdv_fn_context_set_error_msg(context, "Could not allocate memory for
output string");
+ *out_len = 0;
+ return nullptr;
+ }
+
+ int bytes_masked;
+ for (bytes_masked = 0; bytes_masked < n_to_mask; bytes_masked++) {
+ unsigned char char_single_byte = data[bytes_masked];
+ if (char_single_byte > 127) {
+ // found a multi-byte utf-8 char
+ break;
+ }
+ out[bytes_masked] = mask_array[char_single_byte];
+ }
+
+ int chars_masked = bytes_masked;
+ int out_idx = bytes_masked;
+
+ // Handle multibyte utf8 characters
+ utf8proc_int32_t utf8_char;
+ while ((chars_masked < n_to_mask) && (bytes_masked < data_len)) {
+ auto char_len =
+ utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t*>(data +
bytes_masked),
+ data_len, &utf8_char);
+
+ if (char_len < 0) {
+ gdv_fn_context_set_error_msg(context, utf8proc_errmsg(char_len));
+ *out_len = 0;
+ return nullptr;
+ }
+
+ switch (utf8proc_category(utf8_char)) {
+ case 1:
+ out[out_idx] = 'X';
+ out_idx++;
+ break;
+ case 2:
+ out[out_idx] = 'x';
+ out_idx++;
+ break;
+ case 9:
+ out[out_idx] = 'n';
+ out_idx++;
+ break;
+ case 10:
+ out[out_idx] = 'n';
+ out_idx++;
+ break;
+ default:
+ memcpy(out + out_idx, data + bytes_masked, char_len);
+ out_idx += static_cast<int>(char_len);
+ break;
+ }
+ bytes_masked += static_cast<int>(char_len);
+ chars_masked++;
+ }
+
+ // Correct the out_len after masking multibyte characters with single byte
characters
+ *out_len = *out_len - (bytes_masked - out_idx);
+
+ if (bytes_masked < data_len) {
+ memcpy(out + out_idx, data + bytes_masked, data_len - bytes_masked);
+ }
+
+ return out;
+}
+
+GANDIVA_EXPORT
+const char* gdv_mask_last_n_utf8_int32(int64_t context, const char* data,
+ int32_t data_len, int32_t n_to_mask,
+ int32_t* out_len) {
+ if (data_len <= 0) {
+ *out_len = 0;
+ return nullptr;
+ }
+
+ if (n_to_mask > data_len) {
+ n_to_mask = data_len;
+ }
+
+ *out_len = data_len;
+
+ if (n_to_mask <= 0) {
+ return data;
+ }
+
+ char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context,
*out_len));
+ if (out == nullptr) {
+ gdv_fn_context_set_error_msg(context, "Could not allocate memory for
output string");
+ *out_len = 0;
+ return nullptr;
+ }
+
+ bool has_multi_byte = false;
+ for (int i = 0; i < data_len; i++) {
+ unsigned char char_single_byte = data[i];
+ if (char_single_byte > 127) {
+ // found a multi-byte utf-8 char
+ has_multi_byte = true;
+ break;
+ }
+ }
+
+ if (!has_multi_byte) {
+ int start_idx = data_len - n_to_mask;
+ memcpy(out, data, start_idx);
+ for (int i = start_idx; i < data_len; ++i) {
+ unsigned char char_single_byte = data[i];
+ out[i] = mask_array[char_single_byte];
+ }
+ *out_len = data_len;
+ return out;
+ }
+
+ utf8proc_int32_t utf8_char_buffer;
+ int num_of_chars = static_cast<int>(
+ utf8proc_decompose(reinterpret_cast<const utf8proc_uint8_t*>(data),
data_len,
+ &utf8_char_buffer, 4, UTF8PROC_STABLE));
+ utf8proc_int32_t utf8_char;
+ int chars_counter = 0;
+ int bytes_read = 0;
+ while ((bytes_read < data_len) && (chars_counter < (num_of_chars -
n_to_mask))) {
+ auto char_len =
+ utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t*>(data +
bytes_read),
+ data_len, &utf8_char);
+
+ if (char_len < 0) {
+ gdv_fn_context_set_error_msg(context, utf8proc_errmsg(char_len));
+ *out_len = 0;
+ return nullptr;
+ }
+
+ chars_counter++;
+ bytes_read += static_cast<int>(char_len);
+ }
+
+ int out_idx = bytes_read;
+ int offset_idx = bytes_read;
+
+ // Populate the first chars, that are not masked
+ memcpy(out, data, offset_idx);
+
+ while (bytes_read < data_len) {
+ auto char_len =
+ utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t*>(data +
bytes_read),
+ data_len, &utf8_char);
+
+ if (char_len < 0) {
+ gdv_fn_context_set_error_msg(context, utf8proc_errmsg(char_len));
+ *out_len = 0;
+ return nullptr;
+ }
+
+ switch (utf8proc_category(utf8_char)) {
+ case 1:
+ out[out_idx] = 'X';
+ out_idx++;
+ break;
+ case 2:
+ out[out_idx] = 'x';
+ out_idx++;
+ break;
+ case 9:
+ out[out_idx] = 'n';
+ out_idx++;
+ break;
+ case 10:
+ out[out_idx] = 'n';
+ out_idx++;
+ break;
+ default:
+ memcpy(out + out_idx, data + bytes_read, char_len);
+ out_idx += static_cast<int>(char_len);
+ break;
+ }
+ bytes_read += static_cast<int>(char_len);
+ chars_counter++;
Review comment:
chars_counter is no longer used, so there is no point in updating this variable now.
##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -832,6 +850,211 @@ const char* gdv_fn_initcap_utf8(int64_t context, const
char* data, int32_t data_
*out_len = out_idx;
return out;
}
+
+GANDIVA_EXPORT
+const char* gdv_mask_first_n_utf8_int32(int64_t context, const char* data,
+ int32_t data_len, int32_t n_to_mask,
+ int32_t* out_len) {
+ if (data_len <= 0) {
+ *out_len = 0;
+ return nullptr;
+ }
+
+ if (n_to_mask > data_len) {
+ n_to_mask = data_len;
+ }
+
+ *out_len = data_len;
+
+ if (n_to_mask <= 0) {
+ return data;
+ }
+
+ char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context,
*out_len));
+ if (out == nullptr) {
+ gdv_fn_context_set_error_msg(context, "Could not allocate memory for
output string");
+ *out_len = 0;
+ return nullptr;
+ }
+
+ int bytes_masked;
+ for (bytes_masked = 0; bytes_masked < n_to_mask; bytes_masked++) {
+ unsigned char char_single_byte = data[bytes_masked];
+ if (char_single_byte > 127) {
+ // found a multi-byte utf-8 char
+ break;
+ }
+ out[bytes_masked] = mask_array[char_single_byte];
+ }
+
+ int chars_masked = bytes_masked;
+ int out_idx = bytes_masked;
+
+ // Handle multibyte utf8 characters
+ utf8proc_int32_t utf8_char;
+ while ((chars_masked < n_to_mask) && (bytes_masked < data_len)) {
+ auto char_len =
+ utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t*>(data +
bytes_masked),
+ data_len, &utf8_char);
+
+ if (char_len < 0) {
+ gdv_fn_context_set_error_msg(context, utf8proc_errmsg(char_len));
+ *out_len = 0;
+ return nullptr;
+ }
+
+ switch (utf8proc_category(utf8_char)) {
+ case 1:
+ out[out_idx] = 'X';
+ out_idx++;
+ break;
+ case 2:
+ out[out_idx] = 'x';
+ out_idx++;
+ break;
+ case 9:
+ out[out_idx] = 'n';
+ out_idx++;
+ break;
+ case 10:
+ out[out_idx] = 'n';
+ out_idx++;
+ break;
+ default:
+ memcpy(out + out_idx, data + bytes_masked, char_len);
+ out_idx += static_cast<int>(char_len);
+ break;
+ }
+ bytes_masked += static_cast<int>(char_len);
+ chars_masked++;
+ }
+
+ // Correct the out_len after masking multibyte characters with single byte
characters
+ *out_len = *out_len - (bytes_masked - out_idx);
+
+ if (bytes_masked < data_len) {
+ memcpy(out + out_idx, data + bytes_masked, data_len - bytes_masked);
+ }
+
+ return out;
+}
+
+GANDIVA_EXPORT
+const char* gdv_mask_last_n_utf8_int32(int64_t context, const char* data,
+ int32_t data_len, int32_t n_to_mask,
+ int32_t* out_len) {
+ if (data_len <= 0) {
+ *out_len = 0;
+ return nullptr;
+ }
+
+ if (n_to_mask > data_len) {
+ n_to_mask = data_len;
+ }
+
+ *out_len = data_len;
+
+ if (n_to_mask <= 0) {
+ return data;
+ }
+
+ char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context,
*out_len));
+ if (out == nullptr) {
+ gdv_fn_context_set_error_msg(context, "Could not allocate memory for
output string");
+ *out_len = 0;
+ return nullptr;
+ }
+
+ bool has_multi_byte = false;
+ for (int i = 0; i < data_len; i++) {
+ unsigned char char_single_byte = data[i];
+ if (char_single_byte > 127) {
+ // found a multi-byte utf-8 char
+ has_multi_byte = true;
+ break;
+ }
+ }
+
+ if (!has_multi_byte) {
+ int start_idx = data_len - n_to_mask;
+ memcpy(out, data, start_idx);
+ for (int i = start_idx; i < data_len; ++i) {
+ unsigned char char_single_byte = data[i];
+ out[i] = mask_array[char_single_byte];
+ }
+ *out_len = data_len;
+ return out;
+ }
+
+ utf8proc_int32_t utf8_char_buffer;
+ int num_of_chars = static_cast<int>(
+ utf8proc_decompose(reinterpret_cast<const utf8proc_uint8_t*>(data),
data_len,
+ &utf8_char_buffer, 4, UTF8PROC_STABLE));
+ utf8proc_int32_t utf8_char;
+ int chars_counter = 0;
+ int bytes_read = 0;
+ while ((bytes_read < data_len) && (chars_counter < (num_of_chars -
n_to_mask))) {
+ auto char_len =
+ utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t*>(data +
bytes_read),
+ data_len, &utf8_char);
+
+ if (char_len < 0) {
+ gdv_fn_context_set_error_msg(context, utf8proc_errmsg(char_len));
+ *out_len = 0;
+ return nullptr;
+ }
+
+ chars_counter++;
+ bytes_read += static_cast<int>(char_len);
+ }
+
+ int out_idx = bytes_read;
+ int offset_idx = bytes_read;
+
+ // Populate the first chars, that are not masked
+ memcpy(out, data, offset_idx);
+
+ while (bytes_read < data_len) {
+ auto char_len =
+ utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t*>(data +
bytes_read),
+ data_len, &utf8_char);
+
+ if (char_len < 0) {
Review comment:
Is this required? This indicates invalid input - my guess is that if the
input is not UTF-8, the call to utf8proc_decompose will fail with some error.
Can you check what utf8proc_decompose does if the input is not UTF-8? And
can utf8proc_decompose return a negative number?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]