UTF-8)

GitBox Tue, 05 Jul 2022 08:39:33 -0700


anthonylouisbsb commented on code in PR #13497:
URL: https://github.com/apache/arrow/pull/13497#discussion_r913892153



##########
cpp/src/gandiva/tests/projector_test.cc:
##########
@@ -2907,4 +2907,125 @@ TEST_F(TestProjector, TestTranslate) {
   // Validate results
   EXPECT_ARROW_ARRAY_EQUALS(exp_translate, outputs.at(0));
 }
+TEST_F(TestProjector, TestENCODEFunction) {
+  // schema for input fields
+  auto field0 = field("f0", arrow::utf8());
+  auto field1 = field("f1", arrow::utf8());
+  auto schema = arrow::schema({field0, field1});
+
+  // output fields
+  auto field_base = field("encoder", arrow::utf8());
+
+  // Build expression
+  auto encoder = TreeExprBuilder::MakeExpression("encoder", {field0, field1}, 
field_base);
+
+  std::shared_ptr<Projector> projector;
+
+  auto status = Projector::Make(schema, {encoder}, TestConfiguration(), 
&projector);
+  EXPECT_TRUE(status.ok()) << status.message();
+
+  // Create a row-batch with some sample data
+  int num_records = 4;
+
+  auto array = MakeArrowArrayUtf8({"A", "B", "C", "D"}, {true, true, true, 
true});
+
+  auto array1 = MakeArrowArrayUtf8({"UTF-16BE", "UTF-16BE", "UTF-16BE", 
"UTF-16BE"},
+                                   {true, true, true, true});
+
+  auto exp_encoder =
+      MakeArrowArrayUtf8({"0041", "0042", "0043", "0044"}, {true, true, true, 
true});
+
+  auto in_batch0 = arrow::RecordBatch::Make(schema, num_records, {array, 
array1});
+
+  arrow::ArrayVector outputs0;
+
+  // Evaluate expression
+  status = projector->Evaluate(*in_batch0, pool_, &outputs0);
+  EXPECT_TRUE(status.ok());
+
+  EXPECT_ARROW_ARRAY_EQUALS(exp_encoder, outputs0.at(0));
+
+  std::shared_ptr<Projector> projector1;
+
+  status = Projector::Make(schema, {encoder}, TestConfiguration(), 
&projector1);
+
+  auto array2 = MakeArrowArrayUtf8({"A", "B", "C", "D"}, {true, true, true, 
true});
+
+  auto array3 = MakeArrowArrayUtf8({"UTF-16LE", "UTF-16LE", "UTF-16LE", 
"UTF-16LE"},
+                                   {true, true, true, true});
+
+  auto exp_encoder1 =
+      MakeArrowArrayUtf8({"4100", "4200", "4300", "4400"}, {true, true, true, 
true});
+
+  auto in_batch1 = arrow::RecordBatch::Make(schema, num_records, {array2, 
array3});
+
+  arrow::ArrayVector outputs1;
+
+  // Evaluate expression
+  status = projector1->Evaluate(*in_batch1, pool_, &outputs1);
+  EXPECT_TRUE(status.ok());
+
+  EXPECT_ARROW_ARRAY_EQUALS(exp_encoder1, outputs1.at(0));
+}
+
+TEST_F(TestProjector, TestDECODEFunction) {

Review Comment:
   I ran that function locally(in Linux) and it is not working:
   ```
   Value of: (exp_decoder)->Equals(outputs.at(0), 
arrow::EqualOptions().nans_equal(true))
     Actual: false
   Expected: true
   expected array: [
     "A",
     "B",
     "C",
     "D"
   ] actual array: [
     "0030003000340031",
     "0030003000340032",
     "0030003000340033",
     "0030003000340034"
   ]
   ```



##########
cpp/src/gandiva/gdv_string_function_stubs.cc:
##########
@@ -264,6 +264,356 @@ const char* gdv_fn_lower_utf8(int64_t context, const 
char* data, int32_t data_le
   return out;
 }
 
+//converts utf8 to utf16 encoding
+GANDIVA_EXPORT
+const char* gdv_fn_encode(int64_t context, const char* data, int32_t data_len,
+                          const char* charset, int32_t charset_len, int32_t* 
out_len) { 
+  if (data_len == 0) {
+    *out_len = 0;
+    return "";
+  }
+  
+  char* out = reinterpret_cast<char*>(
+    gdv_fn_context_arena_malloc(context, 32 * data_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context,
+    "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  int32_t char_len;
+  uint32_t char_codepoint;
+  

Review Comment:
   You can test to check if your code has format problems locally following 
that command: 
https://github.com/apache/arrow/blob/3d6240c1ee7802829d2ed209f4135906e9413915/docker-compose.yml#L1650



##########
cpp/src/gandiva/gdv_string_function_stubs.cc:
##########
@@ -264,6 +264,356 @@ const char* gdv_fn_lower_utf8(int64_t context, const 
char* data, int32_t data_le
   return out;
 }
 
+//converts utf8 to utf16 encoding
+GANDIVA_EXPORT
+const char* gdv_fn_encode(int64_t context, const char* data, int32_t data_len,
+                          const char* charset, int32_t charset_len, int32_t* 
out_len) { 
+  if (data_len == 0) {
+    *out_len = 0;
+    return "";
+  }
+  
+  char* out = reinterpret_cast<char*>(
+    gdv_fn_context_arena_malloc(context, 32 * data_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context,
+    "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  int32_t char_len;
+  uint32_t char_codepoint;
+  
+
+  if (memcmp(charset, "UTF-16BE", 8) == 0) {
+    char *tmp=out;
+
+    for (int32_t i = 0; i < data_len; i += char_len) {
+      
+      
+      char_len = gdv_fn_utf8_char_length(data[i]);
+
+      if (char_len == 1) {
+            
+            int32_t input = static_cast<int32_t>(data[i]);
+
+            char hex_string[20];
+            sprintf(hex_string, "%.4X", input);
+
+            memcpy(tmp, hex_string, 4);
+            tmp+=4;
+      }else if (char_len == 2) {
+
+            const auto* in_char = (const uint8_t*)(data + i);
+
+            // Decode the multibyte character
+            bool is_valid_utf8_char =
+            arrow::util::UTF8Decode((const uint8_t**)&in_char, 
&char_codepoint);
+
+            // If it is an invalid utf8 character, UTF8Decode evaluates to 
false
+            if (!is_valid_utf8_char) {
+            gdv_fn_set_error_for_invalid_utf8(context, data[i]);
+            *out_len = 0;
+            return "";
+            }
+
+            // int32_t input = static_cast<int32_t>(char_codepoint);
+
+            char hex_string[20];
+            sprintf(hex_string, "%.4X", char_codepoint);
+
+            memcpy(tmp, hex_string, 4);
+            tmp+=4;
+        }else if (char_len == 3) {
+
+            const auto* in_char = (const uint8_t*)(data + i);
+
+            // Decode the multibyte character
+            bool is_valid_utf8_char =
+            arrow::util::UTF8Decode((const uint8_t**)&in_char, 
&char_codepoint);
+
+            // If it is an invalid utf8 character, UTF8Decode evaluates to 
false
+            if (!is_valid_utf8_char) {
+            gdv_fn_set_error_for_invalid_utf8(context, data[i]);
+            *out_len = 0;
+            return "";
+            }
+
+            // int32_t input = static_cast<int32_t>(char_codepoint);
+
+            char hex_string[20];
+            sprintf(hex_string, "%.8X", char_codepoint);
+
+            memcpy(tmp, hex_string, 8);
+            tmp+=8;
+        }else if(char_len==4) {
+
+            const auto* in_char = (const uint8_t*)(data + i);
+
+            // Decode the multibyte character
+            bool is_valid_utf8_char =
+            arrow::util::UTF8Decode((const uint8_t**)&in_char, 
&char_codepoint);
+
+            // If it is an invalid utf8 character, UTF8Decode evaluates to 
false
+            if (!is_valid_utf8_char) {
+            gdv_fn_set_error_for_invalid_utf8(context, data[i]);
+            *out_len = 0;
+            return "";
+            }
+
+            // int32_t input = static_cast<int32_t>(char_codepoint);
+
+            char hex_string[20];
+            sprintf(hex_string, "%.8X", char_codepoint);
+
+            memcpy(tmp, hex_string, 8);
+            tmp+=8;
+        }
+    }
+    *out_len=static_cast<int32_t>(tmp-out);
+    return out;
+  }else if (memcmp (charset, "UTF-16LE", 8) == 0) {
+    char *tmp = out;
+
+    for (int32_t i = 0; i < data_len; i += char_len) {
+      
+      
+      char_len = gdv_fn_utf8_char_length(data[i]);
+
+      if (char_len == 1) {
+            
+            int32_t input = static_cast<int32_t>(data[i]);
+
+            char hex_string[20];
+            sprintf(hex_string, "%.4X", input);
+
+            for(int32_t xx = 0; xx < 4; xx += 4 ){
+
+                char pp = hex_string[xx];
+                hex_string[xx] = hex_string[xx+2];
+                hex_string[xx+2] = pp;
+
+                pp = hex_string[xx+1];
+                hex_string[xx+1] = hex_string[xx+3];
+                hex_string[xx+3] = pp;
+            }
+
+            memcpy(tmp, hex_string, 4);
+            tmp+=4;
+        }else if (char_len == 2) {
+
+            const auto* in_char = (const uint8_t*)(data + i);
+
+            // Decode the multibyte character
+            bool is_valid_utf8_char =
+            arrow::util::UTF8Decode((const uint8_t**)&in_char, 
&char_codepoint);
+
+            // If it is an invalid utf8 character, UTF8Decode evaluates to 
false
+            if (!is_valid_utf8_char) {
+            gdv_fn_set_error_for_invalid_utf8(context, data[i]);
+            *out_len = 0;
+            return "";
+            }
+
+            // int32_t input = static_cast<int32_t>(char_codepoint);
+
+            char hex_string[20];
+            sprintf(hex_string, "%.4X", char_codepoint);
+
+            for(int32_t xx = 0; xx < 4 ; xx += 4 ) {
+
+                char pp = hex_string[xx];
+                hex_string[xx] = hex_string[xx+2];
+                hex_string[xx+2] = pp;
+
+                pp = hex_string[xx+1];
+                hex_string[xx+1] = hex_string[xx+3];
+                hex_string[xx+3] = pp;
+            }
+
+            memcpy(tmp, hex_string, 4);
+            tmp+=4;
+        }else if( char_len == 3) {
+
+            const auto* in_char = (const uint8_t*)(data + i);
+
+            // Decode the multibyte character
+            bool is_valid_utf8_char =
+            arrow::util::UTF8Decode((const uint8_t**)&in_char, 
&char_codepoint);
+
+            // If it is an invalid utf8 character, UTF8Decode evaluates to 
false
+            if (!is_valid_utf8_char) {
+            gdv_fn_set_error_for_invalid_utf8(context, data[i]);
+            *out_len = 0;
+            return "";
+            }
+
+            // int32_t input = static_cast<int32_t>(char_codepoint);
+
+            char hex_string[20];
+            sprintf(hex_string, "%.8X", char_codepoint);
+
+            for(int32_t xx = 0 ; xx < 8 ; xx += 4) {
+
+
+                char pp = hex_string[xx];
+                hex_string[xx] = hex_string[xx+2];
+                hex_string[xx+2] = pp;
+
+                pp = hex_string[xx+1];
+                hex_string[xx+1] = hex_string[xx+3];
+                hex_string[xx+3] = pp;
+
+                // std::swap(hex_string[xx],hex_string[xx+2]);
+                // std::swap(hex_string[xx+1],hex_string[xx+3]);
+            }
+
+            memcpy(tmp, hex_string, 8);
+            tmp+=8;
+        }else if ( char_len == 4 ) {
+
+            const auto* in_char = (const uint8_t*)(data + i);
+
+            // Decode the multibyte character
+            bool is_valid_utf8_char =
+            arrow::util::UTF8Decode((const uint8_t**)&in_char, 
&char_codepoint);
+
+            // If it is an invalid utf8 character, UTF8Decode evaluates to 
false
+            if (!is_valid_utf8_char) {
+            gdv_fn_set_error_for_invalid_utf8(context, data[i]);
+            *out_len = 0;
+            return "";
+            }
+
+            // Convert the encoded codepoint to its uppercase codepoint
+            // int32_t input = static_cast<int32_t>(char_codepoint);
+
+            char hex_string[20];
+            sprintf(hex_string, "%.8X", char_codepoint);
+
+            for(int32_t xx = 0 ; xx < 8 ; xx += 4) {
+
+
+                char pp = hex_string[xx];
+                hex_string[xx] = hex_string[xx+2];
+                hex_string[xx+2] = pp;
+
+                pp = hex_string[xx+1];
+                hex_string[xx+1] = hex_string[xx+3];
+                hex_string[xx+3] = pp;
+
+                // std::swap(hex_string[xx],hex_string[xx+2]);
+                // std::swap(hex_string[xx+1],hex_string[xx+3]);

Review Comment:
   Remember to remove the code that is not being used by the function



##########
cpp/src/gandiva/gdv_string_function_stubs.cc:
##########
@@ -264,6 +264,356 @@ const char* gdv_fn_lower_utf8(int64_t context, const 
char* data, int32_t data_le
   return out;
 }
 
+//converts utf8 to utf16 encoding
+GANDIVA_EXPORT
+const char* gdv_fn_encode(int64_t context, const char* data, int32_t data_len,
+                          const char* charset, int32_t charset_len, int32_t* 
out_len) { 
+  if (data_len == 0) {
+    *out_len = 0;
+    return "";
+  }
+  
+  char* out = reinterpret_cast<char*>(
+    gdv_fn_context_arena_malloc(context, 32 * data_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context,
+    "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  int32_t char_len;
+  uint32_t char_codepoint;
+  
+
+  if (memcmp(charset, "UTF-16BE", 8) == 0) {
+    char *tmp=out;
+
+    for (int32_t i = 0; i < data_len; i += char_len) {
+      
+      
+      char_len = gdv_fn_utf8_char_length(data[i]);
+
+      if (char_len == 1) {
+            
+            int32_t input = static_cast<int32_t>(data[i]);
+
+            char hex_string[20];
+            sprintf(hex_string, "%.4X", input);
+
+            memcpy(tmp, hex_string, 4);
+            tmp+=4;

Review Comment:
   I think you can use the snprintf function directly in the output buffer 
instead of creating a temporary one and calling memcpy. I will put the logic 
below:
   ```cpp
   auto index_written = 0
   
   auto num_bytes = sprintf(hex_string + index_written, "%.4X", input);
   
   index_written += num_bytes;
   ```



##########
cpp/src/gandiva/gdv_string_function_stubs.cc:
##########
@@ -264,6 +264,356 @@ const char* gdv_fn_lower_utf8(int64_t context, const 
char* data, int32_t data_le
   return out;
 }
 
+//converts utf8 to utf16 encoding
+GANDIVA_EXPORT
+const char* gdv_fn_encode(int64_t context, const char* data, int32_t data_len,
+                          const char* charset, int32_t charset_len, int32_t* 
out_len) { 
+  if (data_len == 0) {
+    *out_len = 0;
+    return "";
+  }
+  
+  char* out = reinterpret_cast<char*>(
+    gdv_fn_context_arena_malloc(context, 32 * data_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context,
+    "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  int32_t char_len;
+  uint32_t char_codepoint;
+  
+
+  if (memcmp(charset, "UTF-16BE", 8) == 0) {
+    char *tmp=out;
+
+    for (int32_t i = 0; i < data_len; i += char_len) {
+      
+      
+      char_len = gdv_fn_utf8_char_length(data[i]);
+
+      if (char_len == 1) {
+            
+            int32_t input = static_cast<int32_t>(data[i]);
+
+            char hex_string[20];
+            sprintf(hex_string, "%.4X", input);
+
+            memcpy(tmp, hex_string, 4);
+            tmp+=4;

Review Comment:
   We used the logic to make the output buffer of the hash function: 
https://github.com/apache/arrow/blob/3d6240c1ee7802829d2ed209f4135906e9413915/cpp/src/gandiva/hash_utils.cc#L129



##########
cpp/src/gandiva/gdv_string_function_stubs.cc:
##########
@@ -264,6 +264,356 @@ const char* gdv_fn_lower_utf8(int64_t context, const 
char* data, int32_t data_le
   return out;
 }
 
+//converts utf8 to utf16 encoding
+GANDIVA_EXPORT
+const char* gdv_fn_encode(int64_t context, const char* data, int32_t data_len,
+                          const char* charset, int32_t charset_len, int32_t* 
out_len) { 
+  if (data_len == 0) {
+    *out_len = 0;
+    return "";
+  }
+  
+  char* out = reinterpret_cast<char*>(
+    gdv_fn_context_arena_malloc(context, 32 * data_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context,
+    "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  int32_t char_len;
+  uint32_t char_codepoint;
+  

Review Comment:
   The code seems to have some format problems(it can give errors on some CI 
steps). To fix it, install clang-format v12 in your machine and run that 
command:
   ```
   python3 [PATH-TO_ARROW]/arrow/cpp/build-support/run_clang_format.py 
--clang_format_binary /usr/bin/clang-format-12 --exclude_globs 
[PATH_TO_ARROW]/arrow/cpp/build-support/lint_exclusions.txt --source_dir 
[PATH-TO_ARROW]/arrow/cpp/src --fix
   ```
   
   Change the PATH_TO_ARROW to the location of your arrow repository inside 
your machine. And the `/usr/bin/clang-format-12` is the path were clang-format 
is installed in Linux machines, maybe for Mac and Windows the path is different
   



##########
cpp/src/gandiva/gdv_string_function_stubs.cc:
##########
@@ -264,6 +264,356 @@ const char* gdv_fn_lower_utf8(int64_t context, const 
char* data, int32_t data_le
   return out;
 }
 
+//converts utf8 to utf16 encoding
+GANDIVA_EXPORT
+const char* gdv_fn_encode(int64_t context, const char* data, int32_t data_len,
+                          const char* charset, int32_t charset_len, int32_t* 
out_len) { 
+  if (data_len == 0) {
+    *out_len = 0;
+    return "";
+  }
+  
+  char* out = reinterpret_cast<char*>(
+    gdv_fn_context_arena_malloc(context, 32 * data_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context,
+    "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  int32_t char_len;
+  uint32_t char_codepoint;
+  
+
+  if (memcmp(charset, "UTF-16BE", 8) == 0) {

Review Comment:
   There is a way to compare it in a case-insensitive way? If the user passes 
`utf-16be` , will it work?



##########
cpp/src/gandiva/function_registry_string.cc:
##########
@@ -515,7 +515,15 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
 
       NativeFunction("translate", {}, DataTypeVector{utf8(), utf8(), utf8()}, 
utf8(),
                      kResultNullIfNull, "translate_utf8_utf8_utf8",
-                     NativeFunction::kNeedsContext | 
NativeFunction::kCanReturnErrors)};
+                     NativeFunction::kNeedsContext | 
NativeFunction::kCanReturnErrors),
+                     
+      NativeFunction("encoder", {}, DataTypeVector{utf8(), utf8()}, utf8(),
+                     kResultNullIfNull, "gdv_fn_encode",
+                     NativeFunction::kNeedsContext),
+                     
+      NativeFunction("decoder", {}, DataTypeVector{utf8(), utf8()}, utf8(),
+                     kResultNullIfNull, "gdv_fn_encode",
+                     NativeFunction::kNeedsContext)};

Review Comment:
   Should the name of the functions be equal to the Hive ones? In HIVE the 
functions names are `encode` and `decode`



##########
cpp/src/gandiva/gdv_string_function_stubs.cc:
##########
@@ -264,6 +264,356 @@ const char* gdv_fn_lower_utf8(int64_t context, const 
char* data, int32_t data_le
   return out;
 }
 
+//converts utf8 to utf16 encoding
+GANDIVA_EXPORT
+const char* gdv_fn_encode(int64_t context, const char* data, int32_t data_len,
+                          const char* charset, int32_t charset_len, int32_t* 
out_len) { 
+  if (data_len == 0) {
+    *out_len = 0;
+    return "";
+  }
+  
+  char* out = reinterpret_cast<char*>(
+    gdv_fn_context_arena_malloc(context, 32 * data_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context,
+    "Could not allocate memory for output string");
+    *out_len = 0;
+    return "";
+  }
+
+  int32_t char_len;
+  uint32_t char_codepoint;
+  
+
+  if (memcmp(charset, "UTF-16BE", 8) == 0) {
+    char *tmp=out;
+
+    for (int32_t i = 0; i < data_len; i += char_len) {
+      
+      
+      char_len = gdv_fn_utf8_char_length(data[i]);
+
+      if (char_len == 1) {
+            
+            int32_t input = static_cast<int32_t>(data[i]);
+
+            char hex_string[20];
+            sprintf(hex_string, "%.4X", input);
+
+            memcpy(tmp, hex_string, 4);
+            tmp+=4;
+      }else if (char_len == 2) {
+
+            const auto* in_char = (const uint8_t*)(data + i);
+
+            // Decode the multibyte character
+            bool is_valid_utf8_char =
+            arrow::util::UTF8Decode((const uint8_t**)&in_char, 
&char_codepoint);
+
+            // If it is an invalid utf8 character, UTF8Decode evaluates to 
false
+            if (!is_valid_utf8_char) {
+            gdv_fn_set_error_for_invalid_utf8(context, data[i]);
+            *out_len = 0;
+            return "";
+            }
+
+            // int32_t input = static_cast<int32_t>(char_codepoint);
+
+            char hex_string[20];
+            sprintf(hex_string, "%.4X", char_codepoint);
+
+            memcpy(tmp, hex_string, 4);
+            tmp+=4;
+        }else if (char_len == 3) {
+
+            const auto* in_char = (const uint8_t*)(data + i);
+
+            // Decode the multibyte character
+            bool is_valid_utf8_char =
+            arrow::util::UTF8Decode((const uint8_t**)&in_char, 
&char_codepoint);
+
+            // If it is an invalid utf8 character, UTF8Decode evaluates to 
false
+            if (!is_valid_utf8_char) {
+            gdv_fn_set_error_for_invalid_utf8(context, data[i]);
+            *out_len = 0;
+            return "";
+            }
+
+            // int32_t input = static_cast<int32_t>(char_codepoint);
+
+            char hex_string[20];
+            sprintf(hex_string, "%.8X", char_codepoint);
+
+            memcpy(tmp, hex_string, 8);
+            tmp+=8;
+        }else if(char_len==4) {
+
+            const auto* in_char = (const uint8_t*)(data + i);
+
+            // Decode the multibyte character
+            bool is_valid_utf8_char =
+            arrow::util::UTF8Decode((const uint8_t**)&in_char, 
&char_codepoint);
+
+            // If it is an invalid utf8 character, UTF8Decode evaluates to 
false
+            if (!is_valid_utf8_char) {
+            gdv_fn_set_error_for_invalid_utf8(context, data[i]);
+            *out_len = 0;
+            return "";
+            }
+
+            // int32_t input = static_cast<int32_t>(char_codepoint);
+
+            char hex_string[20];
+            sprintf(hex_string, "%.8X", char_codepoint);
+
+            memcpy(tmp, hex_string, 8);
+            tmp+=8;
+        }
+    }
+    *out_len=static_cast<int32_t>(tmp-out);
+    return out;
+  }else if (memcmp (charset, "UTF-16LE", 8) == 0) {
+    char *tmp = out;
+
+    for (int32_t i = 0; i < data_len; i += char_len) {
+      
+      
+      char_len = gdv_fn_utf8_char_length(data[i]);
+
+      if (char_len == 1) {
+            
+            int32_t input = static_cast<int32_t>(data[i]);
+
+            char hex_string[20];
+            sprintf(hex_string, "%.4X", input);
+
+            for(int32_t xx = 0; xx < 4; xx += 4 ){
+
+                char pp = hex_string[xx];
+                hex_string[xx] = hex_string[xx+2];
+                hex_string[xx+2] = pp;
+
+                pp = hex_string[xx+1];
+                hex_string[xx+1] = hex_string[xx+3];
+                hex_string[xx+3] = pp;
+            }
+
+            memcpy(tmp, hex_string, 4);
+            tmp+=4;
+        }else if (char_len == 2) {
+
+            const auto* in_char = (const uint8_t*)(data + i);
+
+            // Decode the multibyte character
+            bool is_valid_utf8_char =
+            arrow::util::UTF8Decode((const uint8_t**)&in_char, 
&char_codepoint);
+
+            // If it is an invalid utf8 character, UTF8Decode evaluates to 
false
+            if (!is_valid_utf8_char) {
+            gdv_fn_set_error_for_invalid_utf8(context, data[i]);
+            *out_len = 0;
+            return "";
+            }
+
+            // int32_t input = static_cast<int32_t>(char_codepoint);

Review Comment:
   Always remember to remove the code that is not being used.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

[GitHub] [arrow] anthonylouisbsb commented on a diff in pull request #13497: ARROW-16970: [C++][Gandiva] ENCODE(UTF-8 -> UTF-16BE and UTF-8 -> UTF-16LE) and DECODE(UTF-16BE -> UTF-8 and UTF-16LE -> UTF-8)

Reply via email to