anthonylouisbsb commented on code in PR #13497: URL: https://github.com/apache/arrow/pull/13497#discussion_r913892153
########## cpp/src/gandiva/tests/projector_test.cc: ########## @@ -2907,4 +2907,125 @@ TEST_F(TestProjector, TestTranslate) { // Validate results EXPECT_ARROW_ARRAY_EQUALS(exp_translate, outputs.at(0)); } +TEST_F(TestProjector, TestENCODEFunction) { + // schema for input fields + auto field0 = field("f0", arrow::utf8()); + auto field1 = field("f1", arrow::utf8()); + auto schema = arrow::schema({field0, field1}); + + // output fields + auto field_base = field("encoder", arrow::utf8()); + + // Build expression + auto encoder = TreeExprBuilder::MakeExpression("encoder", {field0, field1}, field_base); + + std::shared_ptr<Projector> projector; + + auto status = Projector::Make(schema, {encoder}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()) << status.message(); + + // Create a row-batch with some sample data + int num_records = 4; + + auto array = MakeArrowArrayUtf8({"A", "B", "C", "D"}, {true, true, true, true}); + + auto array1 = MakeArrowArrayUtf8({"UTF-16BE", "UTF-16BE", "UTF-16BE", "UTF-16BE"}, + {true, true, true, true}); + + auto exp_encoder = + MakeArrowArrayUtf8({"0041", "0042", "0043", "0044"}, {true, true, true, true}); + + auto in_batch0 = arrow::RecordBatch::Make(schema, num_records, {array, array1}); + + arrow::ArrayVector outputs0; + + // Evaluate expression + status = projector->Evaluate(*in_batch0, pool_, &outputs0); + EXPECT_TRUE(status.ok()); + + EXPECT_ARROW_ARRAY_EQUALS(exp_encoder, outputs0.at(0)); + + std::shared_ptr<Projector> projector1; + + status = Projector::Make(schema, {encoder}, TestConfiguration(), &projector1); + + auto array2 = MakeArrowArrayUtf8({"A", "B", "C", "D"}, {true, true, true, true}); + + auto array3 = MakeArrowArrayUtf8({"UTF-16LE", "UTF-16LE", "UTF-16LE", "UTF-16LE"}, + {true, true, true, true}); + + auto exp_encoder1 = + MakeArrowArrayUtf8({"4100", "4200", "4300", "4400"}, {true, true, true, true}); + + auto in_batch1 = arrow::RecordBatch::Make(schema, num_records, {array2, array3}); + + arrow::ArrayVector outputs1; + + // Evaluate expression + status = projector1->Evaluate(*in_batch1, pool_, &outputs1); + EXPECT_TRUE(status.ok()); + + EXPECT_ARROW_ARRAY_EQUALS(exp_encoder1, outputs1.at(0)); +} + +TEST_F(TestProjector, TestDECODEFunction) { Review Comment: I ran that function locally(in Linux) and it is not working: ``` Value of: (exp_decoder)->Equals(outputs.at(0), arrow::EqualOptions().nans_equal(true)) Actual: false Expected: true expected array: [ "A", "B", "C", "D" ] actual array: [ "0030003000340031", "0030003000340032", "0030003000340033", "0030003000340034" ] ``` ########## cpp/src/gandiva/gdv_string_function_stubs.cc: ########## @@ -264,6 +264,356 @@ const char* gdv_fn_lower_utf8(int64_t context, const char* data, int32_t data_le return out; } +//converts utf8 to utf16 encoding +GANDIVA_EXPORT +const char* gdv_fn_encode(int64_t context, const char* data, int32_t data_len, + const char* charset, int32_t charset_len, int32_t* out_len) { + if (data_len == 0) { + *out_len = 0; + return ""; + } + + char* out = reinterpret_cast<char*>( + gdv_fn_context_arena_malloc(context, 32 * data_len)); + if (out == nullptr) { + gdv_fn_context_set_error_msg(context, + "Could not allocate memory for output string"); + *out_len = 0; + return ""; + } + + int32_t char_len; + uint32_t char_codepoint; + Review Comment: You can test to check if your code has format problems locally following that command: https://github.com/apache/arrow/blob/3d6240c1ee7802829d2ed209f4135906e9413915/docker-compose.yml#L1650 ########## cpp/src/gandiva/gdv_string_function_stubs.cc: ########## @@ -264,6 +264,356 @@ const char* gdv_fn_lower_utf8(int64_t context, const char* data, int32_t data_le return out; } +//converts utf8 to utf16 encoding +GANDIVA_EXPORT +const char* gdv_fn_encode(int64_t context, const char* data, int32_t data_len, + const char* charset, int32_t charset_len, int32_t* out_len) { + if (data_len == 0) { + *out_len = 0; + return ""; + } + + char* out = reinterpret_cast<char*>( + gdv_fn_context_arena_malloc(context, 32 * data_len)); + if (out == nullptr) { + gdv_fn_context_set_error_msg(context, + "Could not allocate memory for output string"); + *out_len = 0; + return ""; + } + + int32_t char_len; + uint32_t char_codepoint; + + + if (memcmp(charset, "UTF-16BE", 8) == 0) { + char *tmp=out; + + for (int32_t i = 0; i < data_len; i += char_len) { + + + char_len = gdv_fn_utf8_char_length(data[i]); + + if (char_len == 1) { + + int32_t input = static_cast<int32_t>(data[i]); + + char hex_string[20]; + sprintf(hex_string, "%.4X", input); + + memcpy(tmp, hex_string, 4); + tmp+=4; + }else if (char_len == 2) { + + const auto* in_char = (const uint8_t*)(data + i); + + // Decode the multibyte character + bool is_valid_utf8_char = + arrow::util::UTF8Decode((const uint8_t**)&in_char, &char_codepoint); + + // If it is an invalid utf8 character, UTF8Decode evaluates to false + if (!is_valid_utf8_char) { + gdv_fn_set_error_for_invalid_utf8(context, data[i]); + *out_len = 0; + return ""; + } + + // int32_t input = static_cast<int32_t>(char_codepoint); + + char hex_string[20]; + sprintf(hex_string, "%.4X", char_codepoint); + + memcpy(tmp, hex_string, 4); + tmp+=4; + }else if (char_len == 3) { + + const auto* in_char = (const uint8_t*)(data + i); + + // Decode the multibyte character + bool is_valid_utf8_char = + arrow::util::UTF8Decode((const uint8_t**)&in_char, &char_codepoint); + + // If it is an invalid utf8 character, UTF8Decode evaluates to false + if (!is_valid_utf8_char) { + gdv_fn_set_error_for_invalid_utf8(context, data[i]); + *out_len = 0; + return ""; + } + + // int32_t input = static_cast<int32_t>(char_codepoint); + + char hex_string[20]; + sprintf(hex_string, "%.8X", char_codepoint); + + memcpy(tmp, hex_string, 8); + tmp+=8; + }else if(char_len==4) { + + const auto* in_char = (const uint8_t*)(data + i); + + // Decode the multibyte character + bool is_valid_utf8_char = + arrow::util::UTF8Decode((const uint8_t**)&in_char, &char_codepoint); + + // If it is an invalid utf8 character, UTF8Decode evaluates to false + if (!is_valid_utf8_char) { + gdv_fn_set_error_for_invalid_utf8(context, data[i]); + *out_len = 0; + return ""; + } + + // int32_t input = static_cast<int32_t>(char_codepoint); + + char hex_string[20]; + sprintf(hex_string, "%.8X", char_codepoint); + + memcpy(tmp, hex_string, 8); + tmp+=8; + } + } + *out_len=static_cast<int32_t>(tmp-out); + return out; + }else if (memcmp (charset, "UTF-16LE", 8) == 0) { + char *tmp = out; + + for (int32_t i = 0; i < data_len; i += char_len) { + + + char_len = gdv_fn_utf8_char_length(data[i]); + + if (char_len == 1) { + + int32_t input = static_cast<int32_t>(data[i]); + + char hex_string[20]; + sprintf(hex_string, "%.4X", input); + + for(int32_t xx = 0; xx < 4; xx += 4 ){ + + char pp = hex_string[xx]; + hex_string[xx] = hex_string[xx+2]; + hex_string[xx+2] = pp; + + pp = hex_string[xx+1]; + hex_string[xx+1] = hex_string[xx+3]; + hex_string[xx+3] = pp; + } + + memcpy(tmp, hex_string, 4); + tmp+=4; + }else if (char_len == 2) { + + const auto* in_char = (const uint8_t*)(data + i); + + // Decode the multibyte character + bool is_valid_utf8_char = + arrow::util::UTF8Decode((const uint8_t**)&in_char, &char_codepoint); + + // If it is an invalid utf8 character, UTF8Decode evaluates to false + if (!is_valid_utf8_char) { + gdv_fn_set_error_for_invalid_utf8(context, data[i]); + *out_len = 0; + return ""; + } + + // int32_t input = static_cast<int32_t>(char_codepoint); + + char hex_string[20]; + sprintf(hex_string, "%.4X", char_codepoint); + + for(int32_t xx = 0; xx < 4 ; xx += 4 ) { + + char pp = hex_string[xx]; + hex_string[xx] = hex_string[xx+2]; + hex_string[xx+2] = pp; + + pp = hex_string[xx+1]; + hex_string[xx+1] = hex_string[xx+3]; + hex_string[xx+3] = pp; + } + + memcpy(tmp, hex_string, 4); + tmp+=4; + }else if( char_len == 3) { + + const auto* in_char = (const uint8_t*)(data + i); + + // Decode the multibyte character + bool is_valid_utf8_char = + arrow::util::UTF8Decode((const uint8_t**)&in_char, &char_codepoint); + + // If it is an invalid utf8 character, UTF8Decode evaluates to false + if (!is_valid_utf8_char) { + gdv_fn_set_error_for_invalid_utf8(context, data[i]); + *out_len = 0; + return ""; + } + + // int32_t input = static_cast<int32_t>(char_codepoint); + + char hex_string[20]; + sprintf(hex_string, "%.8X", char_codepoint); + + for(int32_t xx = 0 ; xx < 8 ; xx += 4) { + + + char pp = hex_string[xx]; + hex_string[xx] = hex_string[xx+2]; + hex_string[xx+2] = pp; + + pp = hex_string[xx+1]; + hex_string[xx+1] = hex_string[xx+3]; + hex_string[xx+3] = pp; + + // std::swap(hex_string[xx],hex_string[xx+2]); + // std::swap(hex_string[xx+1],hex_string[xx+3]); + } + + memcpy(tmp, hex_string, 8); + tmp+=8; + }else if ( char_len == 4 ) { + + const auto* in_char = (const uint8_t*)(data + i); + + // Decode the multibyte character + bool is_valid_utf8_char = + arrow::util::UTF8Decode((const uint8_t**)&in_char, &char_codepoint); + + // If it is an invalid utf8 character, UTF8Decode evaluates to false + if (!is_valid_utf8_char) { + gdv_fn_set_error_for_invalid_utf8(context, data[i]); + *out_len = 0; + return ""; + } + + // Convert the encoded codepoint to its uppercase codepoint + // int32_t input = static_cast<int32_t>(char_codepoint); + + char hex_string[20]; + sprintf(hex_string, "%.8X", char_codepoint); + + for(int32_t xx = 0 ; xx < 8 ; xx += 4) { + + + char pp = hex_string[xx]; + hex_string[xx] = hex_string[xx+2]; + hex_string[xx+2] = pp; + + pp = hex_string[xx+1]; + hex_string[xx+1] = hex_string[xx+3]; + hex_string[xx+3] = pp; + + // std::swap(hex_string[xx],hex_string[xx+2]); + // std::swap(hex_string[xx+1],hex_string[xx+3]); Review Comment: Remember to remove the code that is not being used by the function ########## cpp/src/gandiva/gdv_string_function_stubs.cc: ########## @@ -264,6 +264,356 @@ const char* gdv_fn_lower_utf8(int64_t context, const char* data, int32_t data_le return out; } +//converts utf8 to utf16 encoding +GANDIVA_EXPORT +const char* gdv_fn_encode(int64_t context, const char* data, int32_t data_len, + const char* charset, int32_t charset_len, int32_t* out_len) { + if (data_len == 0) { + *out_len = 0; + return ""; + } + + char* out = reinterpret_cast<char*>( + gdv_fn_context_arena_malloc(context, 32 * data_len)); + if (out == nullptr) { + gdv_fn_context_set_error_msg(context, + "Could not allocate memory for output string"); + *out_len = 0; + return ""; + } + + int32_t char_len; + uint32_t char_codepoint; + + + if (memcmp(charset, "UTF-16BE", 8) == 0) { + char *tmp=out; + + for (int32_t i = 0; i < data_len; i += char_len) { + + + char_len = gdv_fn_utf8_char_length(data[i]); + + if (char_len == 1) { + + int32_t input = static_cast<int32_t>(data[i]); + + char hex_string[20]; + sprintf(hex_string, "%.4X", input); + + memcpy(tmp, hex_string, 4); + tmp+=4; Review Comment: I think you can use the snprintf function directly in the output buffer instead of creating a temporary one and calling memcpy. I will put the logic below: ```cpp auto index_written = 0 auto num_bytes = sprintf(hex_string + index_written, "%.4X", input); index_written += num_bytes; ``` ########## cpp/src/gandiva/gdv_string_function_stubs.cc: ########## @@ -264,6 +264,356 @@ const char* gdv_fn_lower_utf8(int64_t context, const char* data, int32_t data_le return out; } +//converts utf8 to utf16 encoding +GANDIVA_EXPORT +const char* gdv_fn_encode(int64_t context, const char* data, int32_t data_len, + const char* charset, int32_t charset_len, int32_t* out_len) { + if (data_len == 0) { + *out_len = 0; + return ""; + } + + char* out = reinterpret_cast<char*>( + gdv_fn_context_arena_malloc(context, 32 * data_len)); + if (out == nullptr) { + gdv_fn_context_set_error_msg(context, + "Could not allocate memory for output string"); + *out_len = 0; + return ""; + } + + int32_t char_len; + uint32_t char_codepoint; + + + if (memcmp(charset, "UTF-16BE", 8) == 0) { + char *tmp=out; + + for (int32_t i = 0; i < data_len; i += char_len) { + + + char_len = gdv_fn_utf8_char_length(data[i]); + + if (char_len == 1) { + + int32_t input = static_cast<int32_t>(data[i]); + + char hex_string[20]; + sprintf(hex_string, "%.4X", input); + + memcpy(tmp, hex_string, 4); + tmp+=4; Review Comment: We used the logic to make the output buffer of the hash function: https://github.com/apache/arrow/blob/3d6240c1ee7802829d2ed209f4135906e9413915/cpp/src/gandiva/hash_utils.cc#L129 ########## cpp/src/gandiva/gdv_string_function_stubs.cc: ########## @@ -264,6 +264,356 @@ const char* gdv_fn_lower_utf8(int64_t context, const char* data, int32_t data_le return out; } +//converts utf8 to utf16 encoding +GANDIVA_EXPORT +const char* gdv_fn_encode(int64_t context, const char* data, int32_t data_len, + const char* charset, int32_t charset_len, int32_t* out_len) { + if (data_len == 0) { + *out_len = 0; + return ""; + } + + char* out = reinterpret_cast<char*>( + gdv_fn_context_arena_malloc(context, 32 * data_len)); + if (out == nullptr) { + gdv_fn_context_set_error_msg(context, + "Could not allocate memory for output string"); + *out_len = 0; + return ""; + } + + int32_t char_len; + uint32_t char_codepoint; + Review Comment: The code seems to have some format problems(it can give errors on some CI steps). To fix it, install clang-format v12 in your machine and run that command: ``` python3 [PATH-TO_ARROW]/arrow/cpp/build-support/run_clang_format.py --clang_format_binary /usr/bin/clang-format-12 --exclude_globs [PATH_TO_ARROW]/arrow/cpp/build-support/lint_exclusions.txt --source_dir [PATH-TO_ARROW]/arrow/cpp/src --fix ``` Change the PATH_TO_ARROW to the location of your arrow repository inside your machine. And the `/usr/bin/clang-format-12` is the path were clang-format is installed in Linux machines, maybe for Mac and Windows the path is different ########## cpp/src/gandiva/gdv_string_function_stubs.cc: ########## @@ -264,6 +264,356 @@ const char* gdv_fn_lower_utf8(int64_t context, const char* data, int32_t data_le return out; } +//converts utf8 to utf16 encoding +GANDIVA_EXPORT +const char* gdv_fn_encode(int64_t context, const char* data, int32_t data_len, + const char* charset, int32_t charset_len, int32_t* out_len) { + if (data_len == 0) { + *out_len = 0; + return ""; + } + + char* out = reinterpret_cast<char*>( + gdv_fn_context_arena_malloc(context, 32 * data_len)); + if (out == nullptr) { + gdv_fn_context_set_error_msg(context, + "Could not allocate memory for output string"); + *out_len = 0; + return ""; + } + + int32_t char_len; + uint32_t char_codepoint; + + + if (memcmp(charset, "UTF-16BE", 8) == 0) { Review Comment: There is a way to compare it in a case-insensitive way? If the user passes `utf-16be` , will it work? ########## cpp/src/gandiva/function_registry_string.cc: ########## @@ -515,7 +515,15 @@ std::vector<NativeFunction> GetStringFunctionRegistry() { NativeFunction("translate", {}, DataTypeVector{utf8(), utf8(), utf8()}, utf8(), kResultNullIfNull, "translate_utf8_utf8_utf8", - NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)}; + NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), + + NativeFunction("encoder", {}, DataTypeVector{utf8(), utf8()}, utf8(), + kResultNullIfNull, "gdv_fn_encode", + NativeFunction::kNeedsContext), + + NativeFunction("decoder", {}, DataTypeVector{utf8(), utf8()}, utf8(), + kResultNullIfNull, "gdv_fn_encode", + NativeFunction::kNeedsContext)}; Review Comment: Should the name of the functions be equal to the Hive ones? In HIVE the functions names are `encode` and `decode` ########## cpp/src/gandiva/gdv_string_function_stubs.cc: ########## @@ -264,6 +264,356 @@ const char* gdv_fn_lower_utf8(int64_t context, const char* data, int32_t data_le return out; } +//converts utf8 to utf16 encoding +GANDIVA_EXPORT +const char* gdv_fn_encode(int64_t context, const char* data, int32_t data_len, + const char* charset, int32_t charset_len, int32_t* out_len) { + if (data_len == 0) { + *out_len = 0; + return ""; + } + + char* out = reinterpret_cast<char*>( + gdv_fn_context_arena_malloc(context, 32 * data_len)); + if (out == nullptr) { + gdv_fn_context_set_error_msg(context, + "Could not allocate memory for output string"); + *out_len = 0; + return ""; + } + + int32_t char_len; + uint32_t char_codepoint; + + + if (memcmp(charset, "UTF-16BE", 8) == 0) { + char *tmp=out; + + for (int32_t i = 0; i < data_len; i += char_len) { + + + char_len = gdv_fn_utf8_char_length(data[i]); + + if (char_len == 1) { + + int32_t input = static_cast<int32_t>(data[i]); + + char hex_string[20]; + sprintf(hex_string, "%.4X", input); + + memcpy(tmp, hex_string, 4); + tmp+=4; + }else if (char_len == 2) { + + const auto* in_char = (const uint8_t*)(data + i); + + // Decode the multibyte character + bool is_valid_utf8_char = + arrow::util::UTF8Decode((const uint8_t**)&in_char, &char_codepoint); + + // If it is an invalid utf8 character, UTF8Decode evaluates to false + if (!is_valid_utf8_char) { + gdv_fn_set_error_for_invalid_utf8(context, data[i]); + *out_len = 0; + return ""; + } + + // int32_t input = static_cast<int32_t>(char_codepoint); + + char hex_string[20]; + sprintf(hex_string, "%.4X", char_codepoint); + + memcpy(tmp, hex_string, 4); + tmp+=4; + }else if (char_len == 3) { + + const auto* in_char = (const uint8_t*)(data + i); + + // Decode the multibyte character + bool is_valid_utf8_char = + arrow::util::UTF8Decode((const uint8_t**)&in_char, &char_codepoint); + + // If it is an invalid utf8 character, UTF8Decode evaluates to false + if (!is_valid_utf8_char) { + gdv_fn_set_error_for_invalid_utf8(context, data[i]); + *out_len = 0; + return ""; + } + + // int32_t input = static_cast<int32_t>(char_codepoint); + + char hex_string[20]; + sprintf(hex_string, "%.8X", char_codepoint); + + memcpy(tmp, hex_string, 8); + tmp+=8; + }else if(char_len==4) { + + const auto* in_char = (const uint8_t*)(data + i); + + // Decode the multibyte character + bool is_valid_utf8_char = + arrow::util::UTF8Decode((const uint8_t**)&in_char, &char_codepoint); + + // If it is an invalid utf8 character, UTF8Decode evaluates to false + if (!is_valid_utf8_char) { + gdv_fn_set_error_for_invalid_utf8(context, data[i]); + *out_len = 0; + return ""; + } + + // int32_t input = static_cast<int32_t>(char_codepoint); + + char hex_string[20]; + sprintf(hex_string, "%.8X", char_codepoint); + + memcpy(tmp, hex_string, 8); + tmp+=8; + } + } + *out_len=static_cast<int32_t>(tmp-out); + return out; + }else if (memcmp (charset, "UTF-16LE", 8) == 0) { + char *tmp = out; + + for (int32_t i = 0; i < data_len; i += char_len) { + + + char_len = gdv_fn_utf8_char_length(data[i]); + + if (char_len == 1) { + + int32_t input = static_cast<int32_t>(data[i]); + + char hex_string[20]; + sprintf(hex_string, "%.4X", input); + + for(int32_t xx = 0; xx < 4; xx += 4 ){ + + char pp = hex_string[xx]; + hex_string[xx] = hex_string[xx+2]; + hex_string[xx+2] = pp; + + pp = hex_string[xx+1]; + hex_string[xx+1] = hex_string[xx+3]; + hex_string[xx+3] = pp; + } + + memcpy(tmp, hex_string, 4); + tmp+=4; + }else if (char_len == 2) { + + const auto* in_char = (const uint8_t*)(data + i); + + // Decode the multibyte character + bool is_valid_utf8_char = + arrow::util::UTF8Decode((const uint8_t**)&in_char, &char_codepoint); + + // If it is an invalid utf8 character, UTF8Decode evaluates to false + if (!is_valid_utf8_char) { + gdv_fn_set_error_for_invalid_utf8(context, data[i]); + *out_len = 0; + return ""; + } + + // int32_t input = static_cast<int32_t>(char_codepoint); Review Comment: Always remember to remove the code that is not being used. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org