This is an automated email from the ASF dual-hosted git repository. wesm pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 478f2f6254932cb59680c568c3dfff56185a1967 Author: Pindikura Ravindra <[email protected]> AuthorDate: Mon Oct 1 23:02:23 2018 +0530 [Gandiva] math functions, utf8_length - removed short-circuit for beginsWithPlusOne since it doesn't work with multi-byte characters in utf8 --- cpp/src/gandiva/function_registry.cc | 71 ++++++++++++-- cpp/src/gandiva/like_holder.cc | 14 --- cpp/src/gandiva/like_holder.h | 4 - cpp/src/gandiva/like_holder_test.cc | 7 +- cpp/src/gandiva/precompiled/CMakeLists.txt | 4 +- cpp/src/gandiva/precompiled/arithmetic_ops.cc | 4 +- cpp/src/gandiva/precompiled/context_helper.cc | 2 +- cpp/src/gandiva/precompiled/extended_math_ops.cc | 105 +++++++++++++++++++++ .../gandiva/precompiled/extended_math_ops_test.cc | 81 ++++++++++++++++ cpp/src/gandiva/precompiled/string_ops.cc | 61 ++++++++++-- cpp/src/gandiva/precompiled/string_ops_test.cc | 35 ++++--- cpp/src/gandiva/precompiled/time.cc | 5 +- cpp/src/gandiva/precompiled/types.h | 33 ++++++- cpp/src/gandiva/tests/projector_test.cc | 77 +++++++++++++++ cpp/src/gandiva/tests/utf8_test.cc | 18 ++-- 15 files changed, 457 insertions(+), 64 deletions(-) diff --git a/cpp/src/gandiva/function_registry.cc b/cpp/src/gandiva/function_registry.cc index 0f4d80b..6e5bc23 100644 --- a/cpp/src/gandiva/function_registry.cc +++ b/cpp/src/gandiva/function_registry.cc @@ -49,11 +49,19 @@ using std::vector; NativeFunction(#NAME, DataTypeVector{TYPE(), TYPE()}, TYPE(), RESULT_NULL_IF_NULL, \ STRINGIFY(NAME##_##TYPE##_##TYPE)) -// Divide fubnction -#define DIVIDE(NAME, TYPE) \ - NativeFunction(#NAME, DataTypeVector{TYPE(), TYPE()}, TYPE(), RESULT_NULL_INTERNAL, \ - STRINGIFY(NAME##_##TYPE##_##TYPE), false /* does not need holder */, \ - true /* can return error */) +// Binary functions that : +// - have the same input type for both params +// - NULL handling is of type NULL_INTERNAL +// - can return error. +// +// The pre-compiled fn name includes the base name & input type names. eg.
add_int32_int32 +#define BINARY_UNSAFE_NULL_INTERNAL(NAME, IN_TYPE, OUT_TYPE) \ + NativeFunction(#NAME, DataTypeVector{IN_TYPE(), IN_TYPE()}, OUT_TYPE(), \ + RESULT_NULL_INTERNAL, STRINGIFY(NAME##_##IN_TYPE##_##IN_TYPE), \ + false /* does not need holder */, true /* can return error */) + +// Divide function +#define DIVIDE(NAME, TYPE) BINARY_UNSAFE_NULL_INTERNAL(NAME, TYPE, TYPE) // Binary functions that : // - have different input types, or output type @@ -91,6 +99,15 @@ using std::vector; NativeFunction(#NAME, DataTypeVector{TYPE()}, boolean(), RESULT_NULL_NEVER, \ STRINGIFY(NAME##_##TYPE)) +// Unary functions that : +// - NULL handling is of type NULL_INTERNAL +// +// The pre-compiled fn name includes the base name & input type name. eg. castFloat_int32 +#define UNARY_UNSAFE_NULL_INTERNAL(NAME, IN_TYPE, OUT_TYPE) \ + NativeFunction(#NAME, DataTypeVector{IN_TYPE()}, OUT_TYPE(), RESULT_NULL_INTERNAL, \ + STRINGIFY(NAME##_##IN_TYPE), false /* does not need holder */, \ + true /* can return error */) + // Binary functions that : // - NULL handling is of type NULL_NEVER // @@ -193,6 +210,44 @@ NativeFunction FunctionRegistry::pc_registry_[] = { UNARY_SAFE_NULL_IF_NULL(castFLOAT8, float32, float64), UNARY_SAFE_NULL_IF_NULL(castDATE, int64, date64), + // extended math ops + UNARY_SAFE_NULL_IF_NULL(cbrt, int32, float64), + UNARY_SAFE_NULL_IF_NULL(cbrt, int64, float64), + UNARY_SAFE_NULL_IF_NULL(cbrt, uint32, float64), + UNARY_SAFE_NULL_IF_NULL(cbrt, uint64, float64), + UNARY_SAFE_NULL_IF_NULL(cbrt, float32, float64), + UNARY_SAFE_NULL_IF_NULL(cbrt, float64, float64), + + UNARY_SAFE_NULL_IF_NULL(exp, int32, float64), + UNARY_SAFE_NULL_IF_NULL(exp, int64, float64), + UNARY_SAFE_NULL_IF_NULL(exp, uint32, float64), + UNARY_SAFE_NULL_IF_NULL(exp, uint64, float64), + UNARY_SAFE_NULL_IF_NULL(exp, float32, float64), + UNARY_SAFE_NULL_IF_NULL(exp, float64, float64), + + UNARY_SAFE_NULL_IF_NULL(log, int32, float64), + UNARY_SAFE_NULL_IF_NULL(log, int64, float64), + 
UNARY_SAFE_NULL_IF_NULL(log, uint32, float64), + UNARY_SAFE_NULL_IF_NULL(log, uint64, float64), + UNARY_SAFE_NULL_IF_NULL(log, float32, float64), + UNARY_SAFE_NULL_IF_NULL(log, float64, float64), + + UNARY_SAFE_NULL_IF_NULL(log10, int32, float64), + UNARY_SAFE_NULL_IF_NULL(log10, int64, float64), + UNARY_SAFE_NULL_IF_NULL(log10, uint32, float64), + UNARY_SAFE_NULL_IF_NULL(log10, uint64, float64), + UNARY_SAFE_NULL_IF_NULL(log10, float32, float64), + UNARY_SAFE_NULL_IF_NULL(log10, float64, float64), + + BINARY_UNSAFE_NULL_INTERNAL(log, int32, float64), + BINARY_UNSAFE_NULL_INTERNAL(log, int64, float64), + BINARY_UNSAFE_NULL_INTERNAL(log, uint32, float64), + BINARY_UNSAFE_NULL_INTERNAL(log, uint64, float64), + BINARY_UNSAFE_NULL_INTERNAL(log, float32, float64), + BINARY_UNSAFE_NULL_INTERNAL(log, float64, float64), + + BINARY_SYMMETRIC_SAFE_NULL_IF_NULL(power, float64), + // nullable never operations NUMERIC_BOOL_DATE_TYPES(UNARY_SAFE_NULL_NEVER_BOOL, isnull), NUMERIC_BOOL_DATE_TYPES(UNARY_SAFE_NULL_NEVER_BOOL, isnotnull), @@ -346,6 +401,10 @@ NativeFunction FunctionRegistry::pc_registry_[] = { UNARY_SAFE_NULL_IF_NULL(octet_length, binary, int32), UNARY_SAFE_NULL_IF_NULL(bit_length, utf8, int32), UNARY_SAFE_NULL_IF_NULL(bit_length, binary, int32), + UNARY_UNSAFE_NULL_INTERNAL(char_length, utf8, int32), + UNARY_UNSAFE_NULL_INTERNAL(length, utf8, int32), + UNARY_UNSAFE_NULL_INTERNAL(lengthUtf8, binary, int32), + VAR_LEN_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, equal), VAR_LEN_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, not_equal), VAR_LEN_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, less_than), @@ -355,8 +414,6 @@ NativeFunction FunctionRegistry::pc_registry_[] = { BINARY_RELATIONAL_SAFE_NULL_IF_NULL(starts_with, utf8), BINARY_RELATIONAL_SAFE_NULL_IF_NULL(ends_with, utf8), - BINARY_RELATIONAL_SAFE_NULL_IF_NULL(starts_with_plus_one, utf8), - BINARY_RELATIONAL_SAFE_NULL_IF_NULL(ends_with_plus_one, utf8), NativeFunction("like", DataTypeVector{utf8(), utf8()}, boolean(), 
RESULT_NULL_IF_NULL, "like_utf8_utf8", true /*needs_holder*/), diff --git a/cpp/src/gandiva/like_holder.cc b/cpp/src/gandiva/like_holder.cc index 6c35c3a..b790fb3 100644 --- a/cpp/src/gandiva/like_holder.cc +++ b/cpp/src/gandiva/like_holder.cc @@ -29,8 +29,6 @@ namespace helpers { RE2 LikeHolder::starts_with_regex_(R"((\w|\s)*\.\*)"); RE2 LikeHolder::ends_with_regex_(R"(\.\*(\w|\s)*)"); -RE2 LikeHolder::starts_with_plus_one_regex_(R"((\w|\s)*\.)"); -RE2 LikeHolder::ends_with_plus_one_regex_(R"(\.(\w|\s)*)"); // Short-circuit pattern matches for the two common sub cases : // - starts_with and ends_with. @@ -53,18 +51,6 @@ const FunctionNode LikeHolder::TryOptimize(const FunctionNode &node) { std::make_shared<LiteralNode>(literal_type, LiteralHolder(suffix), false); return FunctionNode("ends_with", {node.children().at(0), suffix_node}, node.return_type()); - } else if (RE2::FullMatch(pattern, starts_with_plus_one_regex_)) { - auto prefix = pattern.substr(0, pattern.length() - 1); // trim . - auto prefix_node = - std::make_shared<LiteralNode>(literal_type, LiteralHolder(prefix), false); - return FunctionNode("starts_with_plus_one", {node.children().at(0), prefix_node}, - node.return_type()); - } else if (RE2::FullMatch(pattern, ends_with_plus_one_regex_)) { - auto suffix = pattern.substr(1); // skip . 
- auto suffix_node = - std::make_shared<LiteralNode>(literal_type, LiteralHolder(suffix), false); - return FunctionNode("ends_with_plus_one", {node.children().at(0), suffix_node}, - node.return_type()); } } diff --git a/cpp/src/gandiva/like_holder.h b/cpp/src/gandiva/like_holder.h index be8c928..3a88f4f 100644 --- a/cpp/src/gandiva/like_holder.h +++ b/cpp/src/gandiva/like_holder.h @@ -55,10 +55,6 @@ class LikeHolder : public FunctionHolder { static RE2 starts_with_regex_; // pre-compiled pattern for matching starts_with static RE2 ends_with_regex_; // pre-compiled pattern for matching ends_with - static RE2 starts_with_plus_one_regex_; // pre-compiled pattern for matching - // starts_with_plus_one - static RE2 - ends_with_plus_one_regex_; // pre-compiled pattern for matching ends_with_plus_one }; #ifdef GDV_HELPERS diff --git a/cpp/src/gandiva/like_holder_test.cc b/cpp/src/gandiva/like_holder_test.cc index 97b384d..baaba34 100644 --- a/cpp/src/gandiva/like_holder_test.cc +++ b/cpp/src/gandiva/like_holder_test.cc @@ -95,14 +95,13 @@ TEST_F(TestLikeHolder, TestOptimise) { EXPECT_EQ(fnode.descriptor()->name(), "ends_with"); EXPECT_EQ(fnode.ToString(), "bool ends_with((utf8) in, (const string) xyz)"); - // optimise for 'starts_with_plus_one + // no optimisation for others. fnode = LikeHolder::TryOptimize(BuildLike("xyz_")); - EXPECT_EQ(fnode.ToString(), "bool starts_with_plus_one((utf8) in, (const string) xyz)"); + EXPECT_EQ(fnode.descriptor()->name(), "like"); fnode = LikeHolder::TryOptimize(BuildLike("_xyz")); - EXPECT_EQ(fnode.ToString(), "bool ends_with_plus_one((utf8) in, (const string) xyz)"); + EXPECT_EQ(fnode.descriptor()->name(), "like"); - // no optimisation for others. 
fnode = LikeHolder::TryOptimize(BuildLike("%xyz%")); EXPECT_EQ(fnode.descriptor()->name(), "like"); diff --git a/cpp/src/gandiva/precompiled/CMakeLists.txt b/cpp/src/gandiva/precompiled/CMakeLists.txt index f3f854c..21621b4 100644 --- a/cpp/src/gandiva/precompiled/CMakeLists.txt +++ b/cpp/src/gandiva/precompiled/CMakeLists.txt @@ -21,6 +21,7 @@ set(PRECOMPILED_SRCS arithmetic_ops.cc bitmap.cc context_helper.cc + extended_math_ops.cc hash.cc print.cc sample.cc @@ -57,5 +58,6 @@ add_precompiled_unit_test(epoch_time_point_test.cc) add_precompiled_unit_test(time_test.cc time.cc timestamp_arithmetic.cc context_helper.cc ../execution_context.cc) add_precompiled_unit_test(hash_test.cc hash.cc) add_precompiled_unit_test(sample_test.cc sample.cc) -add_precompiled_unit_test(string_ops_test.cc string_ops.cc) +add_precompiled_unit_test(string_ops_test.cc string_ops.cc context_helper.cc ../execution_context.cc) add_precompiled_unit_test(arithmetic_ops_test.cc arithmetic_ops.cc context_helper.cc ../execution_context.cc) +add_precompiled_unit_test(extended_math_ops_test.cc extended_math_ops.cc context_helper.cc ../execution_context.cc) diff --git a/cpp/src/gandiva/precompiled/arithmetic_ops.cc b/cpp/src/gandiva/precompiled/arithmetic_ops.cc index 36d4076..ae6a0d3 100644 --- a/cpp/src/gandiva/precompiled/arithmetic_ops.cc +++ b/cpp/src/gandiva/precompiled/arithmetic_ops.cc @@ -15,8 +15,6 @@ // specific language governing permissions and limitations // under the License. 
-#include "../execution_context.h" - extern "C" { #include "./types.h" @@ -170,7 +168,7 @@ NUMERIC_BOOL_DATE_FUNCTION(IS_NOT_DISTINCT_FROM) } \ if (in2 == 0) { \ char const* err_msg = "divide by zero error"; \ - set_error_msg(execution_context, err_msg); \ + context_set_error_msg(execution_context, err_msg); \ return 0; \ } \ *out_valid = true; \ diff --git a/cpp/src/gandiva/precompiled/context_helper.cc b/cpp/src/gandiva/precompiled/context_helper.cc index 1c05eda..35dfdf7 100644 --- a/cpp/src/gandiva/precompiled/context_helper.cc +++ b/cpp/src/gandiva/precompiled/context_helper.cc @@ -18,7 +18,7 @@ #include "../execution_context.h" #include "types.h" -void set_error_msg(int64_t context_ptr, char const* err_msg) { +void context_set_error_msg(int64_t context_ptr, char const* err_msg) { gandiva::helpers::ExecutionContext* execution_context_ptr = reinterpret_cast<gandiva::helpers::ExecutionContext*>(context_ptr); (execution_context_ptr)->set_error_msg(err_msg); diff --git a/cpp/src/gandiva/precompiled/extended_math_ops.cc b/cpp/src/gandiva/precompiled/extended_math_ops.cc new file mode 100644 index 0000000..617819a --- /dev/null +++ b/cpp/src/gandiva/precompiled/extended_math_ops.cc @@ -0,0 +1,105 @@ +// Copyright (C) 2017-2018 Dremio Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +extern "C" { + +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "./types.h" + +// Expand the inner fn for types that support extended math. +#define ENUMERIC_TYPES_UNARY(INNER, OUT_TYPE) \ + INNER(int32, OUT_TYPE) \ + INNER(uint32, OUT_TYPE) \ + INNER(int64, OUT_TYPE) \ + INNER(uint64, OUT_TYPE) \ + INNER(float32, OUT_TYPE) \ + INNER(float64, OUT_TYPE) + +// Cubic root +#define CBRT(IN_TYPE, OUT_TYPE) \ + FORCE_INLINE \ + OUT_TYPE cbrt_##IN_TYPE(IN_TYPE in) { return (cbrtl(in)); } + +ENUMERIC_TYPES_UNARY(CBRT, float64) + +// Exponent +#define EXP(IN_TYPE, OUT_TYPE) \ + FORCE_INLINE \ + OUT_TYPE exp_##IN_TYPE(IN_TYPE in) { return (expl(in)); } + +ENUMERIC_TYPES_UNARY(EXP, float64) + +// log +#define LOG(IN_TYPE, OUT_TYPE) \ + FORCE_INLINE \ + OUT_TYPE log_##IN_TYPE(IN_TYPE in) { return (logl(in)); } + +ENUMERIC_TYPES_UNARY(LOG, float64) + +// log base 10 +#define LOG10(IN_TYPE, OUT_TYPE) \ + FORCE_INLINE \ + OUT_TYPE log10_##IN_TYPE(IN_TYPE in) { return (log10l(in)); } + +ENUMERIC_TYPES_UNARY(LOG10, float64) + +FORCE_INLINE +void set_error_for_logbase(int64_t execution_context, double base) { + char const *prefix = "divide by zero error with log of base"; + int size = strlen(prefix) + 64; + char *error = (char *)malloc(size); + snprintf(error, size, "%s %f", prefix, base); + context_set_error_msg(execution_context, error); + free(error); +} + +// log with base +#define LOG_WITH_BASE(IN_TYPE1, IN_TYPE2, OUT_TYPE) \ + FORCE_INLINE \ + OUT_TYPE log_##IN_TYPE1##_##IN_TYPE2(IN_TYPE1 base, boolean is_base_valid, \ + IN_TYPE2 value, boolean is_value_valid, \ + int64 context, boolean *out_valid) { \ + *out_valid = false; \ + if (!is_base_valid || !is_value_valid) { \ + return 0; \ + } \ + OUT_TYPE log_of_base = logl(base); \ + if (log_of_base == 0) { \ + set_error_for_logbase(context, base); \ + return 0; \ + } \ + *out_valid = true; \ + return (logl(value) / logl(base)); \ + } + +LOG_WITH_BASE(int32, int32, float64) 
+LOG_WITH_BASE(uint32, uint32, float64) +LOG_WITH_BASE(int64, int64, float64) +LOG_WITH_BASE(uint64, uint64, float64) +LOG_WITH_BASE(float32, float32, float64) +LOG_WITH_BASE(float64, float64, float64) + +// power +#define POWER(IN_TYPE1, IN_TYPE2, OUT_TYPE) \ + FORCE_INLINE \ + OUT_TYPE power_##IN_TYPE1##_##IN_TYPE2(IN_TYPE1 in1, IN_TYPE2 in2) { \ + return (powl(in1, in2)); \ + } + +POWER(float64, float64, float64) + +} // extern "C" diff --git a/cpp/src/gandiva/precompiled/extended_math_ops_test.cc b/cpp/src/gandiva/precompiled/extended_math_ops_test.cc new file mode 100644 index 0000000..cfee248 --- /dev/null +++ b/cpp/src/gandiva/precompiled/extended_math_ops_test.cc @@ -0,0 +1,81 @@ +// Copyright (C) 2017-2018 Dremio Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include <gtest/gtest.h> +#include "gandiva/execution_context.h" +#include "gandiva/precompiled/types.h" + +namespace gandiva { + +TEST(TestExtendedMathOps, TestCbrt) { + EXPECT_EQ(cbrt_int32(27), 3); + EXPECT_EQ(cbrt_int64(27), 3); + EXPECT_EQ(cbrt_float32(27), 3); + EXPECT_EQ(cbrt_float64(27), 3); + EXPECT_EQ(cbrt_float64(-27), -3); + + EXPECT_EQ(cbrt_float32(15.625), 2.5); + EXPECT_EQ(cbrt_float64(15.625), 2.5); +} + +TEST(TestExtendedMathOps, TestExp) { + double val = 20.085536923187668; + + EXPECT_EQ(exp_int32(3), val); + EXPECT_EQ(exp_int64(3), val); + EXPECT_EQ(exp_float32(3), val); + EXPECT_EQ(exp_float64(3), val); +} + +TEST(TestExtendedMathOps, TestLog) { + double val = 4.1588830833596715; + + EXPECT_EQ(log_int32(64), val); + EXPECT_EQ(log_int64(64), val); + EXPECT_EQ(log_float32(64), val); + EXPECT_EQ(log_float64(64), val); + + EXPECT_EQ(log_int32(0), -std::numeric_limits<double>::infinity()); +} + +TEST(TestExtendedMathOps, TestLog10) { + EXPECT_EQ(log10_int32(100), 2); + EXPECT_EQ(log10_int64(100), 2); + EXPECT_EQ(log10_float32(100), 2); + EXPECT_EQ(log10_float64(100), 2); +} + +TEST(TestExtendedMathOps, TestPower) { + EXPECT_EQ(power_float64_float64(2, 5.4), 42.22425314473263); + EXPECT_EQ(power_float64_float64(5.4, 2), 29.160000000000004); +} + +TEST(TestArithmeticOps, TestLogWithBase) { + boolean is_valid; + gandiva::helpers::ExecutionContext error_holder; + float64 out = log_int32_int32(1, true, 10, true, (int64)&error_holder, &is_valid); + EXPECT_EQ(out, 0); + EXPECT_EQ(is_valid, false); + EXPECT_EQ(error_holder.has_error(), true); + EXPECT_TRUE(error_holder.get_error().find("divide by zero error") != std::string::npos) + << error_holder.get_error(); + + gandiva::helpers::ExecutionContext error_holder1; + out = log_int32_int32(2, true, 64, true, (int64)&error_holder, &is_valid); + EXPECT_EQ(out, 6); + EXPECT_EQ(is_valid, true); + EXPECT_EQ(error_holder1.has_error(), false); +} + +} // namespace gandiva diff --git 
a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 184c241..7fc0501 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -19,6 +19,8 @@ extern "C" { +#include <stdio.h> +#include <stdlib.h> #include <string.h> #include "./types.h" @@ -88,15 +90,62 @@ bool ends_with_utf8_utf8(const char *data, int32 data_len, const char *suffix, } FORCE_INLINE -bool starts_with_plus_one_utf8_utf8(const char *data, int32 data_len, const char *prefix, - int32 prefix_len) { - return ((data_len == prefix_len + 1) && (memcmp(data, prefix, prefix_len) == 0)); +int32 utf8_char_length(char c) { + if (c >= 0) { // 1-byte char + return 1; + } else if ((c & 0xE0) == 0xC0) { // 2-byte char + return 2; + } else if ((c & 0xF0) == 0xE0) { // 3-byte char + return 3; + } else if ((c & 0xF8) == 0xF0) { // 4-byte char + return 4; + } + // invalid char + return 0; } FORCE_INLINE -bool ends_with_plus_one_utf8_utf8(const char *data, int32 data_len, const char *suffix, - int32 suffix_len) { - return ((data_len == suffix_len + 1) && (memcmp(data + 1, suffix, suffix_len) == 0)); +void set_error_for_invalid_utf(int64_t execution_context, char val) { + char const *fmt = "unexpected byte \\%02hhx encountered while decoding utf8 string"; + int size = strlen(fmt) + 64; + char *error = (char *)malloc(size); + snprintf(error, size, fmt, (unsigned char)val); + context_set_error_msg(execution_context, error); + free(error); +} + +// Count the number of utf8 characters +FORCE_INLINE +int32 utf8_length(const char *data, int32 data_len, boolean is_valid, int64 context, + boolean *out_valid) { + *out_valid = false; + if (!is_valid) { + return 0; + } + + int char_len = 0; + int count = 0; + for (int i = 0; i < data_len; i += char_len) { + char_len = utf8_char_length(data[i]); + if (char_len == 0) { + set_error_for_invalid_utf(context, data[i]); + return 0; + } + ++count; + } + *out_valid = true; + return count; } +#define 
UTF8_LENGTH_NULL_INTERNAL(NAME, TYPE) \ + FORCE_INLINE \ + int32 NAME##_##TYPE(TYPE in, int32 in_len, boolean is_valid, int64 context, \ + boolean *out_valid) { \ + return utf8_length(in, in_len, is_valid, context, out_valid); \ + } + +UTF8_LENGTH_NULL_INTERNAL(char_length, utf8) +UTF8_LENGTH_NULL_INTERNAL(length, utf8) +UTF8_LENGTH_NULL_INTERNAL(lengthUtf8, binary) + } // extern "C" diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index b4f522c..64e3264 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -16,8 +16,8 @@ // under the License. #include <gtest/gtest.h> +#include "gandiva/execution_context.h" #include "gandiva/precompiled/types.h" - namespace gandiva { TEST(TestStringOps, TestCompare) { @@ -51,18 +51,31 @@ TEST(TestStringOps, TestBeginsEnds) { EXPECT_TRUE(ends_with_utf8_utf8("sir", 3, "sir", 3)); EXPECT_FALSE(ends_with_utf8_utf8("ir", 2, "sir", 3)); EXPECT_FALSE(ends_with_utf8_utf8("hello", 5, "sir", 3)); +} + +TEST(TestStringOps, TestCharLength) { + bool valid; + + EXPECT_EQ(utf8_length("hello sir", 9, true, 0, &valid), 9); + EXPECT_TRUE(valid); + + std::string a("âpple"); + EXPECT_EQ(utf8_length(a.data(), a.length(), true, 0, &valid), 5); + EXPECT_TRUE(valid); - // starts_with_plus_one - EXPECT_TRUE(starts_with_plus_one_utf8_utf8("hello ", 6, "hello", 5)); - EXPECT_FALSE(starts_with_plus_one_utf8_utf8("hello world", 11, "hello", 5)); - EXPECT_FALSE(starts_with_plus_one_utf8_utf8("hello", 5, "hello", 5)); - EXPECT_FALSE(starts_with_plus_one_utf8_utf8("hell", 4, "hello", 5)); + std::string b("मदन"); + EXPECT_EQ(utf8_length(b.data(), b.length(), true, 0, &valid), 3); + EXPECT_TRUE(valid); - // ends_with_plus_one - EXPECT_TRUE(ends_with_plus_one_utf8_utf8("gworld", 6, "world", 5)); - EXPECT_FALSE(ends_with_plus_one_utf8_utf8("hello world", 11, "world", 5)); - EXPECT_FALSE(ends_with_plus_one_utf8_utf8("world", 5, "world", 5)); - 
EXPECT_FALSE(ends_with_plus_one_utf8_utf8("worl", 4, "world", 5)); + // invalid utf8 + gandiva::helpers::ExecutionContext ctx; + std::string c("\xf8\x28"); + EXPECT_EQ(utf8_length(c.data(), c.length(), true, (int64)&ctx, &valid), 0); + EXPECT_TRUE(ctx.get_error().find( + "unexpected byte \\f8 encountered while decoding utf8 string") != + std::string::npos) + << ctx.get_error(); + EXPECT_FALSE(valid); } } // namespace gandiva diff --git a/cpp/src/gandiva/precompiled/time.cc b/cpp/src/gandiva/precompiled/time.cc index 9a3d6e3..2ac2fd9 100644 --- a/cpp/src/gandiva/precompiled/time.cc +++ b/cpp/src/gandiva/precompiled/time.cc @@ -509,9 +509,8 @@ void set_error_for_date(int32 length, const char *input, const char *msg, int64_t execution_context) { int size = length + strlen(msg) + 1; char *error = (char *)malloc(size); - strcpy(error, msg); - strcat(error, input); - set_error_msg(execution_context, error); + snprintf(error, size, "%s%s", msg, input); + context_set_error_msg(execution_context, error); free(error); } diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h index c9ac3c2..de924fa 100644 --- a/cpp/src/gandiva/precompiled/types.h +++ b/cpp/src/gandiva/precompiled/types.h @@ -45,13 +45,12 @@ using binary = char*; #define FORCE_INLINE __attribute__((always_inline)) #endif -// Declarations : used in testing - extern "C" { bool bitMapGetBit(const unsigned char* bmap, int position); void bitMapSetBit(unsigned char* bmap, int position, bool value); void bitMapClearBitIfFalse(unsigned char* bmap, int position, bool value); +void context_set_error_msg(int64_t context_ptr, const char *err_msg); int64 extractMillennium_timestamp(timestamp millis); int64 extractCentury_timestamp(timestamp millis); @@ -126,6 +125,31 @@ int32 mod_int64_int32(int64 left, int32 right); int64 divide_int64_int64(int64 in1, boolean is_valid1, int64 in2, boolean is_valid2, int64 error_holder, bool *out_valid); +float64 cbrt_int32(int32); +float64 
cbrt_int64(int64); +float64 cbrt_float32(float32); +float64 cbrt_float64(float64); + +float64 exp_int32(int32); +float64 exp_int64(int64); +float64 exp_float32(float32); +float64 exp_float64(float64); + +float64 log_int32(int32); +float64 log_int64(int64); +float64 log_float32(float32); +float64 log_float64(float64); + +float64 log10_int32(int32); +float64 log10_int64(int64); +float64 log10_float32(float32); +float64 log10_float64(float64); + +float64 power_float64_float64(float64, float64); + +float64 log_int32_int32(int32 base, boolean is_base_valid, int32 value, + boolean is_value_valid, int64 context, boolean *out_valid); + bool starts_with_utf8_utf8(const char *data, int32 data_len, const char *prefix, int32 prefix_len); bool ends_with_utf8_utf8(const char *data, int32 data_len, const char *suffix, @@ -135,11 +159,12 @@ bool starts_with_plus_one_utf8_utf8(const char *data, int32 data_len, const char bool ends_with_plus_one_utf8_utf8(const char *data, int32 data_len, const char *suffix, int32 suffix_len); +int32 utf8_length(const char *data, int32 data_len, boolean is_valid, int64 context, + boolean *out_valid); + date64 castDATE_utf8(const char *input, int32 length, boolean is_valid1, int64_t execution_context, boolean *out_valid); -void set_error_msg(int64_t context_ptr, char const *err_msg); - } // extern "C" #endif // PRECOMPILED_TYPES_H diff --git a/cpp/src/gandiva/tests/projector_test.cc b/cpp/src/gandiva/tests/projector_test.cc index a7f71ec..f6feb9c 100644 --- a/cpp/src/gandiva/tests/projector_test.cc +++ b/cpp/src/gandiva/tests/projector_test.cc @@ -320,6 +320,83 @@ TEST_F(TestProjector, TestAllIntTypes) { TestArithmeticOpsForType<arrow::Int64Type, int64_t>(pool_); } +TEST_F(TestProjector, TestExtendedMath) { + // schema for input fields + auto field0 = arrow::field("f0", arrow::float64()); + auto field1 = arrow::field("f1", arrow::float64()); + auto schema = arrow::schema({field0, field1}); + + // output fields + auto field_cbrt = arrow::field("cbrt", 
arrow::float64()); + auto field_exp = arrow::field("exp", arrow::float64()); + auto field_log = arrow::field("log", arrow::float64()); + auto field_log10 = arrow::field("log10", arrow::float64()); + auto field_logb = arrow::field("logb", arrow::float64()); + auto field_power = arrow::field("power", arrow::float64()); + + // Build expression + auto cbrt_expr = TreeExprBuilder::MakeExpression("cbrt", {field0}, field_cbrt); + auto exp_expr = TreeExprBuilder::MakeExpression("exp", {field0}, field_exp); + auto log_expr = TreeExprBuilder::MakeExpression("log", {field0}, field_log); + auto log10_expr = TreeExprBuilder::MakeExpression("log10", {field0}, field_log10); + auto logb_expr = TreeExprBuilder::MakeExpression("log", {field0, field1}, field_logb); + auto power_expr = + TreeExprBuilder::MakeExpression("power", {field0, field1}, field_power); + + std::shared_ptr<Projector> projector; + Status status = Projector::Make( + schema, {cbrt_expr, exp_expr, log_expr, log10_expr, logb_expr, power_expr}, + &projector); + EXPECT_TRUE(status.ok()); + + // Create a row-batch with some sample data + int num_records = 4; + std::vector<double> input0 = {16, 10, -14, 8.3}; + std::vector<double> input1 = {2, 3, 5, 7}; + std::vector<bool> validity = {true, true, true, true}; + + auto array0 = MakeArrowArray<arrow::DoubleType, double>(input0, validity); + auto array1 = MakeArrowArray<arrow::DoubleType, double>(input1, validity); + + // expected output + std::vector<double> cbrt_vals; + std::vector<double> exp_vals; + std::vector<double> log_vals; + std::vector<double> log10_vals; + std::vector<double> logb_vals; + std::vector<double> power_vals; + for (int i = 0; i < num_records; i++) { + cbrt_vals.push_back(cbrtl(input0[i])); + exp_vals.push_back(expl(input0[i])); + log_vals.push_back(logl(input0[i])); + log10_vals.push_back(log10l(input0[i])); + logb_vals.push_back(logl(input1[i]) / logl(input0[i])); + power_vals.push_back(powl(input0[i], input1[i])); + } + auto expected_cbrt = 
MakeArrowArray<arrow::DoubleType, double>(cbrt_vals, validity); + auto expected_exp = MakeArrowArray<arrow::DoubleType, double>(exp_vals, validity); + auto expected_log = MakeArrowArray<arrow::DoubleType, double>(log_vals, validity); + auto expected_log10 = MakeArrowArray<arrow::DoubleType, double>(log10_vals, validity); + auto expected_logb = MakeArrowArray<arrow::DoubleType, double>(logb_vals, validity); + auto expected_power = MakeArrowArray<arrow::DoubleType, double>(power_vals, validity); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, array1}); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_TRUE(status.ok()); + + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(expected_cbrt, outputs.at(0)); + EXPECT_ARROW_ARRAY_EQUALS(expected_exp, outputs.at(1)); + EXPECT_ARROW_ARRAY_EQUALS(expected_log, outputs.at(2)); + EXPECT_ARROW_ARRAY_EQUALS(expected_log10, outputs.at(3)); + EXPECT_ARROW_ARRAY_EQUALS(expected_logb, outputs.at(4)); + EXPECT_ARROW_ARRAY_EQUALS(expected_power, outputs.at(5)); +} + TEST_F(TestProjector, TestFloatLessThan) { // schema for input fields auto field0 = field("f0", float32()); diff --git a/cpp/src/gandiva/tests/utf8_test.cc b/cpp/src/gandiva/tests/utf8_test.cc index 3e3a495..63d49dd 100644 --- a/cpp/src/gandiva/tests/utf8_test.cc +++ b/cpp/src/gandiva/tests/utf8_test.cc @@ -46,10 +46,12 @@ TEST_F(TestUtf8, TestSimple) { // output fields auto res_1 = field("res1", int32()); auto res_2 = field("res2", boolean()); + auto res_3 = field("res3", int32()); // build expressions. 
// octet_length(a) // octet_length(a) == bit_length(a) / 8 + // length(a) auto expr_a = TreeExprBuilder::MakeExpression("octet_length", {field_a}, res_1); auto node_a = TreeExprBuilder::MakeField(field_a); @@ -60,20 +62,23 @@ TEST_F(TestUtf8, TestSimple) { auto is_equal = TreeExprBuilder::MakeFunction("equal", {octet_length, div_8}, boolean()); auto expr_b = TreeExprBuilder::MakeExpression(is_equal, res_2); + auto expr_c = TreeExprBuilder::MakeExpression("length", {field_a}, res_3); // Build a projector for the expressions. std::shared_ptr<Projector> projector; - Status status = Projector::Make(schema, {expr_a, expr_b}, &projector); + Status status = Projector::Make(schema, {expr_a, expr_b, expr_c}, &projector); EXPECT_TRUE(status.ok()) << status.message(); // Create a row-batch with some sample data - int num_records = 4; - auto array_a = - MakeArrowArrayUtf8({"foo", "hello", "bye", "hi"}, {true, true, false, true}); + int num_records = 5; + auto array_a = MakeArrowArrayUtf8({"foo", "hello", "bye", "hi", "मदन"}, + {true, true, false, true, true}); // expected output - auto exp_1 = MakeArrowArrayInt32({3, 5, 0, 2}, {true, true, false, true}); - auto exp_2 = MakeArrowArrayBool({true, true, false, true}, {true, true, false, true}); + auto exp_1 = MakeArrowArrayInt32({3, 5, 0, 2, 9}, {true, true, false, true, true}); + auto exp_2 = MakeArrowArrayBool({true, true, false, true, true}, + {true, true, false, true, true}); + auto exp_3 = MakeArrowArrayInt32({3, 5, 0, 2, 3}, {true, true, false, true, true}); // prepare input record batch auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a}); @@ -86,6 +91,7 @@ TEST_F(TestUtf8, TestSimple) { // Validate results EXPECT_ARROW_ARRAY_EQUALS(exp_1, outputs.at(0)); EXPECT_ARROW_ARRAY_EQUALS(exp_2, outputs.at(1)); + EXPECT_ARROW_ARRAY_EQUALS(exp_3, outputs.at(2)); } TEST_F(TestUtf8, TestLiteral) {
