[arrow] 16/17: [Gandiva] math functions, utf8_length

wesm Wed, 10 Oct 2018 01:03:01 -0700

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


commit 478f2f6254932cb59680c568c3dfff56185a1967
Author: Pindikura Ravindra <[email protected]>
AuthorDate: Mon Oct 1 23:02:23 2018 +0530

    [Gandiva] math functions, utf8_length
    
    - removed short-circuit for beginsWithPlusOne since it doesn't work with
      multi-byte characters in utf8
---
 cpp/src/gandiva/function_registry.cc               |  71 ++++++++++++--
 cpp/src/gandiva/like_holder.cc                     |  14 ---
 cpp/src/gandiva/like_holder.h                      |   4 -
 cpp/src/gandiva/like_holder_test.cc                |   7 +-
 cpp/src/gandiva/precompiled/CMakeLists.txt         |   4 +-
 cpp/src/gandiva/precompiled/arithmetic_ops.cc      |   4 +-
 cpp/src/gandiva/precompiled/context_helper.cc      |   2 +-
 cpp/src/gandiva/precompiled/extended_math_ops.cc   | 105 +++++++++++++++++++++
 .../gandiva/precompiled/extended_math_ops_test.cc  |  81 ++++++++++++++++
 cpp/src/gandiva/precompiled/string_ops.cc          |  61 ++++++++++--
 cpp/src/gandiva/precompiled/string_ops_test.cc     |  35 ++++---
 cpp/src/gandiva/precompiled/time.cc                |   5 +-
 cpp/src/gandiva/precompiled/types.h                |  33 ++++++-
 cpp/src/gandiva/tests/projector_test.cc            |  77 +++++++++++++++
 cpp/src/gandiva/tests/utf8_test.cc                 |  18 ++--
 15 files changed, 457 insertions(+), 64 deletions(-)

diff --git a/cpp/src/gandiva/function_registry.cc 
b/cpp/src/gandiva/function_registry.cc
index 0f4d80b..6e5bc23 100644
--- a/cpp/src/gandiva/function_registry.cc
+++ b/cpp/src/gandiva/function_registry.cc
@@ -49,11 +49,19 @@ using std::vector;
   NativeFunction(#NAME, DataTypeVector{TYPE(), TYPE()}, TYPE(), 
RESULT_NULL_IF_NULL, \
                  STRINGIFY(NAME##_##TYPE##_##TYPE))
 
-// Divide fubnction
-#define DIVIDE(NAME, TYPE)                                                     
       \
-  NativeFunction(#NAME, DataTypeVector{TYPE(), TYPE()}, TYPE(), 
RESULT_NULL_INTERNAL, \
-                 STRINGIFY(NAME##_##TYPE##_##TYPE), false /* does not need 
holder */, \
-                 true /* can return error */)
+// Binary functions that :
+// - have the same input type for both params
+// - NULL handling is of type NULL_INTERNAL
+// - can return error.
+//
+// The pre-compiled fn name includes the base name & input type names. eg. 
add_int32_int32
+#define BINARY_UNSAFE_NULL_INTERNAL(NAME, IN_TYPE, OUT_TYPE)                   
 \
+  NativeFunction(#NAME, DataTypeVector{IN_TYPE(), IN_TYPE()}, OUT_TYPE(),      
 \
+                 RESULT_NULL_INTERNAL, 
STRINGIFY(NAME##_##IN_TYPE##_##IN_TYPE), \
+                 false /* does not need holder */, true /* can return error */)
+
+// Divide function
+#define DIVIDE(NAME, TYPE) BINARY_UNSAFE_NULL_INTERNAL(NAME, TYPE, TYPE)
 
 // Binary functions that :
 // - have different input types, or output type
@@ -91,6 +99,15 @@ using std::vector;
   NativeFunction(#NAME, DataTypeVector{TYPE()}, boolean(), RESULT_NULL_NEVER, \
                  STRINGIFY(NAME##_##TYPE))
 
+// Unary functions that :
+// - NULL handling is of type NULL_INTERNAL
+//
+// The pre-compiled fn name includes the base name & input type name. eg. 
castFloat_int32
+#define UNARY_UNSAFE_NULL_INTERNAL(NAME, IN_TYPE, OUT_TYPE)                    
      \
+  NativeFunction(#NAME, DataTypeVector{IN_TYPE()}, OUT_TYPE(), 
RESULT_NULL_INTERNAL, \
+                 STRINGIFY(NAME##_##IN_TYPE), false /* does not need holder 
*/,      \
+                 true /* can return error */)
+
 // Binary functions that :
 // - NULL handling is of type NULL_NEVER
 //
@@ -193,6 +210,44 @@ NativeFunction FunctionRegistry::pc_registry_[] = {
     UNARY_SAFE_NULL_IF_NULL(castFLOAT8, float32, float64),
     UNARY_SAFE_NULL_IF_NULL(castDATE, int64, date64),
 
+    // extended math ops
+    UNARY_SAFE_NULL_IF_NULL(cbrt, int32, float64),
+    UNARY_SAFE_NULL_IF_NULL(cbrt, int64, float64),
+    UNARY_SAFE_NULL_IF_NULL(cbrt, uint32, float64),
+    UNARY_SAFE_NULL_IF_NULL(cbrt, uint64, float64),
+    UNARY_SAFE_NULL_IF_NULL(cbrt, float32, float64),
+    UNARY_SAFE_NULL_IF_NULL(cbrt, float64, float64),
+
+    UNARY_SAFE_NULL_IF_NULL(exp, int32, float64),
+    UNARY_SAFE_NULL_IF_NULL(exp, int64, float64),
+    UNARY_SAFE_NULL_IF_NULL(exp, uint32, float64),
+    UNARY_SAFE_NULL_IF_NULL(exp, uint64, float64),
+    UNARY_SAFE_NULL_IF_NULL(exp, float32, float64),
+    UNARY_SAFE_NULL_IF_NULL(exp, float64, float64),
+
+    UNARY_SAFE_NULL_IF_NULL(log, int32, float64),
+    UNARY_SAFE_NULL_IF_NULL(log, int64, float64),
+    UNARY_SAFE_NULL_IF_NULL(log, uint32, float64),
+    UNARY_SAFE_NULL_IF_NULL(log, uint64, float64),
+    UNARY_SAFE_NULL_IF_NULL(log, float32, float64),
+    UNARY_SAFE_NULL_IF_NULL(log, float64, float64),
+
+    UNARY_SAFE_NULL_IF_NULL(log10, int32, float64),
+    UNARY_SAFE_NULL_IF_NULL(log10, int64, float64),
+    UNARY_SAFE_NULL_IF_NULL(log10, uint32, float64),
+    UNARY_SAFE_NULL_IF_NULL(log10, uint64, float64),
+    UNARY_SAFE_NULL_IF_NULL(log10, float32, float64),
+    UNARY_SAFE_NULL_IF_NULL(log10, float64, float64),
+
+    BINARY_UNSAFE_NULL_INTERNAL(log, int32, float64),
+    BINARY_UNSAFE_NULL_INTERNAL(log, int64, float64),
+    BINARY_UNSAFE_NULL_INTERNAL(log, uint32, float64),
+    BINARY_UNSAFE_NULL_INTERNAL(log, uint64, float64),
+    BINARY_UNSAFE_NULL_INTERNAL(log, float32, float64),
+    BINARY_UNSAFE_NULL_INTERNAL(log, float64, float64),
+
+    BINARY_SYMMETRIC_SAFE_NULL_IF_NULL(power, float64),
+
     // nullable never operations
     NUMERIC_BOOL_DATE_TYPES(UNARY_SAFE_NULL_NEVER_BOOL, isnull),
     NUMERIC_BOOL_DATE_TYPES(UNARY_SAFE_NULL_NEVER_BOOL, isnotnull),
@@ -346,6 +401,10 @@ NativeFunction FunctionRegistry::pc_registry_[] = {
     UNARY_SAFE_NULL_IF_NULL(octet_length, binary, int32),
     UNARY_SAFE_NULL_IF_NULL(bit_length, utf8, int32),
     UNARY_SAFE_NULL_IF_NULL(bit_length, binary, int32),
+    UNARY_UNSAFE_NULL_INTERNAL(char_length, utf8, int32),
+    UNARY_UNSAFE_NULL_INTERNAL(length, utf8, int32),
+    UNARY_UNSAFE_NULL_INTERNAL(lengthUtf8, binary, int32),
+
     VAR_LEN_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, equal),
     VAR_LEN_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, not_equal),
     VAR_LEN_TYPES(BINARY_RELATIONAL_SAFE_NULL_IF_NULL, less_than),
@@ -355,8 +414,6 @@ NativeFunction FunctionRegistry::pc_registry_[] = {
 
     BINARY_RELATIONAL_SAFE_NULL_IF_NULL(starts_with, utf8),
     BINARY_RELATIONAL_SAFE_NULL_IF_NULL(ends_with, utf8),
-    BINARY_RELATIONAL_SAFE_NULL_IF_NULL(starts_with_plus_one, utf8),
-    BINARY_RELATIONAL_SAFE_NULL_IF_NULL(ends_with_plus_one, utf8),
 
     NativeFunction("like", DataTypeVector{utf8(), utf8()}, boolean(), 
RESULT_NULL_IF_NULL,
                    "like_utf8_utf8", true /*needs_holder*/),
diff --git a/cpp/src/gandiva/like_holder.cc b/cpp/src/gandiva/like_holder.cc
index 6c35c3a..b790fb3 100644
--- a/cpp/src/gandiva/like_holder.cc
+++ b/cpp/src/gandiva/like_holder.cc
@@ -29,8 +29,6 @@ namespace helpers {
 
 RE2 LikeHolder::starts_with_regex_(R"((\w|\s)*\.\*)");
 RE2 LikeHolder::ends_with_regex_(R"(\.\*(\w|\s)*)");
-RE2 LikeHolder::starts_with_plus_one_regex_(R"((\w|\s)*\.)");
-RE2 LikeHolder::ends_with_plus_one_regex_(R"(\.(\w|\s)*)");
 
 // Short-circuit pattern matches for the two common sub cases :
 // - starts_with and ends_with.
@@ -53,18 +51,6 @@ const FunctionNode LikeHolder::TryOptimize(const 
FunctionNode &node) {
           std::make_shared<LiteralNode>(literal_type, LiteralHolder(suffix), 
false);
       return FunctionNode("ends_with", {node.children().at(0), suffix_node},
                           node.return_type());
-    } else if (RE2::FullMatch(pattern, starts_with_plus_one_regex_)) {
-      auto prefix = pattern.substr(0, pattern.length() - 1);  // trim .
-      auto prefix_node =
-          std::make_shared<LiteralNode>(literal_type, LiteralHolder(prefix), 
false);
-      return FunctionNode("starts_with_plus_one", {node.children().at(0), 
prefix_node},
-                          node.return_type());
-    } else if (RE2::FullMatch(pattern, ends_with_plus_one_regex_)) {
-      auto suffix = pattern.substr(1);  // skip .
-      auto suffix_node =
-          std::make_shared<LiteralNode>(literal_type, LiteralHolder(suffix), 
false);
-      return FunctionNode("ends_with_plus_one", {node.children().at(0), 
suffix_node},
-                          node.return_type());
     }
   }
 
diff --git a/cpp/src/gandiva/like_holder.h b/cpp/src/gandiva/like_holder.h
index be8c928..3a88f4f 100644
--- a/cpp/src/gandiva/like_holder.h
+++ b/cpp/src/gandiva/like_holder.h
@@ -55,10 +55,6 @@ class LikeHolder : public FunctionHolder {
 
   static RE2 starts_with_regex_;  // pre-compiled pattern for matching 
starts_with
   static RE2 ends_with_regex_;    // pre-compiled pattern for matching 
ends_with
-  static RE2 starts_with_plus_one_regex_;  // pre-compiled pattern for matching
-                                           // starts_with_plus_one
-  static RE2
-      ends_with_plus_one_regex_;  // pre-compiled pattern for matching 
ends_with_plus_one
 };
 
 #ifdef GDV_HELPERS
diff --git a/cpp/src/gandiva/like_holder_test.cc 
b/cpp/src/gandiva/like_holder_test.cc
index 97b384d..baaba34 100644
--- a/cpp/src/gandiva/like_holder_test.cc
+++ b/cpp/src/gandiva/like_holder_test.cc
@@ -95,14 +95,13 @@ TEST_F(TestLikeHolder, TestOptimise) {
   EXPECT_EQ(fnode.descriptor()->name(), "ends_with");
   EXPECT_EQ(fnode.ToString(), "bool ends_with((utf8) in, (const string) xyz)");
 
-  // optimise for 'starts_with_plus_one
+  // no optimisation for others.
   fnode = LikeHolder::TryOptimize(BuildLike("xyz_"));
-  EXPECT_EQ(fnode.ToString(), "bool starts_with_plus_one((utf8) in, (const 
string) xyz)");
+  EXPECT_EQ(fnode.descriptor()->name(), "like");
 
   fnode = LikeHolder::TryOptimize(BuildLike("_xyz"));
-  EXPECT_EQ(fnode.ToString(), "bool ends_with_plus_one((utf8) in, (const 
string) xyz)");
+  EXPECT_EQ(fnode.descriptor()->name(), "like");
 
-  // no optimisation for others.
   fnode = LikeHolder::TryOptimize(BuildLike("%xyz%"));
   EXPECT_EQ(fnode.descriptor()->name(), "like");
 
diff --git a/cpp/src/gandiva/precompiled/CMakeLists.txt 
b/cpp/src/gandiva/precompiled/CMakeLists.txt
index f3f854c..21621b4 100644
--- a/cpp/src/gandiva/precompiled/CMakeLists.txt
+++ b/cpp/src/gandiva/precompiled/CMakeLists.txt
@@ -21,6 +21,7 @@ set(PRECOMPILED_SRCS
     arithmetic_ops.cc
     bitmap.cc
     context_helper.cc
+    extended_math_ops.cc
     hash.cc
     print.cc
     sample.cc
@@ -57,5 +58,6 @@ add_precompiled_unit_test(epoch_time_point_test.cc)
 add_precompiled_unit_test(time_test.cc time.cc timestamp_arithmetic.cc 
context_helper.cc ../execution_context.cc)
 add_precompiled_unit_test(hash_test.cc hash.cc)
 add_precompiled_unit_test(sample_test.cc sample.cc)
-add_precompiled_unit_test(string_ops_test.cc string_ops.cc)
+add_precompiled_unit_test(string_ops_test.cc string_ops.cc context_helper.cc 
../execution_context.cc)
 add_precompiled_unit_test(arithmetic_ops_test.cc arithmetic_ops.cc 
context_helper.cc ../execution_context.cc)
+add_precompiled_unit_test(extended_math_ops_test.cc extended_math_ops.cc 
context_helper.cc ../execution_context.cc)
diff --git a/cpp/src/gandiva/precompiled/arithmetic_ops.cc 
b/cpp/src/gandiva/precompiled/arithmetic_ops.cc
index 36d4076..ae6a0d3 100644
--- a/cpp/src/gandiva/precompiled/arithmetic_ops.cc
+++ b/cpp/src/gandiva/precompiled/arithmetic_ops.cc
@@ -15,8 +15,6 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "../execution_context.h"
-
 extern "C" {
 
 #include "./types.h"
@@ -170,7 +168,7 @@ NUMERIC_BOOL_DATE_FUNCTION(IS_NOT_DISTINCT_FROM)
     }                                                                          
         \
     if (in2 == 0) {                                                            
         \
       char const* err_msg = "divide by zero error";                            
         \
-      set_error_msg(execution_context, err_msg);                               
         \
+      context_set_error_msg(execution_context, err_msg);                       
         \
       return 0;                                                                
         \
     }                                                                          
         \
     *out_valid = true;                                                         
         \
diff --git a/cpp/src/gandiva/precompiled/context_helper.cc 
b/cpp/src/gandiva/precompiled/context_helper.cc
index 1c05eda..35dfdf7 100644
--- a/cpp/src/gandiva/precompiled/context_helper.cc
+++ b/cpp/src/gandiva/precompiled/context_helper.cc
@@ -18,7 +18,7 @@
 #include "../execution_context.h"
 #include "types.h"
 
-void set_error_msg(int64_t context_ptr, char const* err_msg) {
+void context_set_error_msg(int64_t context_ptr, char const* err_msg) {
   gandiva::helpers::ExecutionContext* execution_context_ptr =
       reinterpret_cast<gandiva::helpers::ExecutionContext*>(context_ptr);
   (execution_context_ptr)->set_error_msg(err_msg);
diff --git a/cpp/src/gandiva/precompiled/extended_math_ops.cc 
b/cpp/src/gandiva/precompiled/extended_math_ops.cc
new file mode 100644
index 0000000..617819a
--- /dev/null
+++ b/cpp/src/gandiva/precompiled/extended_math_ops.cc
@@ -0,0 +1,105 @@
+// Copyright (C) 2017-2018 Dremio Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+extern "C" {
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "./types.h"
+
+// Expand the inner fn for types that support extended math.
+#define ENUMERIC_TYPES_UNARY(INNER, OUT_TYPE) \
+  INNER(int32, OUT_TYPE)                      \
+  INNER(uint32, OUT_TYPE)                     \
+  INNER(int64, OUT_TYPE)                      \
+  INNER(uint64, OUT_TYPE)                     \
+  INNER(float32, OUT_TYPE)                    \
+  INNER(float64, OUT_TYPE)
+
+// Cubic root
+#define CBRT(IN_TYPE, OUT_TYPE) \
+  FORCE_INLINE                  \
+  OUT_TYPE cbrt_##IN_TYPE(IN_TYPE in) { return (cbrtl(in)); }
+
+ENUMERIC_TYPES_UNARY(CBRT, float64)
+
+// Exponent
+#define EXP(IN_TYPE, OUT_TYPE) \
+  FORCE_INLINE                 \
+  OUT_TYPE exp_##IN_TYPE(IN_TYPE in) { return (expl(in)); }
+
+ENUMERIC_TYPES_UNARY(EXP, float64)
+
+// log
+#define LOG(IN_TYPE, OUT_TYPE) \
+  FORCE_INLINE                 \
+  OUT_TYPE log_##IN_TYPE(IN_TYPE in) { return (logl(in)); }
+
+ENUMERIC_TYPES_UNARY(LOG, float64)
+
+// log base 10
+#define LOG10(IN_TYPE, OUT_TYPE) \
+  FORCE_INLINE                   \
+  OUT_TYPE log10_##IN_TYPE(IN_TYPE in) { return (log10l(in)); }
+
+ENUMERIC_TYPES_UNARY(LOG10, float64)
+
+FORCE_INLINE
+void set_error_for_logbase(int64_t execution_context, double base) {
+  char const *prefix = "divide by zero error with log of base";
+  int size = strlen(prefix) + 64;
+  char *error = (char *)malloc(size);
+  snprintf(error, size, "%s %f", prefix, base);
+  context_set_error_msg(execution_context, error);
+  free(error);
+}
+
+// log with base
+#define LOG_WITH_BASE(IN_TYPE1, IN_TYPE2, OUT_TYPE)                            
\
+  FORCE_INLINE                                                                 
\
+  OUT_TYPE log_##IN_TYPE1##_##IN_TYPE2(IN_TYPE1 base, boolean is_base_valid,   
\
+                                       IN_TYPE2 value, boolean is_value_valid, 
\
+                                       int64 context, boolean *out_valid) {    
\
+    *out_valid = false;                                                        
\
+    if (!is_base_valid || !is_value_valid) {                                   
\
+      return 0;                                                                
\
+    }                                                                          
\
+    OUT_TYPE log_of_base = logl(base);                                         
\
+    if (log_of_base == 0) {                                                    
\
+      set_error_for_logbase(context, base);                                    
\
+      return 0;                                                                
\
+    }                                                                          
\
+    *out_valid = true;                                                         
\
+    return (logl(value) / logl(base));                                         
\
+  }
+
+LOG_WITH_BASE(int32, int32, float64)
+LOG_WITH_BASE(uint32, uint32, float64)
+LOG_WITH_BASE(int64, int64, float64)
+LOG_WITH_BASE(uint64, uint64, float64)
+LOG_WITH_BASE(float32, float32, float64)
+LOG_WITH_BASE(float64, float64, float64)
+
+// power
+#define POWER(IN_TYPE1, IN_TYPE2, OUT_TYPE)                            \
+  FORCE_INLINE                                                         \
+  OUT_TYPE power_##IN_TYPE1##_##IN_TYPE2(IN_TYPE1 in1, IN_TYPE2 in2) { \
+    return (powl(in1, in2));                                           \
+  }
+
+POWER(float64, float64, float64)
+
+}  // extern "C"
diff --git a/cpp/src/gandiva/precompiled/extended_math_ops_test.cc 
b/cpp/src/gandiva/precompiled/extended_math_ops_test.cc
new file mode 100644
index 0000000..cfee248
--- /dev/null
+++ b/cpp/src/gandiva/precompiled/extended_math_ops_test.cc
@@ -0,0 +1,81 @@
+// Copyright (C) 2017-2018 Dremio Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include "gandiva/execution_context.h"
+#include "gandiva/precompiled/types.h"
+
+namespace gandiva {
+
+TEST(TestExtendedMathOps, TestCbrt) {
+  EXPECT_EQ(cbrt_int32(27), 3);
+  EXPECT_EQ(cbrt_int64(27), 3);
+  EXPECT_EQ(cbrt_float32(27), 3);
+  EXPECT_EQ(cbrt_float64(27), 3);
+  EXPECT_EQ(cbrt_float64(-27), -3);
+
+  EXPECT_EQ(cbrt_float32(15.625), 2.5);
+  EXPECT_EQ(cbrt_float64(15.625), 2.5);
+}
+
+TEST(TestExtendedMathOps, TestExp) {
+  double val = 20.085536923187668;
+
+  EXPECT_EQ(exp_int32(3), val);
+  EXPECT_EQ(exp_int64(3), val);
+  EXPECT_EQ(exp_float32(3), val);
+  EXPECT_EQ(exp_float64(3), val);
+}
+
+TEST(TestExtendedMathOps, TestLog) {
+  double val = 4.1588830833596715;
+
+  EXPECT_EQ(log_int32(64), val);
+  EXPECT_EQ(log_int64(64), val);
+  EXPECT_EQ(log_float32(64), val);
+  EXPECT_EQ(log_float64(64), val);
+
+  EXPECT_EQ(log_int32(0), -std::numeric_limits<double>::infinity());
+}
+
+TEST(TestExtendedMathOps, TestLog10) {
+  EXPECT_EQ(log10_int32(100), 2);
+  EXPECT_EQ(log10_int64(100), 2);
+  EXPECT_EQ(log10_float32(100), 2);
+  EXPECT_EQ(log10_float64(100), 2);
+}
+
+TEST(TestExtendedMathOps, TestPower) {
+  EXPECT_EQ(power_float64_float64(2, 5.4), 42.22425314473263);
+  EXPECT_EQ(power_float64_float64(5.4, 2), 29.160000000000004);
+}
+
+TEST(TestArithmeticOps, TestLogWithBase) {
+  boolean is_valid;
+  gandiva::helpers::ExecutionContext error_holder;
+  float64 out = log_int32_int32(1, true, 10, true, (int64)&error_holder, 
&is_valid);
+  EXPECT_EQ(out, 0);
+  EXPECT_EQ(is_valid, false);
+  EXPECT_EQ(error_holder.has_error(), true);
+  EXPECT_TRUE(error_holder.get_error().find("divide by zero error") != 
std::string::npos)
+      << error_holder.get_error();
+
+  gandiva::helpers::ExecutionContext error_holder1;
+  out = log_int32_int32(2, true, 64, true, (int64)&error_holder, &is_valid);
+  EXPECT_EQ(out, 6);
+  EXPECT_EQ(is_valid, true);
+  EXPECT_EQ(error_holder1.has_error(), false);
+}
+
+}  // namespace gandiva
diff --git a/cpp/src/gandiva/precompiled/string_ops.cc 
b/cpp/src/gandiva/precompiled/string_ops.cc
index 184c241..7fc0501 100644
--- a/cpp/src/gandiva/precompiled/string_ops.cc
+++ b/cpp/src/gandiva/precompiled/string_ops.cc
@@ -19,6 +19,8 @@
 
 extern "C" {
 
+#include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
 #include "./types.h"
 
@@ -88,15 +90,62 @@ bool ends_with_utf8_utf8(const char *data, int32 data_len, 
const char *suffix,
 }
 
 FORCE_INLINE
-bool starts_with_plus_one_utf8_utf8(const char *data, int32 data_len, const 
char *prefix,
-                                    int32 prefix_len) {
-  return ((data_len == prefix_len + 1) && (memcmp(data, prefix, prefix_len) == 
0));
+int32 utf8_char_length(char c) {
+  if (c >= 0) {  // 1-byte char
+    return 1;
+  } else if ((c & 0xE0) == 0xC0) {  // 2-byte char
+    return 2;
+  } else if ((c & 0xF0) == 0xE0) {  // 3-byte char
+    return 3;
+  } else if ((c & 0xF8) == 0xF0) {  // 4-byte char
+    return 4;
+  }
+  // invalid char
+  return 0;
 }
 
 FORCE_INLINE
-bool ends_with_plus_one_utf8_utf8(const char *data, int32 data_len, const char 
*suffix,
-                                  int32 suffix_len) {
-  return ((data_len == suffix_len + 1) && (memcmp(data + 1, suffix, 
suffix_len) == 0));
+void set_error_for_invalid_utf(int64_t execution_context, char val) {
+  char const *fmt = "unexpected byte \\%02hhx encountered while decoding utf8 
string";
+  int size = strlen(fmt) + 64;
+  char *error = (char *)malloc(size);
+  snprintf(error, size, fmt, (unsigned char)val);
+  context_set_error_msg(execution_context, error);
+  free(error);
+}
+
+// Count the number of utf8 characters
+FORCE_INLINE
+int32 utf8_length(const char *data, int32 data_len, boolean is_valid, int64 
context,
+                  boolean *out_valid) {
+  *out_valid = false;
+  if (!is_valid) {
+    return 0;
+  }
+
+  int char_len = 0;
+  int count = 0;
+  for (int i = 0; i < data_len; i += char_len) {
+    char_len = utf8_char_length(data[i]);
+    if (char_len == 0) {
+      set_error_for_invalid_utf(context, data[i]);
+      return 0;
+    }
+    ++count;
+  }
+  *out_valid = true;
+  return count;
 }
 
+#define UTF8_LENGTH_NULL_INTERNAL(NAME, TYPE)                                 \
+  FORCE_INLINE                                                                \
+  int32 NAME##_##TYPE(TYPE in, int32 in_len, boolean is_valid, int64 context, \
+                      boolean *out_valid) {                                   \
+    return utf8_length(in, in_len, is_valid, context, out_valid);             \
+  }
+
+UTF8_LENGTH_NULL_INTERNAL(char_length, utf8)
+UTF8_LENGTH_NULL_INTERNAL(length, utf8)
+UTF8_LENGTH_NULL_INTERNAL(lengthUtf8, binary)
+
 }  // extern "C"
diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc 
b/cpp/src/gandiva/precompiled/string_ops_test.cc
index b4f522c..64e3264 100644
--- a/cpp/src/gandiva/precompiled/string_ops_test.cc
+++ b/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -16,8 +16,8 @@
 // under the License.
 
 #include <gtest/gtest.h>
+#include "gandiva/execution_context.h"
 #include "gandiva/precompiled/types.h"
-
 namespace gandiva {
 
 TEST(TestStringOps, TestCompare) {
@@ -51,18 +51,31 @@ TEST(TestStringOps, TestBeginsEnds) {
   EXPECT_TRUE(ends_with_utf8_utf8("sir", 3, "sir", 3));
   EXPECT_FALSE(ends_with_utf8_utf8("ir", 2, "sir", 3));
   EXPECT_FALSE(ends_with_utf8_utf8("hello", 5, "sir", 3));
+}
+
+TEST(TestStringOps, TestCharLength) {
+  bool valid;
+
+  EXPECT_EQ(utf8_length("hello sir", 9, true, 0, &valid), 9);
+  EXPECT_TRUE(valid);
+
+  std::string a("âpple");
+  EXPECT_EQ(utf8_length(a.data(), a.length(), true, 0, &valid), 5);
+  EXPECT_TRUE(valid);
 
-  // starts_with_plus_one
-  EXPECT_TRUE(starts_with_plus_one_utf8_utf8("hello ", 6, "hello", 5));
-  EXPECT_FALSE(starts_with_plus_one_utf8_utf8("hello world", 11, "hello", 5));
-  EXPECT_FALSE(starts_with_plus_one_utf8_utf8("hello", 5, "hello", 5));
-  EXPECT_FALSE(starts_with_plus_one_utf8_utf8("hell", 4, "hello", 5));
+  std::string b("मदन");
+  EXPECT_EQ(utf8_length(b.data(), b.length(), true, 0, &valid), 3);
+  EXPECT_TRUE(valid);
 
-  // ends_with_plus_one
-  EXPECT_TRUE(ends_with_plus_one_utf8_utf8("gworld", 6, "world", 5));
-  EXPECT_FALSE(ends_with_plus_one_utf8_utf8("hello world", 11, "world", 5));
-  EXPECT_FALSE(ends_with_plus_one_utf8_utf8("world", 5, "world", 5));
-  EXPECT_FALSE(ends_with_plus_one_utf8_utf8("worl", 4, "world", 5));
+  // invalid utf8
+  gandiva::helpers::ExecutionContext ctx;
+  std::string c("\xf8\x28");
+  EXPECT_EQ(utf8_length(c.data(), c.length(), true, (int64)&ctx, &valid), 0);
+  EXPECT_TRUE(ctx.get_error().find(
+                  "unexpected byte \\f8 encountered while decoding utf8 
string") !=
+              std::string::npos)
+      << ctx.get_error();
+  EXPECT_FALSE(valid);
 }
 
 }  // namespace gandiva
diff --git a/cpp/src/gandiva/precompiled/time.cc 
b/cpp/src/gandiva/precompiled/time.cc
index 9a3d6e3..2ac2fd9 100644
--- a/cpp/src/gandiva/precompiled/time.cc
+++ b/cpp/src/gandiva/precompiled/time.cc
@@ -509,9 +509,8 @@ void set_error_for_date(int32 length, const char *input, 
const char *msg,
                         int64_t execution_context) {
   int size = length + strlen(msg) + 1;
   char *error = (char *)malloc(size);
-  strcpy(error, msg);
-  strcat(error, input);
-  set_error_msg(execution_context, error);
+  snprintf(error, size, "%s%s", msg, input);
+  context_set_error_msg(execution_context, error);
   free(error);
 }
 
diff --git a/cpp/src/gandiva/precompiled/types.h 
b/cpp/src/gandiva/precompiled/types.h
index c9ac3c2..de924fa 100644
--- a/cpp/src/gandiva/precompiled/types.h
+++ b/cpp/src/gandiva/precompiled/types.h
@@ -45,13 +45,12 @@ using binary = char*;
 #define FORCE_INLINE __attribute__((always_inline))
 #endif
 
-// Declarations : used in testing
-
 extern "C" {
 
 bool bitMapGetBit(const unsigned char* bmap, int position);
 void bitMapSetBit(unsigned char* bmap, int position, bool value);
 void bitMapClearBitIfFalse(unsigned char* bmap, int position, bool value);
+void context_set_error_msg(int64_t context_ptr, const char *err_msg);
 
 int64 extractMillennium_timestamp(timestamp millis);
 int64 extractCentury_timestamp(timestamp millis);
@@ -126,6 +125,31 @@ int32 mod_int64_int32(int64 left, int32 right);
 int64 divide_int64_int64(int64 in1, boolean is_valid1, int64 in2, boolean 
is_valid2,
                          int64 error_holder, bool *out_valid);
 
+float64 cbrt_int32(int32);
+float64 cbrt_int64(int64);
+float64 cbrt_float32(float32);
+float64 cbrt_float64(float64);
+
+float64 exp_int32(int32);
+float64 exp_int64(int64);
+float64 exp_float32(float32);
+float64 exp_float64(float64);
+
+float64 log_int32(int32);
+float64 log_int64(int64);
+float64 log_float32(float32);
+float64 log_float64(float64);
+
+float64 log10_int32(int32);
+float64 log10_int64(int64);
+float64 log10_float32(float32);
+float64 log10_float64(float64);
+
+float64 power_float64_float64(float64, float64);
+
+float64 log_int32_int32(int32 base, boolean is_base_valid, int32 value,
+                        boolean is_value_valid, int64 context, boolean 
*out_valid);
+
 bool starts_with_utf8_utf8(const char *data, int32 data_len, const char 
*prefix,
                            int32 prefix_len);
 bool ends_with_utf8_utf8(const char *data, int32 data_len, const char *suffix,
@@ -135,11 +159,12 @@ bool starts_with_plus_one_utf8_utf8(const char *data, 
int32 data_len, const char
 bool ends_with_plus_one_utf8_utf8(const char *data, int32 data_len, const char 
*suffix,
                                   int32 suffix_len);
 
+int32 utf8_length(const char *data, int32 data_len, boolean is_valid, int64 
context,
+                  boolean *out_valid);
+
 date64 castDATE_utf8(const char *input, int32 length, boolean is_valid1,
                      int64_t execution_context, boolean *out_valid);
 
-void set_error_msg(int64_t context_ptr, char const *err_msg);
-
 }  // extern "C"
 
 #endif  // PRECOMPILED_TYPES_H
diff --git a/cpp/src/gandiva/tests/projector_test.cc 
b/cpp/src/gandiva/tests/projector_test.cc
index a7f71ec..f6feb9c 100644
--- a/cpp/src/gandiva/tests/projector_test.cc
+++ b/cpp/src/gandiva/tests/projector_test.cc
@@ -320,6 +320,83 @@ TEST_F(TestProjector, TestAllIntTypes) {
   TestArithmeticOpsForType<arrow::Int64Type, int64_t>(pool_);
 }
 
+TEST_F(TestProjector, TestExtendedMath) {
+  // schema for input fields
+  auto field0 = arrow::field("f0", arrow::float64());
+  auto field1 = arrow::field("f1", arrow::float64());
+  auto schema = arrow::schema({field0, field1});
+
+  // output fields
+  auto field_cbrt = arrow::field("cbrt", arrow::float64());
+  auto field_exp = arrow::field("exp", arrow::float64());
+  auto field_log = arrow::field("log", arrow::float64());
+  auto field_log10 = arrow::field("log10", arrow::float64());
+  auto field_logb = arrow::field("logb", arrow::float64());
+  auto field_power = arrow::field("power", arrow::float64());
+
+  // Build expression
+  auto cbrt_expr = TreeExprBuilder::MakeExpression("cbrt", {field0}, 
field_cbrt);
+  auto exp_expr = TreeExprBuilder::MakeExpression("exp", {field0}, field_exp);
+  auto log_expr = TreeExprBuilder::MakeExpression("log", {field0}, field_log);
+  auto log10_expr = TreeExprBuilder::MakeExpression("log10", {field0}, 
field_log10);
+  auto logb_expr = TreeExprBuilder::MakeExpression("log", {field0, field1}, 
field_logb);
+  auto power_expr =
+      TreeExprBuilder::MakeExpression("power", {field0, field1}, field_power);
+
+  std::shared_ptr<Projector> projector;
+  Status status = Projector::Make(
+      schema, {cbrt_expr, exp_expr, log_expr, log10_expr, logb_expr, 
power_expr},
+      &projector);
+  EXPECT_TRUE(status.ok());
+
+  // Create a row-batch with some sample data
+  int num_records = 4;
+  std::vector<double> input0 = {16, 10, -14, 8.3};
+  std::vector<double> input1 = {2, 3, 5, 7};
+  std::vector<bool> validity = {true, true, true, true};
+
+  auto array0 = MakeArrowArray<arrow::DoubleType, double>(input0, validity);
+  auto array1 = MakeArrowArray<arrow::DoubleType, double>(input1, validity);
+
+  // expected output
+  std::vector<double> cbrt_vals;
+  std::vector<double> exp_vals;
+  std::vector<double> log_vals;
+  std::vector<double> log10_vals;
+  std::vector<double> logb_vals;
+  std::vector<double> power_vals;
+  for (int i = 0; i < num_records; i++) {
+    cbrt_vals.push_back(cbrtl(input0[i]));
+    exp_vals.push_back(expl(input0[i]));
+    log_vals.push_back(logl(input0[i]));
+    log10_vals.push_back(log10l(input0[i]));
+    logb_vals.push_back(logl(input1[i]) / logl(input0[i]));
+    power_vals.push_back(powl(input0[i], input1[i]));
+  }
+  auto expected_cbrt = MakeArrowArray<arrow::DoubleType, double>(cbrt_vals, 
validity);
+  auto expected_exp = MakeArrowArray<arrow::DoubleType, double>(exp_vals, 
validity);
+  auto expected_log = MakeArrowArray<arrow::DoubleType, double>(log_vals, 
validity);
+  auto expected_log10 = MakeArrowArray<arrow::DoubleType, double>(log10_vals, 
validity);
+  auto expected_logb = MakeArrowArray<arrow::DoubleType, double>(logb_vals, 
validity);
+  auto expected_power = MakeArrowArray<arrow::DoubleType, double>(power_vals, 
validity);
+
+  // prepare input record batch
+  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array0, 
array1});
+
+  // Evaluate expression
+  arrow::ArrayVector outputs;
+  status = projector->Evaluate(*in_batch, pool_, &outputs);
+  EXPECT_TRUE(status.ok());
+
+  // Validate results
+  EXPECT_ARROW_ARRAY_EQUALS(expected_cbrt, outputs.at(0));
+  EXPECT_ARROW_ARRAY_EQUALS(expected_exp, outputs.at(1));
+  EXPECT_ARROW_ARRAY_EQUALS(expected_log, outputs.at(2));
+  EXPECT_ARROW_ARRAY_EQUALS(expected_log10, outputs.at(3));
+  EXPECT_ARROW_ARRAY_EQUALS(expected_logb, outputs.at(4));
+  EXPECT_ARROW_ARRAY_EQUALS(expected_power, outputs.at(5));
+}
+
 TEST_F(TestProjector, TestFloatLessThan) {
   // schema for input fields
   auto field0 = field("f0", float32());
diff --git a/cpp/src/gandiva/tests/utf8_test.cc 
b/cpp/src/gandiva/tests/utf8_test.cc
index 3e3a495..63d49dd 100644
--- a/cpp/src/gandiva/tests/utf8_test.cc
+++ b/cpp/src/gandiva/tests/utf8_test.cc
@@ -46,10 +46,12 @@ TEST_F(TestUtf8, TestSimple) {
   // output fields
   auto res_1 = field("res1", int32());
   auto res_2 = field("res2", boolean());
+  auto res_3 = field("res3", int32());
 
   // build expressions.
   // octet_length(a)
   // octet_length(a) == bit_length(a) / 8
+  // length(a)
   auto expr_a = TreeExprBuilder::MakeExpression("octet_length", {field_a}, 
res_1);
 
   auto node_a = TreeExprBuilder::MakeField(field_a);
@@ -60,20 +62,23 @@ TEST_F(TestUtf8, TestSimple) {
   auto is_equal =
       TreeExprBuilder::MakeFunction("equal", {octet_length, div_8}, boolean());
   auto expr_b = TreeExprBuilder::MakeExpression(is_equal, res_2);
+  auto expr_c = TreeExprBuilder::MakeExpression("length", {field_a}, res_3);
 
   // Build a projector for the expressions.
   std::shared_ptr<Projector> projector;
-  Status status = Projector::Make(schema, {expr_a, expr_b}, &projector);
+  Status status = Projector::Make(schema, {expr_a, expr_b, expr_c}, 
&projector);
   EXPECT_TRUE(status.ok()) << status.message();
 
   // Create a row-batch with some sample data
-  int num_records = 4;
-  auto array_a =
-      MakeArrowArrayUtf8({"foo", "hello", "bye", "hi"}, {true, true, false, 
true});
+  int num_records = 5;
+  auto array_a = MakeArrowArrayUtf8({"foo", "hello", "bye", "hi", "मदन"},
+                                    {true, true, false, true, true});
 
   // expected output
-  auto exp_1 = MakeArrowArrayInt32({3, 5, 0, 2}, {true, true, false, true});
-  auto exp_2 = MakeArrowArrayBool({true, true, false, true}, {true, true, 
false, true});
+  auto exp_1 = MakeArrowArrayInt32({3, 5, 0, 2, 9}, {true, true, false, true, 
true});
+  auto exp_2 = MakeArrowArrayBool({true, true, false, true, true},
+                                  {true, true, false, true, true});
+  auto exp_3 = MakeArrowArrayInt32({3, 5, 0, 2, 3}, {true, true, false, true, 
true});
 
   // prepare input record batch
   auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a});
@@ -86,6 +91,7 @@ TEST_F(TestUtf8, TestSimple) {
   // Validate results
   EXPECT_ARROW_ARRAY_EQUALS(exp_1, outputs.at(0));
   EXPECT_ARROW_ARRAY_EQUALS(exp_2, outputs.at(1));
+  EXPECT_ARROW_ARRAY_EQUALS(exp_3, outputs.at(2));
 }
 
 TEST_F(TestUtf8, TestLiteral) {

[arrow] 16/17: [Gandiva] math functions, utf8_length

Reply via email to