This is an automated email from the ASF dual-hosted git repository. wesm pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 1ecda896dad3aec4df01afc52483ee8aa5e49791 Author: praveenbingo <[email protected]> AuthorDate: Mon Oct 1 20:03:56 2018 +0530 [Gandiva] Added more functions. [Gandiva] Added more functions. Added support for to_date and castDATE functions. To Date is a work in progress with some known issues around processing of dates with incorrect timezones. It will be fixed in subsequent PRs. --- cpp/src/gandiva/CMakeLists.txt | 12 +- cpp/src/gandiva/date_utils.cc | 246 +++++++++++++++++++++ cpp/src/gandiva/date_utils.h | 55 +++++ cpp/src/gandiva/function_holder_registry.h | 2 + cpp/src/gandiva/function_holder_stubs.cc | 12 + cpp/src/gandiva/function_registry.cc | 6 + cpp/src/gandiva/precompiled/CMakeLists.txt | 5 +- cpp/src/gandiva/precompiled/arithmetic_ops.cc | 1 - .../{context_helper.h => context_helper.cc} | 1 + cpp/src/gandiva/precompiled/time.cc | 64 ++++++ cpp/src/gandiva/precompiled/time_test.cc | 57 +++++ cpp/src/gandiva/precompiled/types.h | 5 + cpp/src/gandiva/symbols-helpers.map | 2 +- cpp/src/gandiva/tests/utf8_test.cc | 153 +++++++++++++ cpp/src/gandiva/to_date_holder.cc | 117 ++++++++++ cpp/src/gandiva/to_date_holder.h | 59 +++++ cpp/src/gandiva/to_date_holder_test.cc | 130 +++++++++++ 17 files changed, 918 insertions(+), 9 deletions(-) diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 7065800..2a2900b 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -49,7 +49,9 @@ configure_file(bc_file_path.cc.in ${BC_FILE_PATH_CC}) set(SHARED_HELPER_FILES like_holder.cc regex_util.cc - execution_context.cc) + execution_context.cc + to_date_holder.cc + date_utils.cc) set(SRC_FILES annotator.cc bitmap_accumulator.cc @@ -169,15 +171,15 @@ add_gandiva_unit_test(engine_llvm_test.cc engine.cc llvm_types.cc configuration. 
add_gandiva_unit_test(function_signature_test.cc function_signature.cc) add_gandiva_unit_test(function_registry_test.cc function_registry.cc function_signature.cc) add_gandiva_unit_test(llvm_types_test.cc llvm_types.cc) -add_gandiva_unit_test(llvm_generator_test.cc llvm_generator.cc regex_util.cc engine.cc llvm_types.cc expr_decomposer.cc function_registry.cc annotator.cc bitmap_accumulator.cc configuration.cc function_signature.cc like_holder.cc regex_util.cc execution_context.cc ${BC_FILE_PATH_CC}) +add_gandiva_unit_test(llvm_generator_test.cc llvm_generator.cc regex_util.cc engine.cc llvm_types.cc expr_decomposer.cc function_registry.cc annotator.cc bitmap_accumulator.cc configuration.cc function_signature.cc like_holder.cc to_date_holder.cc date_utils.cc regex_util.cc execution_context.cc ${BC_FILE_PATH_CC}) add_gandiva_unit_test(annotator_test.cc annotator.cc function_signature.cc) -add_gandiva_unit_test(tree_expr_test.cc tree_expr_builder.cc expr_decomposer.cc annotator.cc function_registry.cc function_signature.cc like_holder.cc regex_util.cc) -add_gandiva_unit_test(expr_decomposer_test.cc expr_decomposer.cc tree_expr_builder.cc annotator.cc function_registry.cc function_signature.cc like_holder.cc regex_util.cc) +add_gandiva_unit_test(tree_expr_test.cc tree_expr_builder.cc expr_decomposer.cc annotator.cc function_registry.cc function_signature.cc like_holder.cc regex_util.cc to_date_holder.cc date_utils.cc execution_context.cc) +add_gandiva_unit_test(expr_decomposer_test.cc expr_decomposer.cc tree_expr_builder.cc annotator.cc function_registry.cc function_signature.cc like_holder.cc regex_util.cc to_date_holder.cc date_utils.cc execution_context.cc) add_gandiva_unit_test(status_test.cc) add_gandiva_unit_test(expression_registry_test.cc llvm_types.cc expression_registry.cc function_signature.cc function_registry.cc) add_gandiva_unit_test(selection_vector_test.cc selection_vector.cc) add_gandiva_unit_test(lru_cache_test.cc) 
-add_gandiva_unit_test(like_holder_test.cc like_holder.cc regex_util.cc) +add_gandiva_unit_test(to_date_holder_test.cc to_date_holder.cc date_utils.cc execution_context.cc) add_subdirectory(jni) add_subdirectory(precompiled) diff --git a/cpp/src/gandiva/date_utils.cc b/cpp/src/gandiva/date_utils.cc new file mode 100644 index 0000000..3841a88 --- /dev/null +++ b/cpp/src/gandiva/date_utils.cc @@ -0,0 +1,246 @@ +// Copyright (C) 2017-2018 Dremio Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include <algorithm> +#include <memory> +#include <vector> + +#include "gandiva/date_utils.h" + +namespace gandiva { + +#ifdef GDV_HELPERS +namespace helpers { +#endif + +std::vector<std::string> DateUtils::GetMatches(std::string pattern, bool exactMatch) { + // we are case insensitive + std::transform(pattern.begin(), pattern.end(), pattern.begin(), ::tolower); + std::vector<std::string> matches; + + for (const auto& it : sql_date_format_to_boost_map_) { + if (it.first.find(pattern) != std::string::npos && + (!exactMatch || (it.first.length() == pattern.length()))) { + matches.push_back(it.first); + } + } + + return matches; +} + +std::vector<std::string> DateUtils::GetPotentialMatches(std::string pattern) { + return GetMatches(pattern, false); +} + +std::vector<std::string> DateUtils::GetExactMatches(std::string pattern) { + return GetMatches(pattern, true); +} + +/** + * Validates and converts {@param format} to the strptime equivalent + * + * @param format date format + * @return date format converted to strptime format + */ +Status DateUtils::ToInternalFormat(const std::string& format, + std::shared_ptr<std::string>* internal_format) { + std::stringstream builder; + std::stringstream buffer; + bool is_in_quoted_text = false; + + for (uint i = 0; i < format.length(); i++) { + char currentChar = format[i]; + + // logic before we append to the buffer + if (currentChar == '"') { + if (is_in_quoted_text) { + // we are done with a quoted block + is_in_quoted_text = false; + + // use ' for quoting + builder << '\''; + builder << buffer.str(); + builder << '\''; + + // clear buffer + buffer.str(""); + continue; + } else { + if (buffer.str().length() > 0) { + std::stringstream err_msg; + err_msg << "Invalid date format string '" << format << "' at position " << i; + return Status::Invalid(err_msg.str()); + } + + is_in_quoted_text = true; + continue; + } + } + + // handle special characters we want to simply pass through, but only if not in quoted + // and the buffer 
is empty + std::string special_characters = "*-/,.;: "; + if (!is_in_quoted_text && buffer.str().length() == 0 && + (special_characters.find_first_of(currentChar) != std::string::npos)) { + builder << currentChar; + continue; + } + + // append to the buffer + buffer << currentChar; + + // nothing else to do if we are in quoted text + if (is_in_quoted_text) { + continue; + } + + // check how many matches we have for our buffer + std::vector<std::string> potentialList = GetPotentialMatches(buffer.str()); + int potentialCount = potentialList.size(); + + if (potentialCount >= 1) { + // one potential and the length match + if (potentialCount == 1 && potentialList[0].length() == buffer.str().length()) { + // we have a match! + builder << sql_date_format_to_boost_map_[potentialList[0]]; + buffer.str(""); + } else { + // Some patterns (like MON, MONTH) can cause ambiguity, such as "MON:". "MON" + // will have two potential matches, but "MON:" will match nothing, so we want to + // look ahead when we match "MON" and check if adding the next char leads to 0 + // potentials. 
If it does, we go ahead and treat the buffer as matched (if a + // potential match exists that matches the buffer) + if (format.length() - 1 > i) { + std::string lookAheadPattern = (buffer.str() + format.at(i + 1)); + std::transform(lookAheadPattern.begin(), lookAheadPattern.end(), + lookAheadPattern.begin(), ::tolower); + ; + bool lookAheadMatched = false; + + // we can query potentialList to see if it has anything that matches the + // lookahead pattern + for (std::string potential : potentialList) { + if (potential.find(lookAheadPattern) != std::string::npos) { + lookAheadMatched = true; + break; + } + } + + if (!lookAheadMatched) { + // check if any of the potential matches are the same length as our buffer, we + // do not want to match "MO:" + bool matched = false; + for (std::string potential : potentialList) { + if (potential.length() == buffer.str().length()) { + matched = true; + break; + } + } + + if (matched) { + std::string match = buffer.str(); + std::transform(match.begin(), match.end(), match.begin(), ::tolower); + ; + builder << sql_date_format_to_boost_map_[match]; + buffer.str(""); + continue; + } + } + } + } + } else { + // no potential matches found + std::stringstream err_msg; + err_msg << "Invalid date format string '" << format << "' at position " << i; + return Status::Invalid(err_msg.str()); + } + } + + if (buffer.str().length() > 0) { + // Some patterns (like MON, MONTH) can cause us to reach this point with a valid + // buffer value as MON has 2 valid potential matches, so double check here + std::vector<std::string> exactMatches = GetExactMatches(buffer.str()); + if (exactMatches.size() == 1 && exactMatches[0].length() == buffer.str().length()) { + builder << sql_date_format_to_boost_map_[exactMatches[0]]; + } else { + // we didn't successfully parse the entire string + int pos = format.length() - buffer.str().length(); + std::stringstream err_msg; + err_msg << "Invalid date format string '" << format << "' at position " << pos; + return 
Status::Invalid(err_msg.str()); + } + } + std::string final_pattern = builder.str(); + internal_format->reset(new std::string(final_pattern)); + return Status::OK(); +} + +DateUtils::date_format_converter DateUtils::sql_date_format_to_boost_map_ = InitMap(); + +DateUtils::date_format_converter DateUtils::InitMap() { + date_format_converter map; + + // Era + map["ad"] = "%EC"; + map["bc"] = "%EC"; + // Meridian + map["am"] = "%p"; + map["pm"] = "%p"; + // Century + map["cc"] = "%C"; + // Week of year + map["ww"] = "%W"; + // Day of week + map["d"] = "%u"; + // Day name of week + map["dy"] = "%a"; + map["day"] = "%a"; + // Year + map["yyyy"] = "%Y"; + map["yy"] = "%y"; + // Day of year + map["ddd"] = "%j"; + // Month + map["mm"] = "%m"; + map["mon"] = "%b"; + map["month"] = "%b"; + // Day of month + map["dd"] = "%d"; + // Hour of day + map["hh"] = "%I"; + map["hh12"] = "%I"; + map["hh24"] = "%H"; + // Minutes + map["mi"] = "%M"; + // Seconds + map["ss"] = "%S"; + // Milliseconds + map["f"] = "S"; + map["ff"] = "SS"; + map["fff"] = "SSS"; + /* + // Timezone not tested/supported yet fully. + map["tzd"] = "%Z"; + map["tzo"] = "%z"; + map["tzh:tzm"] = "%z"; + */ + + return map; +} + +#ifdef GDV_HELPERS +} // namespace helpers +#endif + +} // namespace gandiva diff --git a/cpp/src/gandiva/date_utils.h b/cpp/src/gandiva/date_utils.h new file mode 100644 index 0000000..664554e --- /dev/null +++ b/cpp/src/gandiva/date_utils.h @@ -0,0 +1,55 @@ +// Copyright (C) 2017-2018 Dremio Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef TO_DATE_HELPER_H +#define TO_DATE_HELPER_H + +#include <unordered_map> +#include <vector> + +#include "gandiva/status.h" + +namespace gandiva { + +#ifdef GDV_HELPERS +namespace helpers { +#endif + +/// \brief Utility class for converting sql date patterns to internal date patterns. +class DateUtils { + public: + static Status ToInternalFormat(const std::string& format, + std::shared_ptr<std::string>* internal_format); + + private: + using date_format_converter = std::unordered_map<std::string, std::string>; + + static date_format_converter sql_date_format_to_boost_map_; + + static date_format_converter InitMap(); + + static std::vector<std::string> GetMatches(std::string pattern, bool exactMatch); + + static std::vector<std::string> GetPotentialMatches(std::string pattern); + + static std::vector<std::string> GetExactMatches(std::string pattern); +}; + +#ifdef GDV_HELPERS +} // namespace helpers +#endif + +} // namespace gandiva + +#endif // TO_DATE_HELPER_H diff --git a/cpp/src/gandiva/function_holder_registry.h b/cpp/src/gandiva/function_holder_registry.h index 02351f2..93996c7 100644 --- a/cpp/src/gandiva/function_holder_registry.h +++ b/cpp/src/gandiva/function_holder_registry.h @@ -25,6 +25,7 @@ #include "gandiva/function_holder.h" #include "gandiva/like_holder.h" #include "gandiva/node.h" +#include "gandiva/to_date_holder.h" #include "gandiva/status.h" namespace gandiva { @@ -59,6 +60,7 @@ class FunctionHolderRegistry { static map_type& makers() { static map_type maker_map = { {"like", LAMBDA_MAKER(LikeHolder)}, + {"to_date", LAMBDA_MAKER(ToDateHolder)}, }; return maker_map; } diff --git a/cpp/src/gandiva/function_holder_stubs.cc b/cpp/src/gandiva/function_holder_stubs.cc index 3d26308..9708674 100644 --- a/cpp/src/gandiva/function_holder_stubs.cc +++ b/cpp/src/gandiva/function_holder_stubs.cc @@ -16,6 +16,7 @@ // under the License. 
#include "gandiva/like_holder.h" +#include "gandiva/to_date_holder.h" // Wrapper C functions for "like" to be invoked from LLVM. extern "C" bool like_utf8_utf8(int64_t ptr, const char* data, int data_len, @@ -24,3 +25,14 @@ extern "C" bool like_utf8_utf8(int64_t ptr, const char* data, int data_len, reinterpret_cast<gandiva::helpers::LikeHolder*>(ptr); return (*holder)(std::string(data, data_len)); } + +extern "C" int64_t to_date_utf8_utf8_int32(int64_t ptr, const char *data, int data_len, + bool in1_validity, const char *pattern, + int pattern_len, bool in2_validity, + int32_t suppress_errors, bool in3_validity, + int64_t execution_context, bool *out_valid) { + gandiva::helpers::ToDateHolder *holder = + reinterpret_cast<gandiva::helpers::ToDateHolder *>(ptr); + return (*holder)(std::string(data, data_len), in1_validity, execution_context, + out_valid); +} diff --git a/cpp/src/gandiva/function_registry.cc b/cpp/src/gandiva/function_registry.cc index c0ead6e..0f4d80b 100644 --- a/cpp/src/gandiva/function_registry.cc +++ b/cpp/src/gandiva/function_registry.cc @@ -361,6 +361,12 @@ NativeFunction FunctionRegistry::pc_registry_[] = { NativeFunction("like", DataTypeVector{utf8(), utf8()}, boolean(), RESULT_NULL_IF_NULL, "like_utf8_utf8", true /*needs_holder*/), + NativeFunction("to_date", DataTypeVector{utf8(), utf8(), int32()}, date64(), + RESULT_NULL_INTERNAL, "to_date_utf8_utf8_int32", true, true), + + NativeFunction("castDATE", DataTypeVector{utf8()}, date64(), RESULT_NULL_INTERNAL, + "castDATE_utf8", false /*needs_holder*/, true /*needs context*/), + // Null internal (sample) NativeFunction("half_or_null", DataTypeVector{int32()}, int32(), RESULT_NULL_INTERNAL, "half_or_null_int32"), diff --git a/cpp/src/gandiva/precompiled/CMakeLists.txt b/cpp/src/gandiva/precompiled/CMakeLists.txt index 891dcce..f3f854c 100644 --- a/cpp/src/gandiva/precompiled/CMakeLists.txt +++ b/cpp/src/gandiva/precompiled/CMakeLists.txt @@ -20,6 +20,7 @@ project(gandiva) set(PRECOMPILED_SRCS 
arithmetic_ops.cc bitmap.cc + context_helper.cc hash.cc print.cc sample.cc @@ -53,8 +54,8 @@ add_custom_target(precompiled ALL DEPENDS ${GANDIVA_BC_OUTPUT_PATH}) # testing add_precompiled_unit_test(bitmap_test.cc bitmap.cc) add_precompiled_unit_test(epoch_time_point_test.cc) -add_precompiled_unit_test(time_test.cc time.cc timestamp_arithmetic.cc) +add_precompiled_unit_test(time_test.cc time.cc timestamp_arithmetic.cc context_helper.cc ../execution_context.cc) add_precompiled_unit_test(hash_test.cc hash.cc) add_precompiled_unit_test(sample_test.cc sample.cc) add_precompiled_unit_test(string_ops_test.cc string_ops.cc) -add_precompiled_unit_test(arithmetic_ops_test.cc arithmetic_ops.cc ../execution_context.cc) +add_precompiled_unit_test(arithmetic_ops_test.cc arithmetic_ops.cc context_helper.cc ../execution_context.cc) diff --git a/cpp/src/gandiva/precompiled/arithmetic_ops.cc b/cpp/src/gandiva/precompiled/arithmetic_ops.cc index de2eb7d..36d4076 100644 --- a/cpp/src/gandiva/precompiled/arithmetic_ops.cc +++ b/cpp/src/gandiva/precompiled/arithmetic_ops.cc @@ -19,7 +19,6 @@ extern "C" { -#include "./context_helper.h" #include "./types.h" // Expand inner macro for all numeric types. 
diff --git a/cpp/src/gandiva/precompiled/context_helper.h b/cpp/src/gandiva/precompiled/context_helper.cc similarity index 98% rename from cpp/src/gandiva/precompiled/context_helper.h rename to cpp/src/gandiva/precompiled/context_helper.cc index d2a2864..1c05eda 100644 --- a/cpp/src/gandiva/precompiled/context_helper.h +++ b/cpp/src/gandiva/precompiled/context_helper.cc @@ -16,6 +16,7 @@ #define GANDIVA_CONTEXT_HELPER_H #include "../execution_context.h" +#include "types.h" void set_error_msg(int64_t context_ptr, char const* err_msg) { gandiva::helpers::ExecutionContext* execution_context_ptr = diff --git a/cpp/src/gandiva/precompiled/time.cc b/cpp/src/gandiva/precompiled/time.cc index 3903da1..9a3d6e3 100644 --- a/cpp/src/gandiva/precompiled/time.cc +++ b/cpp/src/gandiva/precompiled/time.cc @@ -20,6 +20,7 @@ extern "C" { #include <stdlib.h> +#include <string.h> #include <time.h> #include "./time_constants.h" @@ -503,4 +504,67 @@ bool IsLastDayOfMonth(const EpochTimePoint &tp) { DATE_TYPES(MONTHS_BETWEEN) +FORCE_INLINE +void set_error_for_date(int32 length, const char *input, const char *msg, + int64_t execution_context) { + int size = length + strlen(msg) + 1; + char *error = (char *)malloc(size); + strcpy(error, msg); + strcat(error, input); + set_error_msg(execution_context, error); + free(error); +} + +date64 castDATE_utf8(const char *input, int32 length, boolean is_valid1, + int64_t execution_context, boolean *out_valid) { + *out_valid = false; + if (!is_valid1) { + return 0; + } + // format : 0 is year, 1 is month and 2 is day. 
+ int dateFields[3]; + int dateIndex = 0, index = 0, value = 0; + while (dateIndex < 3 && index < length) { + if (!isdigit(input[index])) { + dateFields[dateIndex++] = value; + value = 0; + } else { + value = (value * 10) + (input[index] - '0'); + } + index++; + } + + if (dateIndex < 3) { + // If we reached the end of input, we would have not encountered a separator + // store the last value + dateFields[dateIndex++] = value; + } + const char *msg = "Not a valid date value "; + if (dateIndex != 3) { + set_error_for_date(length, input, msg, execution_context); + return 0; + } + + /* Handle two digit years + * If range of two digits is between 70 - 99 then year = 1970 - 1999 + * Else if two digits is between 00 - 69 = 2000 - 2069 + */ + if (dateFields[0] < 100) { + if (dateFields[0] < 70) { + dateFields[0] += 2000; + } else { + dateFields[0] += 1900; + } + } + date::year_month_day day = + date::year(dateFields[0]) / date::month(dateFields[1]) / date::day(dateFields[2]); + if (!day.ok()) { + set_error_for_date(length, input, msg, execution_context); + return 0; + } + *out_valid = true; + return std::chrono::time_point_cast<std::chrono::milliseconds>(date::sys_days(day)) + .time_since_epoch() + .count(); +} } // extern "C" diff --git a/cpp/src/gandiva/precompiled/time_test.cc b/cpp/src/gandiva/precompiled/time_test.cc index 56317c2..e53f44f 100644 --- a/cpp/src/gandiva/precompiled/time_test.cc +++ b/cpp/src/gandiva/precompiled/time_test.cc @@ -19,6 +19,8 @@ #include <gtest/gtest.h> #include "gandiva/precompiled/types.h" +#include "gandiva/precompiled/date.h" +#include "../execution_context.h" namespace gandiva { @@ -28,6 +30,61 @@ timestamp StringToTimestamp(const char* buf) { return timegm(&tm) * 1000; // to millis } +TEST(TestTime, TestCastDate) { + const char *date = "1967-12-1"; + helpers::ExecutionContext context; + bool valid; + int64_t cast_to_date = castDATE_utf8(date, 9, true, (int64_t)&context, &valid); + EXPECT_EQ(cast_to_date, -65836800000); + 
EXPECT_EQ(valid, true); + + const char *date1 = "1972-12-1"; + cast_to_date = castDATE_utf8(date1, 9, true, (int64_t)&context, &valid); + EXPECT_EQ(cast_to_date, 92016000000); + EXPECT_EQ(valid, true); + + const char *date2 = "1972222222"; + cast_to_date = castDATE_utf8(date2, 10, true, (int64_t)&context, &valid); + EXPECT_EQ(cast_to_date, 0); + EXPECT_EQ(context.get_error(), "Not a valid date value 1972222222"); + EXPECT_EQ(valid, false); + + const char *date3 = "blahblah"; + cast_to_date = castDATE_utf8(date3, 8, true, (int64_t)&context, &valid); + EXPECT_EQ(cast_to_date, 0); + EXPECT_EQ(valid, false); + + const char *date4 = "1967-12-1bb"; + cast_to_date = castDATE_utf8(date4, 11, true, (int64_t)&context, &valid); + EXPECT_EQ(cast_to_date, -65836800000); + EXPECT_EQ(valid, true); + + const char *date5 = "67-12-1"; + cast_to_date = castDATE_utf8(date5, 7, true, (int64_t)&context, &valid); + EXPECT_EQ(cast_to_date, 3089923200000); + EXPECT_EQ(valid, true); + + const char *date6 = "67-1-1"; + cast_to_date = castDATE_utf8(date6, 7, true, (int64_t)&context, &valid); + EXPECT_EQ(cast_to_date, 3061065600000); + EXPECT_EQ(valid, true); + + const char *date7 = "71-1-1"; + cast_to_date = castDATE_utf8(date7, 7, true, (int64_t)&context, &valid); + EXPECT_EQ(cast_to_date, 31536000000); + EXPECT_EQ(valid, true); + + const char *date8 = "71-45-1"; + cast_to_date = castDATE_utf8(date8, 7, true, (int64_t)&context, &valid); + EXPECT_EQ(cast_to_date, 0); + EXPECT_EQ(valid, false); + + const char *date9 = "71-12-XX"; + cast_to_date = castDATE_utf8(date9, 8, true, (int64_t)&context, &valid); + EXPECT_EQ(cast_to_date, 0); + EXPECT_EQ(valid, false); +} + TEST(TestTime, TestExtractTime) { // 10:20:33 int32 time_as_millis_in_day = 37233000; diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h index 7315214..c9ac3c2 100644 --- a/cpp/src/gandiva/precompiled/types.h +++ b/cpp/src/gandiva/precompiled/types.h @@ -135,6 +135,11 @@ bool 
starts_with_plus_one_utf8_utf8(const char *data, int32 data_len, const char bool ends_with_plus_one_utf8_utf8(const char *data, int32 data_len, const char *suffix, int32 suffix_len); +date64 castDATE_utf8(const char *input, int32 length, boolean is_valid1, + int64_t execution_context, boolean *out_valid); + +void set_error_msg(int64_t context_ptr, char const *err_msg); + } // extern "C" #endif // PRECOMPILED_TYPES_H diff --git a/cpp/src/gandiva/symbols-helpers.map b/cpp/src/gandiva/symbols-helpers.map index 8f77d16..48eedd1 100644 --- a/cpp/src/gandiva/symbols-helpers.map +++ b/cpp/src/gandiva/symbols-helpers.map @@ -1,4 +1,4 @@ { - global: extern "C++" { gandiva*; like*;}; + global: extern "C++" { gandiva*; like*;to_date*}; local: *; }; diff --git a/cpp/src/gandiva/tests/utf8_test.cc b/cpp/src/gandiva/tests/utf8_test.cc index 5e49a53..3e3a495 100644 --- a/cpp/src/gandiva/tests/utf8_test.cc +++ b/cpp/src/gandiva/tests/utf8_test.cc @@ -25,7 +25,9 @@ namespace gandiva { using arrow::boolean; +using arrow::date64; using arrow::int32; +using arrow::int64; using arrow::utf8; class TestUtf8 : public ::testing::Test { @@ -262,4 +264,155 @@ TEST_F(TestUtf8, TestBeginsEnds) { EXPECT_ARROW_ARRAY_EQUALS(exp2, outputs.at(1)); } +TEST_F(TestUtf8, TestCastDate) { + // schema for input fields + auto field_a = field("a", utf8()); + auto schema = arrow::schema({field_a}); + + // output fields + auto res_1 = field("res1", int64()); + + // build expressions. + // extractYear(castDATE(a)) + auto node_a = TreeExprBuilder::MakeField(field_a); + auto cast_function = TreeExprBuilder::MakeFunction("castDATE", {node_a}, date64()); + auto extract_year = + TreeExprBuilder::MakeFunction("extractYear", {cast_function}, int64()); + auto expr = TreeExprBuilder::MakeExpression(extract_year, res_1); + + // Build a projector for the expressions. 
+ std::shared_ptr<Projector> projector; + Status status = Projector::Make(schema, {expr}, &projector); + EXPECT_TRUE(status.ok()) << status.message(); + + // Create a row-batch with some sample data + int num_records = 4; + auto array_a = MakeArrowArrayUtf8({"1967-12-1", "67-12-01", "incorrect", "67-45-11"}, + {true, true, false, true}); + + // expected output + auto exp_1 = MakeArrowArrayInt64({1967, 2067, 0, 0}, {true, true, false, false}); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a}); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_EQ(status.code(), StatusCode::ExecutionError); + std::string expected_error = "Not a valid date value "; + EXPECT_TRUE(status.message().find(expected_error) != std::string::npos); + + auto array_a_2 = MakeArrowArrayUtf8({"1967-12-1", "67-12-01", "67-1-1", "91-1-1"}, + {true, true, true, true}); + auto exp_2 = MakeArrowArrayInt64({1967, 2067, 2067, 1991}, {true, true, true, true}); + auto in_batch_2 = arrow::RecordBatch::Make(schema, num_records, {array_a_2}); + arrow::ArrayVector outputs2; + status = projector->Evaluate(*in_batch_2, pool_, &outputs2); + EXPECT_TRUE(status.ok()) << status.message(); + + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(exp_2, outputs2.at(0)); +} + +TEST_F(TestUtf8, TestToDateNoError) { + // schema for input fields + auto field_a = field("a", utf8()); + auto schema = arrow::schema({field_a}); + + // output fields + auto res_1 = field("res1", int64()); + + // build expressions. 
+ // extractYear(castDATE(a)) + auto node_a = TreeExprBuilder::MakeField(field_a); + auto node_b = TreeExprBuilder::MakeStringLiteral("YYYY-MM-DD"); + auto node_c = TreeExprBuilder::MakeLiteral(1); + + auto cast_function = + TreeExprBuilder::MakeFunction("to_date", {node_a, node_b, node_c}, date64()); + auto extract_year = + TreeExprBuilder::MakeFunction("extractYear", {cast_function}, int64()); + auto expr = TreeExprBuilder::MakeExpression(extract_year, res_1); + + // Build a projector for the expressions. + std::shared_ptr<Projector> projector; + Status status = Projector::Make(schema, {expr}, &projector); + EXPECT_TRUE(status.ok()) << status.message(); + + // Create a row-batch with some sample data + int num_records = 4; + auto array_a = MakeArrowArrayUtf8({"1967-12-1", "67-12-01", "incorrect", "67-45-11"}, + {true, true, false, true}); + + // expected output + auto exp_1 = MakeArrowArrayInt64({1967, 67, 0, 0}, {true, true, false, false}); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a}); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_TRUE(status.ok()) << status.message(); + EXPECT_ARROW_ARRAY_EQUALS(exp_1, outputs.at(0)); + + // Create a row-batch with some sample data + auto array_a_2 = MakeArrowArrayUtf8( + {"1967-12-1", "1967-12-01", "1967-11-11", "1991-11-11"}, {true, true, true, true}); + auto exp_2 = MakeArrowArrayInt64({1967, 1967, 1967, 1991}, {true, true, true, true}); + auto in_batch_2 = arrow::RecordBatch::Make(schema, num_records, {array_a_2}); + arrow::ArrayVector outputs2; + status = projector->Evaluate(*in_batch_2, pool_, &outputs2); + EXPECT_TRUE(status.ok()) << status.message(); + + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(exp_2, outputs2.at(0)); +} + +TEST_F(TestUtf8, TestToDateError) { + // schema for input fields + auto field_a = field("a", utf8()); + auto schema = arrow::schema({field_a}); + + // 
output fields + auto res_1 = field("res1", int64()); + + // build expressions. + // extractYear(castDATE(a)) + auto node_a = TreeExprBuilder::MakeField(field_a); + auto node_b = TreeExprBuilder::MakeStringLiteral("YYYY-MM-DD"); + auto node_c = TreeExprBuilder::MakeLiteral(0); + + auto cast_function = + TreeExprBuilder::MakeFunction("to_date", {node_a, node_b, node_c}, date64()); + auto extract_year = + TreeExprBuilder::MakeFunction("extractYear", {cast_function}, int64()); + auto expr = TreeExprBuilder::MakeExpression(extract_year, res_1); + + // Build a projector for the expressions. + std::shared_ptr<Projector> projector; + Status status = Projector::Make(schema, {expr}, &projector); + EXPECT_TRUE(status.ok()) << status.message(); + + // Create a row-batch with some sample data + int num_records = 4; + auto array_a = MakeArrowArrayUtf8({"1967-12-1", "67-12-01", "incorrect", "67-45-11"}, + {true, true, false, true}); + + // expected output + auto exp_1 = MakeArrowArrayInt64({1967, 67, 0, 0}, {true, true, false, false}); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a}); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_EQ(status.code(), StatusCode::ExecutionError); + std::string expected_error = "Error parsing value 67-45-11 for given format"; + EXPECT_TRUE(status.message().find(expected_error) != std::string::npos) + << status.message(); +} + } // namespace gandiva diff --git a/cpp/src/gandiva/to_date_holder.cc b/cpp/src/gandiva/to_date_holder.cc new file mode 100644 index 0000000..96fafd1 --- /dev/null +++ b/cpp/src/gandiva/to_date_holder.cc @@ -0,0 +1,117 @@ +// Copyright (C) 2017-2018 Dremio Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <time.h>  // strptime (POSIX)
+
+#include <algorithm>
+#include <chrono>
+#include <memory>
+#include <string>
+
+#include "gandiva/precompiled/date.h"
+#include "gandiva/date_utils.h"
+#include "gandiva/execution_context.h"
+#include "gandiva/node.h"
+#include "gandiva/to_date_holder.h"
+
+namespace gandiva {
+
+#ifdef GDV_HELPERS
+namespace helpers {
+#endif
+
+// Builds a holder from a 'to_date' function node.
+//
+// The node must have exactly three children. The second child must be a string
+// (or binary) literal holding the date pattern and the third an int32 literal
+// holding the suppress-errors flag. Returns Status::Invalid when any of these
+// checks fail; otherwise forwards to the (pattern, flag) overload below.
+Status ToDateHolder::Make(const FunctionNode &node,
+                          std::shared_ptr<ToDateHolder> *holder) {
+  if (node.children().size() != 3) {
+    return Status::Invalid("'to_date' function requires three parameters");
+  }
+
+  auto literal_pattern = dynamic_cast<LiteralNode *>(node.children().at(1).get());
+  if (literal_pattern == nullptr) {
+    return Status::Invalid(
+        "'to_date' function requires a literal as the second parameter");
+  }
+
+  auto literal_type = literal_pattern->return_type()->id();
+  if (literal_type != arrow::Type::STRING && literal_type != arrow::Type::BINARY) {
+    return Status::Invalid(
+        "'to_date' function requires a string literal as the second parameter");
+  }
+  auto pattern = boost::get<std::string>(literal_pattern->holder());
+
+  auto literal_suppress_errors = dynamic_cast<LiteralNode *>(node.children().at(2).get());
+  // Check the node that was just cast: a non-literal third argument makes the
+  // dynamic_cast yield nullptr, which is dereferenced a few lines below.
+  if (literal_suppress_errors == nullptr) {
+    return Status::Invalid(
+        "'to_date' function requires a int literal as the third parameter");
+  }
+
+  literal_type = literal_suppress_errors->return_type()->id();
+  if (literal_type != arrow::Type::INT32) {
+    return Status::Invalid(
+        "'to_date' function requires a int literal as the third parameter");
+  }
+  auto suppress_errors = boost::get<int>(literal_suppress_errors->holder());
+  return Make(pattern, suppress_errors, holder);
+}
+
+// Builds a holder from a SQL date pattern and a suppress-errors flag.
+//
+// The pattern is first converted with DateUtils::ToInternalFormat; if that
+// conversion fails its status is returned and '*holder' is left untouched.
+Status ToDateHolder::Make(const std::string &sql_pattern, int32_t suppress_errors,
+                          std::shared_ptr<ToDateHolder> *holder) {
+  std::shared_ptr<std::string> transformed_pattern;
+  Status status = DateUtils::ToInternalFormat(sql_pattern, &transformed_pattern);
+  GANDIVA_RETURN_NOT_OK(status);
+  *holder = std::shared_ptr<ToDateHolder>(
+      new ToDateHolder(*transformed_pattern, suppress_errors));
+  return Status::OK();
+}
+
+// Parses 'data' with the holder's pattern and returns the date part (the time
+// of day is dropped) as milliseconds since the epoch.
+//
+// 'execution_context' is the address of an ExecutionContext passed as an
+// integer; it is only used to record an error message. '*out_valid' is set to
+// false when the input is null ('in_valid' is false) or when strptime() cannot
+// parse the value. Every failure path returns 0.
+int64_t ToDateHolder::operator()(const std::string &data, bool in_valid,
+                                 int64_t execution_context, bool *out_valid) {
+  using namespace date;
+  using namespace std::chrono;
+  // Known issues
+  // 1. processes dates that do not match the format.
+  // 2. does not process time in format +08:00 (or) id.
+  *out_valid = false;
+  if (!in_valid) {
+    return 0;
+  }
+  struct tm result = {};
+  char *ret = strptime(data.c_str(), pattern_.c_str(), &result);
+  if (ret == nullptr) {
+    return_error(execution_context, data);
+    return 0;
+  }
+  *out_valid = true;
+  // ignore the time part
+  date::sys_seconds secs =
+      (sys_days{year{result.tm_year + 1900} / (result.tm_mon + 1) / result.tm_mday});
+  int64_t seconds_since_epoch = secs.time_since_epoch().count();
+  // NOTE(review): this reports an error for any input that parses to exactly
+  // 1970-01-01 and leaves '*out_valid' set to true on that path. It looks like
+  // a failure sentinel rather than an intended rule -- confirm before relying
+  // on it.
+  if (seconds_since_epoch == 0) {
+    return_error(execution_context, data);
+    return 0;
+  }
+  return seconds_since_epoch * 1000;
+}
+
+// Records a parse error for 'data' on the execution context, unless the holder
+// was created with suppress_errors == 1, in which case nothing is recorded.
+void ToDateHolder::return_error(int64_t execution_context, const std::string &data) {
+  if (suppress_errors_ == 1) {
+    return;
+  }
+  ExecutionContext *execution_context_ptr =
+      reinterpret_cast<ExecutionContext *>(execution_context);
+  std::string err_msg = "Error parsing value " + data + " for given format.";
+  (execution_context_ptr)->set_error_msg(err_msg.c_str());
+}
+#ifdef GDV_HELPERS
+}
+#endif
+
+}  // namespace gandiva
diff --git a/cpp/src/gandiva/to_date_holder.h b/cpp/src/gandiva/to_date_holder.h
new file mode 100644
index 0000000..2e68a80
--- /dev/null
+++ b/cpp/src/gandiva/to_date_holder.h @@
-0,0 +1,59 @@
+// Copyright (C) 2017-2018 Dremio Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef TO_DATE_HOLDER_H
+#define TO_DATE_HOLDER_H
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "gandiva/function_holder.h"
+#include "gandiva/node.h"
+#include "gandiva/status.h"
+
+namespace gandiva {
+
+#ifdef GDV_HELPERS
+namespace helpers {
+#endif
+
+/// Function Holder for SQL 'to_date'
+class ToDateHolder : public FunctionHolder {
+ public:
+  ~ToDateHolder() override = default;
+
+  /// Build a holder from a 'to_date' function node.
+  ///
+  /// The node must have three children: the input, a string (or binary)
+  /// literal with the date pattern, and an int32 literal with the
+  /// suppress-errors flag. Returns Status::Invalid otherwise.
+  static Status Make(const FunctionNode &node, std::shared_ptr<ToDateHolder> *holder);
+
+  /// Build a holder from a SQL date pattern and a suppress-errors flag.
+  ///
+  /// The pattern is converted to the internal format first; a failed
+  /// conversion is returned as the status.
+  static Status Make(const std::string &sql_pattern, int32_t suppress_errors,
+                     std::shared_ptr<ToDateHolder> *holder);
+
+  /// Parse 'data' with the holder's pattern and return the date part as
+  /// milliseconds since the epoch.
+  ///
+  /// \param data the text to parse
+  /// \param in_valid false when the input value is null; 0 is returned
+  /// \param execution_context address of an ExecutionContext, passed as an
+  ///        integer, on which parse errors are recorded
+  /// \param out_valid set to false for null input or when parsing fails
+  /// \return milliseconds since the epoch, or 0 on failure
+  int64_t operator()(const std::string &data, bool in_valid, int64_t execution_context,
+                     bool *out_valid);
+
+ private:
+  ToDateHolder(const std::string &pattern, int32_t suppress_errors)
+      : pattern_(pattern), suppress_errors_(suppress_errors) {}
+
+  /// Record a parse error for 'data' on the execution context, unless
+  /// suppress_errors_ is 1.
+  void return_error(int64_t execution_context, const std::string &data);
+
+  std::string pattern_;  // date format string
+
+  int32_t suppress_errors_;  // when 1, parse errors are not reported at runtime
+};
+
+#ifdef GDV_HELPERS
+}
+#endif
+}  // namespace gandiva
+#endif  // TO_DATE_HOLDER_H
diff --git a/cpp/src/gandiva/to_date_holder_test.cc b/cpp/src/gandiva/to_date_holder_test.cc
new file mode 100644
index 0000000..77761b4
--- /dev/null
+++ b/cpp/src/gandiva/to_date_holder_test.cc
@@ -0,0 +1,130 @@
+// Copyright (C) 2017-2018 Dremio Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "precompiled/epoch_time_point.h"
+#include "gandiva/execution_context.h"
+#include "gandiva/to_date_holder.h"
+
+#include <gtest/gtest.h>
+
+namespace gandiva {
+
+class TestToDateHolder : public ::testing::Test {
+ public:
+  // Builds a 'to_date' function node with the given pattern literal and a
+  // suppress-errors literal of 0.
+  FunctionNode BuildToDate(std::string pattern) {
+    auto field = std::make_shared<FieldNode>(arrow::field("in", arrow::utf8()));
+    auto pattern_node =
+        std::make_shared<LiteralNode>(arrow::utf8(), LiteralHolder(pattern), false);
+    auto suppress_error_node =
+        std::make_shared<LiteralNode>(arrow::int32(), LiteralHolder(0), false);
+    return FunctionNode("to_date_utf8_utf8_int32",
+                        {field, pattern_node, suppress_error_node}, arrow::int64());
+  }
+
+  // The holder takes the execution context as an integer-encoded address.
+  static int64_t ContextHandle(ExecutionContext *context) {
+    return reinterpret_cast<int64_t>(context);
+  }
+};
+
+TEST_F(TestToDateHolder, TestSimpleDateTime) {
+  std::shared_ptr<ToDateHolder> to_date_holder;
+
+  auto status = ToDateHolder::Make("YYYY-MM-DD HH:MI:SS", 1, &to_date_holder);
+  // Fatal on failure: the holder is dereferenced right below.
+  ASSERT_TRUE(status.ok()) << status.message();
+  ExecutionContext execution_context;
+  const int64_t context = ContextHandle(&execution_context);
+  auto &to_date = *to_date_holder;
+  bool out_valid = false;
+  int64_t millis_since_epoch = to_date("1986-12-01 01:01:01", true, context, &out_valid);
+  EXPECT_EQ(millis_since_epoch, 533779200000);
+
+  millis_since_epoch = to_date("1986-12-01 01:01:01.11", true, context, &out_valid);
+  EXPECT_EQ(millis_since_epoch, 533779200000);
+
+  millis_since_epoch = to_date("1986-12-01 01:01:01 +0800", true, context, &out_valid);
+  EXPECT_EQ(millis_since_epoch, 533779200000);
+
+  millis_since_epoch = to_date("1986-12-11 01:30:00", true, context, &out_valid);
+  EXPECT_EQ(millis_since_epoch, 534643200000);
+}
+
+TEST_F(TestToDateHolder, TestSimpleDate) {
+  std::shared_ptr<ToDateHolder> to_date_holder;
+
+  auto status = ToDateHolder::Make("YYYY-MM-DD", 1, &to_date_holder);
+  // Fatal on failure: the holder is dereferenced right below.
+  ASSERT_TRUE(status.ok()) << status.message();
+  ExecutionContext execution_context;
+  const int64_t context = ContextHandle(&execution_context);
+  auto &to_date = *to_date_holder;
+  bool out_valid = false;
+  int64_t millis_since_epoch = to_date("1986-12-01", true, context, &out_valid);
+  EXPECT_EQ(millis_since_epoch, 533779200000);
+
+  millis_since_epoch = to_date("1986-12-1", true, context, &out_valid);
+  EXPECT_EQ(millis_since_epoch, 533779200000);
+
+  millis_since_epoch = to_date("1886-12-1", true, context, &out_valid);
+  EXPECT_EQ(millis_since_epoch, -2621894400000);
+
+  millis_since_epoch = to_date("2012-12-1", true, context, &out_valid);
+  EXPECT_EQ(millis_since_epoch, 1354320000000);
+
+  // wrong month. should return 0 since we are suppressing errors.
+  millis_since_epoch = to_date("1986-21-01 01:01:01 +0800", true, context, &out_valid);
+  EXPECT_EQ(millis_since_epoch, 0);
+}
+
+TEST_F(TestToDateHolder, TestSimpleDateTimeError) {
+  std::shared_ptr<ToDateHolder> to_date_holder;
+
+  auto status = ToDateHolder::Make("YYYY-MM-DD HH:MI:SS", 0, &to_date_holder);
+  // Fatal on failure: the holder is dereferenced right below.
+  ASSERT_TRUE(status.ok()) << status.message();
+  ExecutionContext execution_context;
+  auto &to_date = *to_date_holder;
+  bool out_valid = false;
+
+  // Errors are not suppressed here, so the failure must be recorded on the
+  // context and 0 returned.
+  int64_t millis_since_epoch = to_date("1986-21-01 01:01:01 +0800", true,
+                                       ContextHandle(&execution_context), &out_valid);
+  EXPECT_EQ(millis_since_epoch, 0);
+  std::string expected_error =
+      "Error parsing value 1986-21-01 01:01:01 +0800 for given format";
+  EXPECT_TRUE(execution_context.get_error().find(expected_error) != std::string::npos)
+      << execution_context.get_error();
+
+  ExecutionContext execution_context1;
+  // not valid should not return error
+  millis_since_epoch =
+      to_date("nullptr", false, ContextHandle(&execution_context1), &out_valid);
+  EXPECT_EQ(millis_since_epoch, 0);
+  EXPECT_TRUE(execution_context1.has_error() == false);
+}
+
+TEST_F(TestToDateHolder, TestSimpleDateTimeMakeError) {
+  std::shared_ptr<ToDateHolder> to_date_holder;
+  // a pattern with a "tzo" component is expected to be rejected for now.
+  auto status = ToDateHolder::Make("YYYY-MM-DD HH:MI:SS tzo", 0, &to_date_holder);
+  EXPECT_EQ(status.IsInvalid(), true) << status.message();
+}
+
+// NOTE(review): this main() is declared inside namespace gandiva, so it is
+// gandiva::main and not the program entry point -- confirm whether the test
+// binary is expected to get its entry point from elsewhere.
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+}  // namespace gandiva
