This is an automated email from the ASF dual-hosted git repository. lihaopeng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new bb5b05b1f44 [opt](parse) optimize parsing string to datetime (#38385) bb5b05b1f44 is described below commit bb5b05b1f447bddf21fe1b4ed8c8b720aaa8291f Author: zclllhhjj <zhaochan...@selectdb.com> AuthorDate: Mon Jul 29 23:52:41 2024 +0800 [opt](parse) optimize parsing string to datetime (#38385) --- be/src/vec/functions/function_cast.h | 81 ++++++++++++++-------------------- be/src/vec/runtime/vdatetime_value.cpp | 29 +++++++----- 2 files changed, 51 insertions(+), 59 deletions(-) diff --git a/be/src/vec/functions/function_cast.h b/be/src/vec/functions/function_cast.h index af2fadc84c2..5f3968e512b 100644 --- a/be/src/vec/functions/function_cast.h +++ b/be/src/vec/functions/function_cast.h @@ -978,9 +978,9 @@ struct NameToDateTime { static constexpr auto name = "toDateTime"; }; -template <typename DataType, typename Additions = void*, typename FromDataType = void*> +template <typename DataType, typename FromDataType = void*> bool try_parse_impl(typename DataType::FieldType& x, ReadBuffer& rb, FunctionContext* context, - Additions additions [[maybe_unused]] = Additions()) { + UInt32 scale [[maybe_unused]] = 0) { if constexpr (IsDateTimeType<DataType>) { return try_read_datetime_text(x, rb, context->state()->timezone_obj()); } @@ -994,7 +994,6 @@ bool try_parse_impl(typename DataType::FieldType& x, ReadBuffer& rb, FunctionCon } if constexpr (IsDateTimeV2Type<DataType>) { - UInt32 scale = additions; return try_read_datetime_v2_text(x, rb, context->state()->timezone_obj(), scale); } @@ -1032,7 +1031,6 @@ bool try_parse_impl(typename DataType::FieldType& x, ReadBuffer& rb, FunctionCon template <typename DataType, typename Additions = void*> StringParser::ParseResult try_parse_decimal_impl(typename DataType::FieldType& x, ReadBuffer& rb, - const cctz::time_zone& local_time_zone, Additions additions [[maybe_unused]] = Additions()) { if constexpr (IsDataTypeDecimalV2<DataType>) { @@ -1461,15 +1459,9 @@ private: const char* name; }; -struct NameCast { - static constexpr auto name = "CAST"; -}; - -template <typename FromDataType, typename ToDataType, typename Name> -struct ConvertThroughParsing { - static_assert(std::is_same_v<FromDataType, DataTypeString>, - "ConvertThroughParsing is only applicable for String or FixedString data types"); - +// always from DataTypeString +template <typename ToDataType, typename Name> +struct StringParsing { using ToFieldType = typename ToDataType::FieldType; static bool is_all_read(ReadBuffer& in) { return in.eof(); } @@ -1482,48 +1474,38 @@ struct ConvertThroughParsing { ColumnDecimal<ToFieldType>, ColumnVector<ToFieldType>>; const IColumn* col_from = block.get_by_position(arguments[0]).column.get(); - const ColumnString* col_from_string = check_and_get_column<ColumnString>(col_from); + const auto* col_from_string = check_and_get_column<ColumnString>(col_from); - if (std::is_same_v<FromDataType, DataTypeString> && !col_from_string) { + if (!col_from_string) { return Status::RuntimeError("Illegal column {} of first argument of function {}", col_from->get_name(), Name::name); } - size_t size = input_rows_count; + size_t row = input_rows_count; typename ColVecTo::MutablePtr col_to = nullptr; if constexpr (IsDataTypeDecimal<ToDataType>) { UInt32 scale = ((PrecisionScaleArg)additions).scale; ToDataType::check_type_scale(scale); - col_to = ColVecTo::create(size, scale); + col_to = ColVecTo::create(row, scale); } else { - col_to = ColVecTo::create(size); + col_to = ColVecTo::create(row); } typename ColVecTo::Container& vec_to = col_to->get_data(); ColumnUInt8::MutablePtr col_null_map_to; ColumnUInt8::Container* vec_null_map_to [[maybe_unused]] = nullptr; - col_null_map_to = ColumnUInt8::create(size); + col_null_map_to = ColumnUInt8::create(row); vec_null_map_to = &col_null_map_to->get_data(); - const ColumnString::Chars* chars = nullptr; - const IColumn::Offsets* offsets = nullptr; - size_t fixed_string_size = 0; - - if constexpr (std::is_same_v<FromDataType, DataTypeString>) { - chars = &col_from_string->get_chars(); - offsets = &col_from_string->get_offsets(); - } + const ColumnString::Chars* chars = &col_from_string->get_chars(); + const IColumn::Offsets* offsets = &col_from_string->get_offsets(); size_t current_offset = 0; - for (size_t i = 0; i < size; ++i) { - size_t next_offset = std::is_same_v<FromDataType, DataTypeString> - ? (*offsets)[i] - : (current_offset + fixed_string_size); - size_t string_size = std::is_same_v<FromDataType, DataTypeString> - ? next_offset - current_offset - : fixed_string_size; + for (size_t i = 0; i < row; ++i) { + size_t next_offset = (*offsets)[i]; + size_t string_size = next_offset - current_offset; ReadBuffer read_buffer(&(*chars)[current_offset], string_size); @@ -1531,8 +1513,7 @@ struct ConvertThroughParsing { if constexpr (IsDataTypeDecimal<ToDataType>) { ToDataType::check_type_precision((PrecisionScaleArg(additions).precision)); StringParser::ParseResult res = try_parse_decimal_impl<ToDataType>( - vec_to[i], read_buffer, context->state()->timezone_obj(), - PrecisionScaleArg(additions)); + vec_to[i], read_buffer, PrecisionScaleArg(additions)); parsed = (res == StringParser::PARSE_SUCCESS || res == StringParser::PARSE_OVERFLOW || res == StringParser::PARSE_UNDERFLOW); @@ -1542,8 +1523,8 @@ struct ConvertThroughParsing { parsed = try_parse_impl<ToDataType>(vec_to[i], read_buffer, context, type->get_scale()); } else { - parsed = try_parse_impl<ToDataType, void*, FromDataType>(vec_to[i], read_buffer, - context); + parsed = + try_parse_impl<ToDataType, DataTypeString>(vec_to[i], read_buffer, context); } (*vec_null_map_to)[i] = !parsed || !is_all_read(read_buffer); current_offset = next_offset; @@ -1557,25 +1538,27 @@ struct ConvertThroughParsing { template <typename Name> struct ConvertImpl<DataTypeString, DataTypeDecimal<Decimal32>, Name> - : ConvertThroughParsing<DataTypeString, DataTypeDecimal<Decimal32>, Name> {}; + : StringParsing<DataTypeDecimal<Decimal32>, Name> {}; template <typename Name> struct ConvertImpl<DataTypeString, DataTypeDecimal<Decimal64>, Name> - : ConvertThroughParsing<DataTypeString, DataTypeDecimal<Decimal64>, Name> {}; + : StringParsing<DataTypeDecimal<Decimal64>, Name> {}; template <typename Name> struct ConvertImpl<DataTypeString, DataTypeDecimal<Decimal128V2>, Name> - : ConvertThroughParsing<DataTypeString, DataTypeDecimal<Decimal128V2>, Name> {}; + : StringParsing<DataTypeDecimal<Decimal128V2>, Name> {}; template <typename Name> struct ConvertImpl<DataTypeString, DataTypeDecimal<Decimal128V3>, Name> - : ConvertThroughParsing<DataTypeString, DataTypeDecimal<Decimal128V3>, Name> {}; + : StringParsing<DataTypeDecimal<Decimal128V3>, Name> {}; template <typename Name> struct ConvertImpl<DataTypeString, DataTypeDecimal<Decimal256>, Name> - : ConvertThroughParsing<DataTypeString, DataTypeDecimal<Decimal256>, Name> {}; + : StringParsing<DataTypeDecimal<Decimal256>, Name> {}; template <typename Name> -struct ConvertImpl<DataTypeString, DataTypeIPv4, Name> - : ConvertThroughParsing<DataTypeString, DataTypeIPv4, Name> {}; +struct ConvertImpl<DataTypeString, DataTypeIPv4, Name> : StringParsing<DataTypeIPv4, Name> {}; template <typename Name> -struct ConvertImpl<DataTypeString, DataTypeIPv6, Name> - : ConvertThroughParsing<DataTypeString, DataTypeIPv6, Name> {}; +struct ConvertImpl<DataTypeString, DataTypeIPv6, Name> : StringParsing<DataTypeIPv6, Name> {}; + +struct NameCast { + static constexpr auto name = "CAST"; +}; template <typename ToDataType, typename Name> class FunctionConvertFromString : public IFunction { @@ -1610,8 +1593,8 @@ public: const IDataType* from_type = block.get_by_position(arguments[0]).type.get(); if (check_and_get_data_type<DataTypeString>(from_type)) { - return ConvertThroughParsing<DataTypeString, ToDataType, Name>::execute( - context, block, arguments, result, input_rows_count); + return StringParsing<ToDataType, Name>::execute(context, block, arguments, result, + input_rows_count); } return Status::RuntimeError( diff --git a/be/src/vec/runtime/vdatetime_value.cpp b/be/src/vec/runtime/vdatetime_value.cpp index 610983a149d..877573bcccb 100644 --- a/be/src/vec/runtime/vdatetime_value.cpp +++ b/be/src/vec/runtime/vdatetime_value.cpp @@ -55,6 +55,15 @@ uint8_t mysql_week_mode(uint32_t mode) { return mode; } +static bool check_space(char ch) { + // \t, \n, \v, \f, \r are 9~13, respectively. + return UNLIKELY(ch == ' ' || (ch >= 9 && ch <= 13)); +} + +static bool check_date_punct(char ch) { + return UNLIKELY(!(isdigit(ch) || isalpha(ch))); +} + static bool time_zone_begins(const char* ptr, const char* end) { return *ptr == '+' || (*ptr == '-' && ptr + 3 < end && *(ptr + 3) == ':') || (isalpha(*ptr) && *ptr != 'T'); @@ -104,7 +113,7 @@ bool VecDateTimeValue::from_date_str_base(const char* date_str, int len, _neg = false; // Skip space character - while (ptr < end && isspace(*ptr)) { + while (ptr < end && check_space(*ptr)) { ptr++; } if (ptr == end || !isdigit(*ptr)) { @@ -202,8 +211,8 @@ bool VecDateTimeValue::from_date_str_base(const char* date_str, int len, continue; } // escape separator - while (ptr < end && (ispunct(*ptr) || isspace(*ptr))) { - if (isspace(*ptr)) { + while (ptr < end && (check_date_punct(*ptr) || check_space(*ptr))) { + if (check_space(*ptr)) { if (((1 << field_idx) & allow_space_mask) == 0) { return false; } @@ -1235,7 +1244,7 @@ bool VecDateTimeValue::from_date_format_str(const char* format, int format_len, auto [year, month, day, hour, minute, second] = std::tuple {0, 0, 0, 0, 0, 0}; while (ptr < end && val < val_end) { // Skip space character - while (val < val_end && isspace(*val)) { + while (val < val_end && check_space(*val)) { val++; } if (val >= val_end) { @@ -1500,7 +1509,7 @@ bool VecDateTimeValue::from_date_format_str(const char* format, int format_len, default: return false; } - } else if (!isspace(*ptr)) { + } else if (!check_space(*ptr)) { if (*ptr != *val) { return false; } @@ -1987,13 +1996,13 @@ bool DateV2Value<T>::from_date_str(const char* date_str, int len, int scale /* = bool convert_zero) { return from_date_str_base(date_str, len, scale, nullptr, convert_zero); } -// when we parse template <typename T> bool DateV2Value<T>::from_date_str(const char* date_str, int len, const cctz::time_zone& local_time_zone, int scale /* = -1*/, bool convert_zero) { return from_date_str_base(date_str, len, scale, &local_time_zone, convert_zero); } +// if local_time_zone is null, only be able to parse time without timezone template <typename T> bool DateV2Value<T>::from_date_str_base(const char* date_str, int len, int scale, const cctz::time_zone* local_time_zone, bool convert_zero) { @@ -2005,7 +2014,7 @@ bool DateV2Value<T>::from_date_str_base(const char* date_str, int len, int scale int32_t date_len[MAX_DATE_PARTS] = {0}; // Skip space character - while (ptr < end && isspace(*ptr)) { + while (ptr < end && check_space(*ptr)) { ptr++; } if (ptr == end || !isdigit(*ptr)) { @@ -2153,8 +2162,8 @@ bool DateV2Value<T>::from_date_str_base(const char* date_str, int len, int scale continue; } // escape separator - while (ptr < end && (ispunct(*ptr) || isspace(*ptr))) { - if (isspace(*ptr)) { + while (ptr < end && (check_date_punct(*ptr) || check_space(*ptr))) { + if (check_space(*ptr)) { if (((1 << field_idx) & allow_space_mask) == 0) { return false; } @@ -2286,7 +2295,7 @@ bool DateV2Value<T>::from_date_format_str(const char* format, int format_len, co auto [year, month, day, hour, minute, second, microsecond] = std::tuple {0, 0, 0, 0, 0, 0, 0}; while (ptr < end && val < val_end) { // Skip space character - while (val < val_end && isspace(*val)) { + while (val < val_end && check_space(*val)) { val++; } if (val >= val_end) { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org