This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
commit 6fa73b2aef6a82f2ae98676d0e15886370786b62 Author: Ashin Gau <[email protected]> AuthorDate: Fri Sep 1 14:40:20 2023 +0800 [fix](date) return right date value even if out of the range of date dictionary(#23664) PR(https://github.com/apache/doris/pull/22360) and PR(https://github.com/apache/doris/pull/22384) optimized the performance of the date type. However, Hive supports dates outside 1970~2038, leading to wrong date values in the TPC-DS benchmark. How to fix: 1. Increase the dictionary range to 1900 ~ 2038. 2. Dates outside 1900 ~ 2038 are computed on the fly instead of being looked up in the dictionary. --- be/src/service/doris_main.cpp | 1 - be/src/vec/exec/format/orc/vorc_reader.h | 8 ++--- .../format/parquet/fix_length_dict_decoder.hpp | 7 ++-- .../format/parquet/fix_length_plain_decoder.cpp | 9 ++--- be/src/vec/runtime/vdatetime_value.cpp | 39 ++++++++++++++++++---- be/src/vec/runtime/vdatetime_value.h | 23 +++++++++++-- be/test/vec/exec/parquet/parquet_thrift_test.cpp | 1 - .../external_table_p2/hive/test_complex_types.out | 3 ++ .../hive/test_complex_types.groovy | 2 ++ 9 files changed, 68 insertions(+), 25 deletions(-) diff --git a/be/src/service/doris_main.cpp b/be/src/service/doris_main.cpp index f1c98ebd4b..f7fb99caa8 100644 --- a/be/src/service/doris_main.cpp +++ b/be/src/service/doris_main.cpp @@ -451,7 +451,6 @@ int main(int argc, char** argv) { auto exec_env = doris::ExecEnv::GetInstance(); doris::ExecEnv::init(exec_env, paths); doris::TabletSchemaCache::create_global_schema_cache(); - doris::vectorized::init_date_day_offset_dict(); // init s3 write buffer pool doris::io::S3FileBufferPool* s3_buffer_pool = doris::io::S3FileBufferPool::GetInstance(); diff --git a/be/src/vec/exec/format/orc/vorc_reader.h b/be/src/vec/exec/format/orc/vorc_reader.h index 4f11eb4de1..05fe7125cd 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.h +++ b/be/src/vec/exec/format/orc/vorc_reader.h @@ -406,7 +406,7 @@ private: if (data == nullptr) { return Status::InternalError("Wrong data type for colum '{}'", col_name); } - auto* __restrict 
date_day_offset_dict = get_date_day_offset_dict(); + date_day_offset_dict& date_dict = date_day_offset_dict::get(); auto& column_data = static_cast<ColumnVector<DorisColumnType>&>(*data_column).get_data(); auto origin_size = column_data.size(); column_data.resize(origin_size + num_values); @@ -423,14 +423,12 @@ private: } } int64_t date_value = data->data[i] + _offset_days; - DCHECK_LT(date_value, 25500); - DCHECK_GE(date_value, 0); if constexpr (std::is_same_v<CppType, VecDateTimeValue>) { - v.create_from_date_v2(date_day_offset_dict[date_value], TIME_DATE); + v.create_from_date_v2(date_dict[date_value], TIME_DATE); // we should cast to date if using date v1. v.cast_to_date(); } else { - v = date_day_offset_dict[date_value]; + v = date_dict[date_value]; } } else { // timestamp if constexpr (is_filter) { diff --git a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp index c368868fd8..1047414899 100644 --- a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp +++ b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp @@ -216,7 +216,7 @@ protected: size_t data_index = column_data.size(); column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); size_t dict_index = 0; - auto* __restrict date_day_offset_dict = get_date_day_offset_dict(); + date_day_offset_dict& date_dict = date_day_offset_dict::get(); ColumnSelectVector::DataReadType read_type; while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) { switch (read_type) { @@ -224,15 +224,14 @@ protected: for (size_t i = 0; i < run_length; ++i) { int64_t date_value = _dict_items[_indexes[dict_index++]] + _decode_params->offset_days; - DCHECK_LT(date_value, 25500); if constexpr (std::is_same_v<CppType, VecDateTimeValue>) { auto& v = reinterpret_cast<CppType&>(column_data[data_index++]); - v.create_from_date_v2(date_day_offset_dict[date_value], TIME_DATE); + 
v.create_from_date_v2(date_dict[date_value], TIME_DATE); // we should cast to date if using date v1. v.cast_to_date(); } else { reinterpret_cast<CppType&>(column_data[data_index++]) = - date_day_offset_dict[date_value]; + date_dict[date_value]; } } break; diff --git a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp index f4e24ca4ab..e94948ad40 100644 --- a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp +++ b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp @@ -262,7 +262,7 @@ Status FixLengthPlainDecoder::_decode_date(MutableColumnPtr& doris_column, size_t data_index = column_data.size(); column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); ColumnSelectVector::DataReadType read_type; - auto* __restrict date_day_offset_dict = get_date_day_offset_dict(); + date_day_offset_dict& date_dict = date_day_offset_dict::get(); while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) { switch (read_type) { @@ -271,16 +271,13 @@ Status FixLengthPlainDecoder::_decode_date(MutableColumnPtr& doris_column, char* buf_start = _data->data + _offset; int64_t date_value = static_cast<int64_t>(*reinterpret_cast<int32_t*>(buf_start)) + _decode_params->offset_days; - DCHECK_LT(date_value, 25500); - DCHECK_GE(date_value, 0); if constexpr (std::is_same_v<CppType, VecDateTimeValue>) { auto& v = reinterpret_cast<CppType&>(column_data[data_index++]); - v.create_from_date_v2(date_day_offset_dict[date_value], TIME_DATE); + v.create_from_date_v2(date_dict[date_value], TIME_DATE); // we should cast to date if using date v1. 
v.cast_to_date(); } else { - reinterpret_cast<CppType&>(column_data[data_index++]) = - date_day_offset_dict[date_value]; + reinterpret_cast<CppType&>(column_data[data_index++]) = date_dict[date_value]; } _offset += _type_length; } diff --git a/be/src/vec/runtime/vdatetime_value.cpp b/be/src/vec/runtime/vdatetime_value.cpp index 86d685a0b9..32a93424e7 100644 --- a/be/src/vec/runtime/vdatetime_value.cpp +++ b/be/src/vec/runtime/vdatetime_value.cpp @@ -1875,6 +1875,12 @@ void VecDateTimeValue::create_from_date_v2(DateV2Value<T>& value, TimeType type) this->_neg = 0; } +template <typename T> +void VecDateTimeValue::create_from_date_v2(DateV2Value<T>&& value, TimeType type) { + DateV2Value<T> v = value; + create_from_date_v2(v, type); +} + std::ostream& operator<<(std::ostream& os, const VecDateTimeValue& value) { char buf[64]; value.to_string(buf); @@ -2668,19 +2674,36 @@ typename DateV2Value<T>::underlying_value DateV2Value<T>::to_date_int_val() cons return int_val_; } -static std::array<DateV2Value<DateV2ValueType>, 25500> DATE_DAY_OFFSET_DICT; +static std::array<DateV2Value<DateV2ValueType>, date_day_offset_dict::DICT_DAYS> + DATE_DAY_OFFSET_ITEMS; +date_day_offset_dict date_day_offset_dict::instance = date_day_offset_dict(); -void init_date_day_offset_dict() { +date_day_offset_dict& date_day_offset_dict::get() { + return instance; +} + +date_day_offset_dict::date_day_offset_dict() { DateV2Value<DateV2ValueType> d; d.set_time(1969, 12, 31, 0, 0, 0, 0); - for (int i = 0; i < DATE_DAY_OFFSET_DICT.size(); ++i) { - DATE_DAY_OFFSET_DICT[i] = d; + for (int i = 0; i < DAY_AFTER_EPOCH; ++i) { + DATE_DAY_OFFSET_ITEMS[DAY_BEFORE_EPOCH + i] = d; d += 1; } + d.set_time(1969, 12, 31, 0, 0, 0, 0); + for (int i = 0; i <= DAY_BEFORE_EPOCH; ++i) { + DATE_DAY_OFFSET_ITEMS[DAY_BEFORE_EPOCH - i] = d; + d -= 1; + } } -DateV2Value<DateV2ValueType>* get_date_day_offset_dict() { - return DATE_DAY_OFFSET_DICT.data(); +DateV2Value<DateV2ValueType> date_day_offset_dict::operator[](int day) { 
+ int index = day + DAY_BEFORE_EPOCH; + if (LIKELY(index >= 0 && index < DICT_DAYS)) { + return DATE_DAY_OFFSET_ITEMS[index]; + } else { + DateV2Value<DateV2ValueType> d = DATE_DAY_OFFSET_ITEMS[0]; + return d += index; + } } template <typename T> @@ -3634,8 +3657,12 @@ template std::size_t operator-(const DateV2Value<DateTimeV2ValueType>& v1, template void VecDateTimeValue::create_from_date_v2<DateV2ValueType>( DateV2Value<DateV2ValueType>& value, TimeType type); +template void VecDateTimeValue::create_from_date_v2<DateV2ValueType>( + DateV2Value<DateV2ValueType>&& value, TimeType type); template void VecDateTimeValue::create_from_date_v2<DateTimeV2ValueType>( DateV2Value<DateTimeV2ValueType>& value, TimeType type); +template void VecDateTimeValue::create_from_date_v2<DateTimeV2ValueType>( + DateV2Value<DateTimeV2ValueType>&& value, TimeType type); template int64_t VecDateTimeValue::second_diff<DateV2Value<DateV2ValueType>>( const DateV2Value<DateV2ValueType>& rhs) const; diff --git a/be/src/vec/runtime/vdatetime_value.h b/be/src/vec/runtime/vdatetime_value.h index 68b1b1ad58..aa2b23d942 100644 --- a/be/src/vec/runtime/vdatetime_value.h +++ b/be/src/vec/runtime/vdatetime_value.h @@ -270,6 +270,9 @@ public: template <typename T> void create_from_date_v2(DateV2Value<T>& value, TimeType type); + template <typename T> + void create_from_date_v2(DateV2Value<T>&& value, TimeType type); + void set_time(uint32_t year, uint32_t month, uint32_t day, uint32_t hour, uint32_t minute, uint32_t second); @@ -1496,8 +1499,24 @@ class DataTypeDateTime; class DataTypeDateV2; class DataTypeDateTimeV2; -[[maybe_unused]] void init_date_day_offset_dict(); -[[maybe_unused]] DateV2Value<DateV2ValueType>* get_date_day_offset_dict(); +class date_day_offset_dict { +private: + static date_day_offset_dict instance; + + date_day_offset_dict(); + ~date_day_offset_dict() = default; + date_day_offset_dict(const date_day_offset_dict&) = default; + date_day_offset_dict& operator=(const 
date_day_offset_dict&) = default; + +public: + static constexpr int DAY_BEFORE_EPOCH = 25566; // 1900-01-01 + static constexpr int DAY_AFTER_EPOCH = 25500; // 2039-10-24 + static constexpr int DICT_DAYS = DAY_BEFORE_EPOCH + DAY_AFTER_EPOCH; + + static date_day_offset_dict& get(); + + DateV2Value<DateV2ValueType> operator[](int day); +}; template <typename T> struct DateTraits {}; diff --git a/be/test/vec/exec/parquet/parquet_thrift_test.cpp b/be/test/vec/exec/parquet/parquet_thrift_test.cpp index 08879e604a..06201f6378 100644 --- a/be/test/vec/exec/parquet/parquet_thrift_test.cpp +++ b/be/test/vec/exec/parquet/parquet_thrift_test.cpp @@ -437,7 +437,6 @@ static void read_parquet_data_and_check(const std::string& parquet_file, } TEST_F(ParquetThriftReaderTest, type_decoder) { - init_date_day_offset_dict(); read_parquet_data_and_check("./be/test/exec/test_data/parquet_scanner/type-decoder.parquet", "./be/test/exec/test_data/parquet_scanner/type-decoder.txt", 10); } diff --git a/regression-test/data/external_table_p2/hive/test_complex_types.out b/regression-test/data/external_table_p2/hive/test_complex_types.out index c414a60a99..1bfb858fae 100644 --- a/regression-test/data/external_table_p2/hive/test_complex_types.out +++ b/regression-test/data/external_table_p2/hive/test_complex_types.out @@ -32,3 +32,6 @@ -- !map_with_nullable_key -- \N \N \N \N \N \N \N \N \N test test aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa [...] 
+-- !date_dict -- +2036-12-28 1898-12-28 2539-12-28 + diff --git a/regression-test/suites/external_table_p2/hive/test_complex_types.groovy b/regression-test/suites/external_table_p2/hive/test_complex_types.groovy index 5422e1d9a4..c1f540e6ca 100644 --- a/regression-test/suites/external_table_p2/hive/test_complex_types.groovy +++ b/regression-test/suites/external_table_p2/hive/test_complex_types.groovy @@ -55,6 +55,8 @@ suite("test_complex_types", "p2") { qt_map_with_nullable_key """select * from parquet_all_types limit 1""" + qt_date_dict """select max(date1), max(date2), max(date3) from date_dict""" + sql """drop catalog ${catalog_name};""" } } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
