This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 6fa73b2aef6a82f2ae98676d0e15886370786b62
Author: Ashin Gau <[email protected]>
AuthorDate: Fri Sep 1 14:40:20 2023 +0800

    [fix](date) return right date value even if out of the range of date 
dictionary(#23664)
    
    PR(https://github.com/apache/doris/pull/22360) and 
PR(https://github.com/apache/doris/pull/22384) optimized the performance of 
the date type. However, Hive supports dates outside 1970~2038, leading to wrong 
date values in the TPC-DS benchmark.
    How to fix:
    1. Increase the dictionary range to 1900 ~ 2038.
    2. Dates outside 1900 ~ 2038 are computed on the fly from the first 
dictionary entry instead of being looked up.
---
 be/src/service/doris_main.cpp                      |  1 -
 be/src/vec/exec/format/orc/vorc_reader.h           |  8 ++---
 .../format/parquet/fix_length_dict_decoder.hpp     |  7 ++--
 .../format/parquet/fix_length_plain_decoder.cpp    |  9 ++---
 be/src/vec/runtime/vdatetime_value.cpp             | 39 ++++++++++++++++++----
 be/src/vec/runtime/vdatetime_value.h               | 23 +++++++++++--
 be/test/vec/exec/parquet/parquet_thrift_test.cpp   |  1 -
 .../external_table_p2/hive/test_complex_types.out  |  3 ++
 .../hive/test_complex_types.groovy                 |  2 ++
 9 files changed, 68 insertions(+), 25 deletions(-)

diff --git a/be/src/service/doris_main.cpp b/be/src/service/doris_main.cpp
index f1c98ebd4b..f7fb99caa8 100644
--- a/be/src/service/doris_main.cpp
+++ b/be/src/service/doris_main.cpp
@@ -451,7 +451,6 @@ int main(int argc, char** argv) {
     auto exec_env = doris::ExecEnv::GetInstance();
     doris::ExecEnv::init(exec_env, paths);
     doris::TabletSchemaCache::create_global_schema_cache();
-    doris::vectorized::init_date_day_offset_dict();
 
     // init s3 write buffer pool
     doris::io::S3FileBufferPool* s3_buffer_pool = 
doris::io::S3FileBufferPool::GetInstance();
diff --git a/be/src/vec/exec/format/orc/vorc_reader.h 
b/be/src/vec/exec/format/orc/vorc_reader.h
index 4f11eb4de1..05fe7125cd 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.h
+++ b/be/src/vec/exec/format/orc/vorc_reader.h
@@ -406,7 +406,7 @@ private:
         if (data == nullptr) {
             return Status::InternalError("Wrong data type for colum '{}'", 
col_name);
         }
-        auto* __restrict date_day_offset_dict = get_date_day_offset_dict();
+        date_day_offset_dict& date_dict = date_day_offset_dict::get();
         auto& column_data = 
static_cast<ColumnVector<DorisColumnType>&>(*data_column).get_data();
         auto origin_size = column_data.size();
         column_data.resize(origin_size + num_values);
@@ -423,14 +423,12 @@ private:
                     }
                 }
                 int64_t date_value = data->data[i] + _offset_days;
-                DCHECK_LT(date_value, 25500);
-                DCHECK_GE(date_value, 0);
                 if constexpr (std::is_same_v<CppType, VecDateTimeValue>) {
-                    v.create_from_date_v2(date_day_offset_dict[date_value], 
TIME_DATE);
+                    v.create_from_date_v2(date_dict[date_value], TIME_DATE);
                     // we should cast to date if using date v1.
                     v.cast_to_date();
                 } else {
-                    v = date_day_offset_dict[date_value];
+                    v = date_dict[date_value];
                 }
             } else { // timestamp
                 if constexpr (is_filter) {
diff --git a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp 
b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp
index c368868fd8..1047414899 100644
--- a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp
+++ b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp
@@ -216,7 +216,7 @@ protected:
         size_t data_index = column_data.size();
         column_data.resize(data_index + select_vector.num_values() - 
select_vector.num_filtered());
         size_t dict_index = 0;
-        auto* __restrict date_day_offset_dict = get_date_day_offset_dict();
+        date_day_offset_dict& date_dict = date_day_offset_dict::get();
         ColumnSelectVector::DataReadType read_type;
         while (size_t run_length = 
select_vector.get_next_run<has_filter>(&read_type)) {
             switch (read_type) {
@@ -224,15 +224,14 @@ protected:
                 for (size_t i = 0; i < run_length; ++i) {
                     int64_t date_value =
                             _dict_items[_indexes[dict_index++]] + 
_decode_params->offset_days;
-                    DCHECK_LT(date_value, 25500);
                     if constexpr (std::is_same_v<CppType, VecDateTimeValue>) {
                         auto& v = 
reinterpret_cast<CppType&>(column_data[data_index++]);
-                        
v.create_from_date_v2(date_day_offset_dict[date_value], TIME_DATE);
+                        v.create_from_date_v2(date_dict[date_value], 
TIME_DATE);
                         // we should cast to date if using date v1.
                         v.cast_to_date();
                     } else {
                         reinterpret_cast<CppType&>(column_data[data_index++]) =
-                                date_day_offset_dict[date_value];
+                                date_dict[date_value];
                     }
                 }
                 break;
diff --git a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp 
b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp
index f4e24ca4ab..e94948ad40 100644
--- a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp
+++ b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp
@@ -262,7 +262,7 @@ Status 
FixLengthPlainDecoder::_decode_date(MutableColumnPtr& doris_column,
     size_t data_index = column_data.size();
     column_data.resize(data_index + select_vector.num_values() - 
select_vector.num_filtered());
     ColumnSelectVector::DataReadType read_type;
-    auto* __restrict date_day_offset_dict = get_date_day_offset_dict();
+    date_day_offset_dict& date_dict = date_day_offset_dict::get();
 
     while (size_t run_length = 
select_vector.get_next_run<has_filter>(&read_type)) {
         switch (read_type) {
@@ -271,16 +271,13 @@ Status 
FixLengthPlainDecoder::_decode_date(MutableColumnPtr& doris_column,
                 char* buf_start = _data->data + _offset;
                 int64_t date_value = 
static_cast<int64_t>(*reinterpret_cast<int32_t*>(buf_start)) +
                                      _decode_params->offset_days;
-                DCHECK_LT(date_value, 25500);
-                DCHECK_GE(date_value, 0);
                 if constexpr (std::is_same_v<CppType, VecDateTimeValue>) {
                     auto& v = 
reinterpret_cast<CppType&>(column_data[data_index++]);
-                    v.create_from_date_v2(date_day_offset_dict[date_value], 
TIME_DATE);
+                    v.create_from_date_v2(date_dict[date_value], TIME_DATE);
                     // we should cast to date if using date v1.
                     v.cast_to_date();
                 } else {
-                    reinterpret_cast<CppType&>(column_data[data_index++]) =
-                            date_day_offset_dict[date_value];
+                    reinterpret_cast<CppType&>(column_data[data_index++]) = 
date_dict[date_value];
                 }
                 _offset += _type_length;
             }
diff --git a/be/src/vec/runtime/vdatetime_value.cpp 
b/be/src/vec/runtime/vdatetime_value.cpp
index 86d685a0b9..32a93424e7 100644
--- a/be/src/vec/runtime/vdatetime_value.cpp
+++ b/be/src/vec/runtime/vdatetime_value.cpp
@@ -1875,6 +1875,12 @@ void 
VecDateTimeValue::create_from_date_v2(DateV2Value<T>& value, TimeType type)
     this->_neg = 0;
 }
 
+template <typename T>
+void VecDateTimeValue::create_from_date_v2(DateV2Value<T>&& value, TimeType 
type) {
+    DateV2Value<T> v = value;
+    create_from_date_v2(v, type);
+}
+
 std::ostream& operator<<(std::ostream& os, const VecDateTimeValue& value) {
     char buf[64];
     value.to_string(buf);
@@ -2668,19 +2674,36 @@ typename DateV2Value<T>::underlying_value 
DateV2Value<T>::to_date_int_val() cons
     return int_val_;
 }
 
-static std::array<DateV2Value<DateV2ValueType>, 25500> DATE_DAY_OFFSET_DICT;
+static std::array<DateV2Value<DateV2ValueType>, 
date_day_offset_dict::DICT_DAYS>
+        DATE_DAY_OFFSET_ITEMS;
+date_day_offset_dict date_day_offset_dict::instance = date_day_offset_dict();
 
-void init_date_day_offset_dict() {
+date_day_offset_dict& date_day_offset_dict::get() {
+    return instance;
+}
+
+date_day_offset_dict::date_day_offset_dict() {
     DateV2Value<DateV2ValueType> d;
     d.set_time(1969, 12, 31, 0, 0, 0, 0);
-    for (int i = 0; i < DATE_DAY_OFFSET_DICT.size(); ++i) {
-        DATE_DAY_OFFSET_DICT[i] = d;
+    for (int i = 0; i < DAY_AFTER_EPOCH; ++i) {
+        DATE_DAY_OFFSET_ITEMS[DAY_BEFORE_EPOCH + i] = d;
         d += 1;
     }
+    d.set_time(1969, 12, 31, 0, 0, 0, 0);
+    for (int i = 0; i <= DAY_BEFORE_EPOCH; ++i) {
+        DATE_DAY_OFFSET_ITEMS[DAY_BEFORE_EPOCH - i] = d;
+        d -= 1;
+    }
 }
 
-DateV2Value<DateV2ValueType>* get_date_day_offset_dict() {
-    return DATE_DAY_OFFSET_DICT.data();
+DateV2Value<DateV2ValueType> date_day_offset_dict::operator[](int day) {
+    int index = day + DAY_BEFORE_EPOCH;
+    if (LIKELY(index >= 0 && index < DICT_DAYS)) {
+        return DATE_DAY_OFFSET_ITEMS[index];
+    } else {
+        DateV2Value<DateV2ValueType> d = DATE_DAY_OFFSET_ITEMS[0];
+        return d += index;
+    }
 }
 
 template <typename T>
@@ -3634,8 +3657,12 @@ template std::size_t operator-(const 
DateV2Value<DateTimeV2ValueType>& v1,
 
 template void VecDateTimeValue::create_from_date_v2<DateV2ValueType>(
         DateV2Value<DateV2ValueType>& value, TimeType type);
+template void VecDateTimeValue::create_from_date_v2<DateV2ValueType>(
+        DateV2Value<DateV2ValueType>&& value, TimeType type);
 template void VecDateTimeValue::create_from_date_v2<DateTimeV2ValueType>(
         DateV2Value<DateTimeV2ValueType>& value, TimeType type);
+template void VecDateTimeValue::create_from_date_v2<DateTimeV2ValueType>(
+        DateV2Value<DateTimeV2ValueType>&& value, TimeType type);
 
 template int64_t VecDateTimeValue::second_diff<DateV2Value<DateV2ValueType>>(
         const DateV2Value<DateV2ValueType>& rhs) const;
diff --git a/be/src/vec/runtime/vdatetime_value.h 
b/be/src/vec/runtime/vdatetime_value.h
index 68b1b1ad58..aa2b23d942 100644
--- a/be/src/vec/runtime/vdatetime_value.h
+++ b/be/src/vec/runtime/vdatetime_value.h
@@ -270,6 +270,9 @@ public:
     template <typename T>
     void create_from_date_v2(DateV2Value<T>& value, TimeType type);
 
+    template <typename T>
+    void create_from_date_v2(DateV2Value<T>&& value, TimeType type);
+
     void set_time(uint32_t year, uint32_t month, uint32_t day, uint32_t hour, 
uint32_t minute,
                   uint32_t second);
 
@@ -1496,8 +1499,24 @@ class DataTypeDateTime;
 class DataTypeDateV2;
 class DataTypeDateTimeV2;
 
-[[maybe_unused]] void init_date_day_offset_dict();
-[[maybe_unused]] DateV2Value<DateV2ValueType>* get_date_day_offset_dict();
+class date_day_offset_dict {
+private:
+    static date_day_offset_dict instance;
+
+    date_day_offset_dict();
+    ~date_day_offset_dict() = default;
+    date_day_offset_dict(const date_day_offset_dict&) = default;
+    date_day_offset_dict& operator=(const date_day_offset_dict&) = default;
+
+public:
+    static constexpr int DAY_BEFORE_EPOCH = 25566; // 1900-01-01
+    static constexpr int DAY_AFTER_EPOCH = 25500;  // 2039-10-24
+    static constexpr int DICT_DAYS = DAY_BEFORE_EPOCH + DAY_AFTER_EPOCH;
+
+    static date_day_offset_dict& get();
+
+    DateV2Value<DateV2ValueType> operator[](int day);
+};
 
 template <typename T>
 struct DateTraits {};
diff --git a/be/test/vec/exec/parquet/parquet_thrift_test.cpp 
b/be/test/vec/exec/parquet/parquet_thrift_test.cpp
index 08879e604a..06201f6378 100644
--- a/be/test/vec/exec/parquet/parquet_thrift_test.cpp
+++ b/be/test/vec/exec/parquet/parquet_thrift_test.cpp
@@ -437,7 +437,6 @@ static void read_parquet_data_and_check(const std::string& 
parquet_file,
 }
 
 TEST_F(ParquetThriftReaderTest, type_decoder) {
-    init_date_day_offset_dict();
     
read_parquet_data_and_check("./be/test/exec/test_data/parquet_scanner/type-decoder.parquet",
                                 
"./be/test/exec/test_data/parquet_scanner/type-decoder.txt", 10);
 }
diff --git a/regression-test/data/external_table_p2/hive/test_complex_types.out 
b/regression-test/data/external_table_p2/hive/test_complex_types.out
index c414a60a99..1bfb858fae 100644
--- a/regression-test/data/external_table_p2/hive/test_complex_types.out
+++ b/regression-test/data/external_table_p2/hive/test_complex_types.out
@@ -32,3 +32,6 @@
 -- !map_with_nullable_key --
 \N     \N      \N      \N      \N      \N      \N      \N      \N              
test            test    
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
 [...]
 
+-- !date_dict --
+2036-12-28     1898-12-28      2539-12-28
+
diff --git 
a/regression-test/suites/external_table_p2/hive/test_complex_types.groovy 
b/regression-test/suites/external_table_p2/hive/test_complex_types.groovy
index 5422e1d9a4..c1f540e6ca 100644
--- a/regression-test/suites/external_table_p2/hive/test_complex_types.groovy
+++ b/regression-test/suites/external_table_p2/hive/test_complex_types.groovy
@@ -55,6 +55,8 @@ suite("test_complex_types", "p2") {
 
         qt_map_with_nullable_key """select * from parquet_all_types limit 1"""
 
+        qt_date_dict """select max(date1), max(date2), max(date3) from 
date_dict"""
+
         sql """drop catalog ${catalog_name};"""
     }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to