This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 3e707e9dc4a [fix](paimon)set timestamp's scale for parquet which has no logical type for 2.0 (#30259)
3e707e9dc4a is described below
commit 3e707e9dc4a79ff8b1b8b2fe41d095bddf67b67c
Author: wuwenchi <[email protected]>
AuthorDate: Wed Jan 24 12:03:57 2024 +0800
[fix](paimon)set timestamp's scale for parquet which has no logical type for 2.0 (#30259)
---
be/src/vec/exec/format/parquet/decoder.cpp | 24 ++++++++++
be/src/vec/exec/format/parquet/decoder.h | 2 +
.../format/parquet/fix_length_dict_decoder.hpp | 2 +
.../format/parquet/fix_length_plain_decoder.cpp | 2 +
.../paimon/paimon_timestamp_types.out | 13 +++++
.../paimon/paimon_timestamp_types.groovy | 55 ++++++++++++++++++++++
6 files changed, 98 insertions(+)
diff --git a/be/src/vec/exec/format/parquet/decoder.cpp b/be/src/vec/exec/format/parquet/decoder.cpp
index 0a158176091..aa16eaf4b6c 100644
--- a/be/src/vec/exec/format/parquet/decoder.cpp
+++ b/be/src/vec/exec/format/parquet/decoder.cpp
@@ -184,4 +184,28 @@ void Decoder::init(FieldSchema* field_schema, cctz::time_zone* ctz) {
        _decode_params->offset_days = t.day() == 31 ? -1 : 0; // If 1969-12-31, then returns -1.
}
}
+
+/**
+ * Some frameworks, like Paimon, may write non-standard parquet files whose timestamp
+ * fields have no logicalType or converted_type to indicate their precision. We have to
+ * reset the time mask.
+ */
+void Decoder::reset_time_scale_if_missing(int scale) {
+    const auto& schema = _field_schema->parquet_schema;
+    if (!schema.__isset.logicalType && !schema.__isset.converted_type) {
+        int ts_scale = 9;
+        if (scale <= 3) {
+            ts_scale = 3;
+        } else if (scale <= 6) {
+            ts_scale = 6;
+        }
+        _decode_params->second_mask = common::exp10_i64(ts_scale);
+        _decode_params->scale_to_nano_factor = common::exp10_i64(9 - ts_scale);
+
+        // The missing parquet metadata makes it impossible for us to know the
+        // time zone information, so we default to UTC here.
+        if (_decode_params->ctz == nullptr) {
+            _decode_params->ctz = const_cast<cctz::time_zone*>(&_decode_params->utc0);
+        }
+    }
+}
} // namespace doris::vectorized
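For context on the masks computed above: the bucketing collapses any declared scale into the three precisions a parquet timestamp can physically carry (milliseconds, microseconds, nanoseconds). Below is a minimal standalone sketch of the same arithmetic, with a local exp10 helper standing in for doris's common::exp10_i64 (assumed here to return 10^n as an int64):

    #include <cstdint>

    // Local stand-in for common::exp10_i64 (assumption: returns 10^n as int64).
    static int64_t exp10_i64(int n) {
        int64_t v = 1;
        while (n-- > 0) v *= 10;
        return v;
    }

    struct TimeMasks {
        int64_t second_mask;          // 10^scale: divides a raw value into whole seconds
        int64_t scale_to_nano_factor; // 10^(9 - scale): lifts the remainder to nanoseconds
    };

    // Same bucketing as reset_time_scale_if_missing: 0..3 -> millis, 4..6 -> micros, else nanos.
    static TimeMasks make_time_masks(int scale) {
        int ts_scale = 9;
        if (scale <= 3) {
            ts_scale = 3;
        } else if (scale <= 6) {
            ts_scale = 6;
        }
        return {exp10_i64(ts_scale), exp10_i64(9 - ts_scale)};
    }

For example, make_time_masks(6) yields {1000000, 1000}: value / 1000000 is the whole-seconds part, and (value % 1000000) * 1000 is the sub-second part expressed in nanoseconds.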
diff --git a/be/src/vec/exec/format/parquet/decoder.h b/be/src/vec/exec/format/parquet/decoder.h
index acd9965bad8..f3da9429dda 100644
--- a/be/src/vec/exec/format/parquet/decoder.h
+++ b/be/src/vec/exec/format/parquet/decoder.h
@@ -99,6 +99,8 @@ public:
template <typename DecimalPrimitiveType>
void init_decimal_converter(DataTypePtr& data_type);
+    void reset_time_scale_if_missing(int scale);
+
// Write the decoded values batch to doris's column
    virtual Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
                                 ColumnSelectVector& select_vector, bool is_dict_filter) = 0;
diff --git a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp
index a30c2dff3d1..3ca6193a4a1 100644
--- a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp
+++ b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp
@@ -107,9 +107,11 @@ public:
        // Spark can set the timestamp precision by the following configuration:
        // spark.sql.parquet.outputTimestampType = INT96(NANOS), TIMESTAMP_MICROS, TIMESTAMP_MILLIS
        if constexpr (std::is_same_v<T, ParquetInt96>) {
+            reset_time_scale_if_missing(9);
            return _decode_datetime96<DateV2Value<DateTimeV2ValueType>, UInt64, has_filter>(
                    doris_column, select_vector);
        } else if constexpr (std::is_same_v<T, Int64>) {
+            reset_time_scale_if_missing(remove_nullable(data_type)->get_scale());
            return _decode_datetime64<DateV2Value<DateTimeV2ValueType>, UInt64, has_filter>(
                    doris_column, select_vector);
        }
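The two call sites pass different scales because the physical type fixes how much precision can be present: INT96 always carries nanoseconds, so the literal 9 is used, while an INT64 timestamp's precision comes from the column's declared scale. A hedged sketch of the mapping named in the Spark comment above (the enum and function are illustrative, not a doris or Spark type):

    // Illustrative mapping from spark.sql.parquet.outputTimestampType to the
    // scale handed to reset_time_scale_if_missing.
    enum class OutputTimestampType { INT96, TIMESTAMP_MICROS, TIMESTAMP_MILLIS };

    static int implied_scale(OutputTimestampType t) {
        switch (t) {
        case OutputTimestampType::INT96:            return 9; // nanoseconds
        case OutputTimestampType::TIMESTAMP_MICROS: return 6;
        case OutputTimestampType::TIMESTAMP_MILLIS: return 3;
        }
        return 9;
    }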
diff --git a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp
index af464c15545..c496cc175c9 100644
--- a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp
+++ b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp
@@ -118,9 +118,11 @@ Status FixLengthPlainDecoder::_decode_values(MutableColumnPtr& doris_column, Dat
        // Spark can set the timestamp precision by the following configuration:
        // spark.sql.parquet.outputTimestampType = INT96(NANOS), TIMESTAMP_MICROS, TIMESTAMP_MILLIS
        if (_physical_type == tparquet::Type::INT96) {
+            reset_time_scale_if_missing(9);
            return _decode_datetime96<DateV2Value<DateTimeV2ValueType>, UInt64, has_filter>(
                    doris_column, select_vector);
        } else if (_physical_type == tparquet::Type::INT64) {
+            reset_time_scale_if_missing(remove_nullable(data_type)->get_scale());
            return _decode_datetime64<DateV2Value<DateTimeV2ValueType>, UInt64, has_filter>(
                    doris_column, select_vector);
        }
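For readers unfamiliar with the INT96 path taken above: a parquet INT96 timestamp is 12 little-endian bytes, an 8-byte nanoseconds-of-day count followed by a 4-byte Julian day number. A self-contained sketch of the decode (the struct and helpers are illustrative, not doris's actual ParquetInt96):

    #include <cstdint>
    #include <cstring>

    struct Int96Parts {
        int64_t nanos_of_day; // first 8 bytes
        int32_t julian_day;   // last 4 bytes
    };

    // Assumes a little-endian host, matching the on-disk byte order.
    static Int96Parts parse_int96(const uint8_t bytes[12]) {
        Int96Parts p;
        std::memcpy(&p.nanos_of_day, bytes, sizeof(p.nanos_of_day));
        std::memcpy(&p.julian_day, bytes + 8, sizeof(p.julian_day));
        return p;
    }

    // 1970-01-01 is Julian day 2440588, so epoch seconds follow directly:
    static int64_t to_epoch_seconds(const Int96Parts& p) {
        return int64_t(p.julian_day - 2440588) * 86400 + p.nanos_of_day / 1000000000;
    }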
diff --git a/regression-test/data/external_table_p2/paimon/paimon_timestamp_types.out b/regression-test/data/external_table_p2/paimon/paimon_timestamp_types.out
new file mode 100644
index 00000000000..641424b160e
--- /dev/null
+++ b/regression-test/data/external_table_p2/paimon/paimon_timestamp_types.out
@@ -0,0 +1,13 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !c1 --
+1	5432-08-30T05:43:21.100	5432-08-30T05:43:21.120	5432-08-30T05:43:21.123	5432-08-30T05:43:21.123400	5432-08-30T05:43:21.123450	5432-08-30T05:43:21.123456	5432-08-30T05:43:21.123456	5432-08-30T05:43:21.123456	5432-08-30T05:43:21.123456
+
+-- !c2 --
+1	5432-08-30T05:43:21.100	5432-08-30T05:43:21.120	5432-08-30T05:43:21.123	5432-08-30T05:43:21.123400	5432-08-30T05:43:21.123450	5432-08-30T05:43:21.123456	5432-08-30T05:43:21.123456	5432-08-30T05:43:21.123456	5432-08-30T05:43:21.123456
+
+-- !c3 --
+1	5432-08-30T05:43:21.100	5432-08-30T05:43:21.120	5432-08-30T05:43:21.123	5432-08-30T05:43:21.123400	5432-08-30T05:43:21.123450	5432-08-30T05:43:21.123456	5432-08-30T05:43:21.123456	5432-08-30T05:43:21.123456	5432-08-30T05:43:21.123456
+
+-- !c4 --
+1	5432-08-30T05:43:21.100	5432-08-30T05:43:21.120	5432-08-30T05:43:21.123	5432-08-30T05:43:21.123400	5432-08-30T05:43:21.123450	5432-08-30T05:43:21.123456	5432-08-30T05:43:21.123456	5432-08-30T05:43:21.123456	5432-08-30T05:43:21.123456
+
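A note on the expected rows above: the columns declared with parquet scales 7, 8 and 9 all render as .123456 because doris's datetimev2 stores at most six fractional digits, so anything finer is truncated to microseconds. A minimal sketch of that truncation, assuming a nanosecond fractional part as input:

    #include <cstdint>

    // Truncate a nanosecond fraction to datetimev2's six-digit (microsecond) ceiling.
    static int64_t truncate_to_micros(int64_t nano_fraction) {
        return nano_fraction / 1000; // 123456789 -> 123456, rendered as .123456
    }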
diff --git a/regression-test/suites/external_table_p2/paimon/paimon_timestamp_types.groovy b/regression-test/suites/external_table_p2/paimon/paimon_timestamp_types.groovy
new file mode 100644
index 00000000000..dbb1f1d038c
--- /dev/null
+++ b/regression-test/suites/external_table_p2/paimon/paimon_timestamp_types.groovy
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("paimon_timestamp_types",
"p2,external,paimon,external_remote,external_remote_paimon") {
+
+ def ts_orc = """select * from ts_orc"""
+ def ts_parquet = """select * from ts_parquet"""
+
+    String enabled = context.config.otherConfigs.get("enableExternalPaimonTest")
+ if (enabled != null && enabled.equalsIgnoreCase("true")) {
+ String catalog_name = "paimon_timestamp_catalog"
+ String user_name = context.config.otherConfigs.get("extHiveHmsUser")
+ String hiveHost = context.config.otherConfigs.get("extHiveHmsHost")
+ String hivePort = context.config.otherConfigs.get("extHdfsPort")
+
+ sql """drop catalog if exists ${catalog_name};"""
+ sql """
+ create catalog if not exists ${catalog_name} properties (
+ "type" = "paimon",
+ "paimon.catalog.type" = "filesystem",
+ "warehouse" = "hdfs://${hiveHost}/${hivePort}/paimon/paimon1",
+ "hadoop.username" = "${user_name}"
+ );
+ """
+ logger.info("catalog " + catalog_name + " created")
+ sql """switch ${catalog_name};"""
+ logger.info("switched to catalog " + catalog_name)
+ sql """use db1;"""
+ logger.info("use db1")
+
+ sql """set force_jni_scanner=true"""
+ qt_c1 ts_orc
+ qt_c2 ts_parquet
+
+ sql """set force_jni_scanner=false"""
+ qt_c3 ts_orc
+ qt_c4 ts_parquet
+
+ }
+}
+
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]