This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 3e707e9dc4a [fix](paimon)set timestamp's scale for parquet which has no logical type for 2.0 (#30259)
3e707e9dc4a is described below
commit 3e707e9dc4a79ff8b1b8b2fe41d095bddf67b67c
Author: wuwenchi <[email protected]>
AuthorDate: Wed Jan 24 12:03:57 2024 +0800
[fix](paimon)set timestamp's scale for parquet which has no logical type for 2.0 (#30259)
---
be/src/vec/exec/format/parquet/decoder.cpp | 24 ++++++++++
be/src/vec/exec/format/parquet/decoder.h | 2 +
.../format/parquet/fix_length_dict_decoder.hpp | 2 +
.../format/parquet/fix_length_plain_decoder.cpp | 2 +
.../paimon/paimon_timestamp_types.out | 13 +++++
.../paimon/paimon_timestamp_types.groovy | 55 ++++++++++++++++++++++
6 files changed, 98 insertions(+)
diff --git a/be/src/vec/exec/format/parquet/decoder.cpp b/be/src/vec/exec/format/parquet/decoder.cpp
index 0a158176091..aa16eaf4b6c 100644
--- a/be/src/vec/exec/format/parquet/decoder.cpp
+++ b/be/src/vec/exec/format/parquet/decoder.cpp
@@ -184,4 +184,28 @@ void Decoder::init(FieldSchema* field_schema, cctz::time_zone* ctz) {
        _decode_params->offset_days = t.day() == 31 ? -1 : 0; // If 1969-12-31, then returns -1.
}
}
+
+/**
+ * Some frameworks, like Paimon, may write non-standard parquet files whose timestamp
+ * fields have no logicalType or converted_type to indicate their precision. We have to
+ * reset the time mask.
+ */
+void Decoder::reset_time_scale_if_missing(int scale) {
+    const auto& schema = _field_schema->parquet_schema;
+    if (!schema.__isset.logicalType && !schema.__isset.converted_type) {
+        int ts_scale = 9;
+        if (scale <= 3) {
+            ts_scale = 3;
+        } else if (scale <= 6) {
+            ts_scale = 6;
+        }
+        _decode_params->second_mask = common::exp10_i64(ts_scale);
+        _decode_params->scale_to_nano_factor = common::exp10_i64(9 - ts_scale);
+
+        // The missing parquet metadata makes it impossible for us to know the
+        // time zone information, so we default to UTC here.
+        if (_decode_params->ctz == nullptr) {
+            _decode_params->ctz = const_cast<cctz::time_zone*>(&_decode_params->utc0);
+        }
+    }
+}
} // namespace doris::vectorized
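For context on the masks computed above: the bucketing collapses any declared scale into the three precisions a parquet timestamp can physically carry (milliseconds, microseconds, nanoseconds). Below is a minimal standalone sketch of the same arithmetic, with a local exp10 helper standing in for doris's common::exp10_i64 (assumed here to return 10^n as an int64):

    #include <cstdint>

    // Local stand-in for common::exp10_i64 (assumption: returns 10^n as int64).
    static int64_t exp10_i64(int n) {
        int64_t v = 1;
        while (n-- > 0) v *= 10;
        return v;
    }

    struct TimeMasks {
        int64_t second_mask;          // 10^scale: divides a raw value into whole seconds
        int64_t scale_to_nano_factor; // 10^(9 - scale): lifts the remainder to nanoseconds
    };

    // Same bucketing as reset_time_scale_if_missing: 0..3 -> millis, 4..6 -> micros, else nanos.
    static TimeMasks make_time_masks(int scale) {
        int ts_scale = 9;
        if (scale <= 3) {
            ts_scale = 3;
        } else if (scale <= 6) {
            ts_scale = 6;
        }
        return {exp10_i64(ts_scale), exp10_i64(9 - ts_scale)};
    }

For example, make_time_masks(6) yields {1000000, 1000}: value / 1000000 is the whole-seconds part, and (value % 1000000) * 1000 is the sub-second part expressed in nanoseconds.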
diff --git a/be/src/vec/exec/format/parquet/decoder.h b/be/src/vec/exec/format/parquet/decoder.h
index acd9965bad8..f3da9429dda 100644
--- a/be/src/vec/exec/format/parquet/decoder.h
+++ b/be/src/vec/exec/format/parquet/decoder.h
@@ -99,6 +99,8 @@ public:
template <typename DecimalPrimitiveType>
void init_decimal_converter(DataTypePtr& data_type);
+    void reset_time_scale_if_missing(int scale);
+
// Write the decoded values batch to doris's column
    virtual Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
                                 ColumnSelectVector& select_vector, bool is_dict_filter) = 0;
diff --git a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp
index a30c2dff3d1..3ca6193a4a1 100644
--- a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp
+++ b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp
@@ -107,9 +107,11 @@ public:
        // Spark can set the timestamp precision by the following configuration:
        // spark.sql.parquet.outputTimestampType = INT96(NANOS), TIMESTAMP_MICROS, TIMESTAMP_MILLIS
        if constexpr (std::is_same_v<T, ParquetInt96>) {
+            reset_time_scale_if_missing(9);
            return _decode_datetime96<DateV2Value<DateTimeV2ValueType>, UInt64, has_filter>(
                    doris_column, select_vector);
        } else if constexpr (std::is_same_v<T, Int64>) {
+            reset_time_scale_if_missing(remove_nullable(data_type)->get_scale());
            return _decode_datetime64<DateV2Value<DateTimeV2ValueType>, UInt64, has_filter>(
                    doris_column, select_vector);
        }
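The two call sites pass different scales because the physical type fixes how much precision can be present: INT96 always carries nanoseconds, so the literal 9 is used, while an INT64 timestamp's precision comes from the column's declared scale. A hedged sketch of the mapping named in the Spark comment above (the enum and function are illustrative, not a doris or Spark type):

    // Illustrative mapping from spark.sql.parquet.outputTimestampType to the
    // scale handed to reset_time_scale_if_missing.
    enum class OutputTimestampType { INT96, TIMESTAMP_MICROS, TIMESTAMP_MILLIS };

    static int implied_scale(OutputTimestampType t) {
        switch (t) {
        case OutputTimestampType::INT96:            return 9; // nanoseconds
        case OutputTimestampType::TIMESTAMP_MICROS: return 6;
        case OutputTimestampType::TIMESTAMP_MILLIS: return 3;
        }
        return 9;
    }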
diff --git a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp
index af464c15545..c496cc175c9 100644
--- a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp
+++ b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp
@@ -118,9 +118,11 @@ Status FixLengthPlainDecoder::_decode_values(MutableColumnPtr& doris_column, Dat
        // Spark can set the timestamp precision by the following configuration:
        // spark.sql.parquet.outputTimestampType = INT96(NANOS), TIMESTAMP_MICROS, TIMESTAMP_MILLIS
        if (_physical_type == tparquet::Type::INT96) {
+            reset_time_scale_if_missing(9);
            return _decode_datetime96<DateV2Value<DateTimeV2ValueType>, UInt64, has_filter>(
                    doris_column, select_vector);
        } else if (_physical_type == tparquet::Type::INT64) {
+            reset_time_scale_if_missing(remove_nullable(data_type)->get_scale());
            return _decode_datetime64<DateV2Value<DateTimeV2ValueType>, UInt64, has_filter>(
                    doris_column, select_vector);
        }
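For readers unfamiliar with the INT96 path taken above: a parquet INT96 timestamp is 12 little-endian bytes, an 8-byte nanoseconds-of-day count followed by a 4-byte Julian day number. A self-contained sketch of the decode (the struct and helpers are illustrative, not doris's actual ParquetInt96):

    #include <cstdint>
    #include <cstring>

    struct Int96Parts {
        int64_t nanos_of_day; // first 8 bytes
        int32_t julian_day;   // last 4 bytes
    };

    // Assumes a little-endian host, matching the on-disk byte order.
    static Int96Parts parse_int96(const uint8_t bytes[12]) {
        Int96Parts p;
        std::memcpy(&p.nanos_of_day, bytes, sizeof(p.nanos_of_day));
        std::memcpy(&p.julian_day, bytes + 8, sizeof(p.julian_day));
        return p;
    }

    // 1970-01-01 is Julian day 2440588, so epoch seconds follow directly:
    static int64_t to_epoch_seconds(const Int96Parts& p) {
        return int64_t(p.julian_day - 2440588) * 86400 + p.nanos_of_day / 1000000000;
    }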
diff --git a/regression-test/data/external_table_p2/paimon/paimon_timestamp_types.out b/regression-test/data/external_table_p2/paimon/paimon_timestamp_types.out
new file mode 100644
index 00000000000..641424b160e
--- /dev/null
+++ b/regression-test/data/external_table_p2/paimon/paimon_timestamp_types.out
@@ -0,0 +1,13 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !c1 --
+1	5432-08-30T05:43:21.100	5432-08-30T05:43:21.120	5432-08-30T05:43:21.123	5432-08-30T05:43:21.123400	5432-08-30T05:43:21.123450	5432-08-30T05:43:21.123456	5432-08-30T05:43:21.123456	5432-08-30T05:43:21.123456	5432-08-30T05:43:21.123456
+
+-- !c2 --
+1	5432-08-30T05:43:21.100	5432-08-30T05:43:21.120	5432-08-30T05:43:21.123	5432-08-30T05:43:21.123400	5432-08-30T05:43:21.123450	5432-08-30T05:43:21.123456	5432-08-30T05:43:21.123456	5432-08-30T05:43:21.123456	5432-08-30T05:43:21.123456
+
+-- !c3 --
+1	5432-08-30T05:43:21.100	5432-08-30T05:43:21.120	5432-08-30T05:43:21.123	5432-08-30T05:43:21.123400	5432-08-30T05:43:21.123450	5432-08-30T05:43:21.123456	5432-08-30T05:43:21.123456	5432-08-30T05:43:21.123456	5432-08-30T05:43:21.123456
+
+-- !c4 --
+1	5432-08-30T05:43:21.100	5432-08-30T05:43:21.120	5432-08-30T05:43:21.123	5432-08-30T05:43:21.123400	5432-08-30T05:43:21.123450	5432-08-30T05:43:21.123456	5432-08-30T05:43:21.123456	5432-08-30T05:43:21.123456	5432-08-30T05:43:21.123456
+
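A note on the expected rows above: the columns declared with parquet scales 7, 8 and 9 all render as .123456 because doris's datetimev2 stores at most six fractional digits, so anything finer is truncated to microseconds. A minimal sketch of that truncation, assuming a nanosecond fractional part as input:

    #include <cstdint>

    // Truncate a nanosecond fraction to datetimev2's six-digit (microsecond) ceiling.
    static int64_t truncate_to_micros(int64_t nano_fraction) {
        return nano_fraction / 1000; // 123456789 -> 123456, rendered as .123456
    }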
diff --git a/regression-test/suites/external_table_p2/paimon/paimon_timestamp_types.groovy b/regression-test/suites/external_table_p2/paimon/paimon_timestamp_types.groovy
new file mode 100644
index 00000000000..dbb1f1d038c
--- /dev/null
+++ b/regression-test/suites/external_table_p2/paimon/paimon_timestamp_types.groovy
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("paimon_timestamp_types",
"p2,external,paimon,external_remote,external_remote_paimon") {
+
+ def ts_orc = """select * from ts_orc"""
+ def ts_parquet = """select * from ts_parquet"""
+
+    String enabled = context.config.otherConfigs.get("enableExternalPaimonTest")
+ if (enabled != null && enabled.equalsIgnoreCase("true")) {
+ String catalog_name = "paimon_timestamp_catalog"
+ String user_name = context.config.otherConfigs.get("extHiveHmsUser")
+ String hiveHost = context.config.otherConfigs.get("extHiveHmsHost")
+ String hivePort = context.config.otherConfigs.get("extHdfsPort")
+
+ sql """drop catalog if exists ${catalog_name};"""
+ sql """
+ create catalog if not exists ${catalog_name} properties (
+ "type" = "paimon",
+ "paimon.catalog.type" = "filesystem",
+ "warehouse" = "hdfs://${hiveHost}/${hivePort}/paimon/paimon1",
+ "hadoop.username" = "${user_name}"
+ );
+ """
+ logger.info("catalog " + catalog_name + " created")
+ sql """switch ${catalog_name};"""
+ logger.info("switched to catalog " + catalog_name)
+ sql """use db1;"""
+ logger.info("use db1")
+
+ sql """set force_jni_scanner=true"""
+ qt_c1 ts_orc
+ qt_c2 ts_parquet
+
+ sql """set force_jni_scanner=false"""
+ qt_c3 ts_orc
+ qt_c4 ts_parquet
+
+ }
+}
+
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]