This is an automated email from the ASF dual-hosted git repository.

jcamacho pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
     new 5aa0086  HIVE-24074: Incorrect handling of timestamp in Parquet/Avro when written in certain time zones in versions before Hive 3.x (Jesus Camacho Rodriguez, reviewed by Prasanth Jayachandran)
5aa0086 is described below

commit 5aa00869a0dd5132431855f81431349dc53c61ce
Author: Jesús Camacho Rodríguez <jcama...@apache.org>
AuthorDate: Fri Aug 28 08:17:36 2020 -0700

    HIVE-24074: Incorrect handling of timestamp in Parquet/Avro when written in certain time zones in versions before Hive 3.x (Jesus Camacho Rodriguez, reviewed by Prasanth Jayachandran)

    Closes apache/hive#1392
---
 .../hadoop/hive/common/type/TimestampTZUtil.java   | 35 ++++++++-
 .../java/org/apache/hadoop/hive/conf/HiveConf.java |  6 ++
 data/files/tbl_avro1/000000_0                      | Bin 0 -> 262 bytes
 data/files/tbl_avro1/000000_0_copy_1               | Bin 0 -> 263 bytes
 data/files/tbl_parq1/000000_0                      | Bin 0 -> 286 bytes
 data/files/tbl_parq1/000000_0_copy_1               | Bin 0 -> 286 bytes
 data/files/tbl_parq1/000000_0_copy_2               | Bin 0 -> 327 bytes
 .../jdbc/TestJdbcWithMiniLlapVectorArrowBatch.java | 28 ++++----
 .../java/org/apache/hadoop/hive/ql/QTestUtil.java  |  2 +
 .../hive/ql/qoption/QTestTimezoneHandler.java      | 68 ++++++++++++++++++
 .../ql/io/parquet/ParquetRecordReaderBase.java     |  4 ++
 .../hive/ql/io/parquet/convert/ETypeConverter.java |  5 +-
 .../io/parquet/read/DataWritableReadSupport.java   |  6 ++
 .../ql/io/parquet/timestamp/NanoTimeUtils.java     |  9 ++-
 .../parquet/vector/BaseVectorizedColumnReader.java |  9 ++-
 .../vector/ParquetDataColumnReaderFactory.java     | 28 ++++----
 .../parquet/vector/VectorizedListColumnReader.java |  5 +-
 .../vector/VectorizedParquetRecordReader.java      | 17 +++--
 .../vector/VectorizedPrimitiveColumnReader.java    |  4 +-
 .../convert/TestGetDataColumnReaderByType.java     | 26 +++----
 .../parquet/serde/TestParquetTimestampUtils.java   |  4 +-
 .../clientpositive/avro_legacy_mixed_timestamp.q   |  9 +++
 .../test/queries/clientpositive/avro_timestamp2.q  | 23 ++++++
 .../parquet_legacy_mixed_timestamp.q               |  6 +-
 .../queries/clientpositive/parquet_timestamp.q     | 23 ++++++
 .../llap/avro_legacy_mixed_timestamp.q.out         | 32 +++++++++
 .../clientpositive/llap/avro_timestamp2.q.out      | 66 +++++++++++++++++
 .../llap/parquet_legacy_mixed_timestamp.q.out      | 16 +++++
 .../clientpositive/llap/parquet_timestamp.q.out    | 78 +++++++++++++++++++++
 .../hadoop/hive/serde2/avro/AvroDeserializer.java  |  5 +-
 30 files changed, 453 insertions(+), 61 deletions(-)

diff --git a/common/src/java/org/apache/hadoop/hive/common/type/TimestampTZUtil.java b/common/src/java/org/apache/hadoop/hive/common/type/TimestampTZUtil.java
index 862acb8..664bdb5 100644
--- a/common/src/java/org/apache/hadoop/hive/common/type/TimestampTZUtil.java
+++ b/common/src/java/org/apache/hadoop/hive/common/type/TimestampTZUtil.java
@@ -17,6 +17,9 @@
  */
 package org.apache.hadoop.hive.common.type;
 
+import java.text.DateFormat;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
 import java.time.DateTimeException;
 import java.time.Instant;
 import java.time.LocalDate;
@@ -31,6 +34,7 @@ import java.time.format.DateTimeParseException;
 import java.time.format.TextStyle;
 import java.time.temporal.ChronoField;
 import java.time.temporal.TemporalAccessor;
+import java.util.TimeZone;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
@@ -145,13 +149,42 @@ public class TimestampTZUtil {
     }
   }
 
+  private static final ThreadLocal<DateFormat> LEGACY_DATE_FORMATTER = new ThreadLocal<>();
+
+  private static DateFormat getLegacyDateFormatter() {
+    if (LEGACY_DATE_FORMATTER.get() == null) {
+      LEGACY_DATE_FORMATTER.set(new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"));
+    }
+    return LEGACY_DATE_FORMATTER.get();
+  }
+
+  public static Timestamp convertTimestampToZone(Timestamp ts, ZoneId fromZone, ZoneId toZone) {
+    return convertTimestampToZone(ts, fromZone, toZone, false);
+  }
+
   /**
    * Timestamps are technically time zone agnostic, and this method sort of cheats its logic.
    * Timestamps are supposed to represent nanos since [UTC epoch]. Here,
   * the input timestamp represents nanoseconds since [epoch at fromZone], and
   * we return a Timestamp representing nanoseconds since [epoch at toZone].
   */
-  public static Timestamp convertTimestampToZone(Timestamp ts, ZoneId fromZone, ZoneId toZone) {
+  public static Timestamp convertTimestampToZone(Timestamp ts, ZoneId fromZone, ZoneId toZone,
+      boolean legacyConversion) {
+    if (legacyConversion) {
+      try {
+        DateFormat formatter = getLegacyDateFormatter();
+        formatter.setTimeZone(TimeZone.getTimeZone(fromZone));
+        java.util.Date date = formatter.parse(ts.toString());
+        // Set the formatter to use a different timezone
+        formatter.setTimeZone(TimeZone.getTimeZone(toZone));
+        Timestamp result = Timestamp.valueOf(formatter.format(date));
+        result.setNanos(ts.getNanos());
+        return result;
+      } catch (ParseException e) {
+        throw new RuntimeException(e);
+      }
+    }
+
     // get nanos since [epoch at fromZone]
     Instant instant = convert(ts, fromZone).getZonedDateTime().toInstant();
     // get [local time at toZone]
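For context on the new legacyConversion branch above: the pre-java.time classes (SimpleDateFormat/TimeZone) and java.time can resolve different UTC offsets for the same historical wall-clock value in some zones, which is why timestamps written by pre-3.x Hive need the old rules to read back unchanged. A minimal, self-contained sketch of that divergence; the zone, the wall-clock value and the class name are illustrative only, not taken from this patch:

    import java.text.SimpleDateFormat;
    import java.time.LocalDateTime;
    import java.time.ZoneId;
    import java.time.format.DateTimeFormatter;
    import java.util.TimeZone;

    public class LegacyZoneConversionSketch {
      public static void main(String[] args) throws Exception {
        String wallClock = "1900-01-01 00:00:00";   // local value as stored in the file
        ZoneId zone = ZoneId.of("Asia/Singapore");  // illustrative writer/reader zone

        // Resolve the local value with java.time (mirrors the non-legacy path above).
        LocalDateTime ldt = LocalDateTime.parse(wallClock,
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"));
        long viaJavaTime = ldt.atZone(zone).toInstant().toEpochMilli();

        // Resolve it with the pre-java.time API (mirrors the legacyConversion path above).
        SimpleDateFormat fmt = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        fmt.setTimeZone(TimeZone.getTimeZone(zone));
        long viaLegacyApi = fmt.parse(wallClock).getTime();

        // For historical dates in some zones the two APIs pick different offsets,
        // so the two epoch values (and the rendered timestamps) can disagree.
        System.out.println("java.time : " + viaJavaTime);
        System.out.println("legacy    : " + viaLegacyApi);
      }
    }

Depending on the JDK's tzdata, the two printed values may differ for zones whose early-twentieth-century offsets are modelled differently by the two APIs; that mismatch is what the legacy flag preserves when reading old files.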
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 57acddd..bdf3ae4 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -2131,6 +2131,9 @@ public class HiveConf extends Configuration {
     HIVE_PARQUET_DATE_PROLEPTIC_GREGORIAN_DEFAULT("hive.parquet.date.proleptic.gregorian.default", false,
         "This value controls whether date type in Parquet files was written using the hybrid or proleptic\n" +
         "calendar. Hybrid is the default."),
+    HIVE_PARQUET_TIMESTAMP_LEGACY_CONVERSION_ENABLED("hive.parquet.timestamp.legacy.conversion.enabled", true,
+      "This value controls whether we use former Java time API to convert between timezones on files where timezone\n" +
+      "is not encoded in the metadata. This is for debugging."),
     HIVE_AVRO_TIMESTAMP_SKIP_CONVERSION("hive.avro.timestamp.skip.conversion", false,
         "Some older Hive implementations (pre-3.1) wrote Avro timestamps in a UTC-normalized" +
         "manner, while from version 3.1 until now Hive wrote time zone agnostic timestamps. " +
@@ -2143,6 +2146,9 @@
     HIVE_AVRO_PROLEPTIC_GREGORIAN_DEFAULT("hive.avro.proleptic.gregorian.default", false,
         "This value controls whether date and timestamp type in Avro files was written using the hybrid or proleptic\n" +
         "calendar. Hybrid is the default."),
+    HIVE_AVRO_TIMESTAMP_LEGACY_CONVERSION_ENABLED("hive.avro.timestamp.legacy.conversion.enabled", true,
+      "This value controls whether we use former Java time API to convert between timezones on files where timezone\n" +
+      "is not encoded in the metadata. This is for debugging."),
     HIVE_INT_TIMESTAMP_CONVERSION_IN_SECONDS("hive.int.timestamp.conversion.in.seconds", false,
         "Boolean/tinyint/smallint/int/bigint value is interpreted as milliseconds during the timestamp conversion.\n" +
         "Set this flag to true to interpret the value as seconds to be consistent with float/double."
), diff --git a/data/files/tbl_avro1/000000_0 b/data/files/tbl_avro1/000000_0 new file mode 100644 index 0000000..0eb17bb Binary files /dev/null and b/data/files/tbl_avro1/000000_0 differ diff --git a/data/files/tbl_avro1/000000_0_copy_1 b/data/files/tbl_avro1/000000_0_copy_1 new file mode 100644 index 0000000..feee1f1 Binary files /dev/null and b/data/files/tbl_avro1/000000_0_copy_1 differ diff --git a/data/files/tbl_parq1/000000_0 b/data/files/tbl_parq1/000000_0 new file mode 100644 index 0000000..17a910f Binary files /dev/null and b/data/files/tbl_parq1/000000_0 differ diff --git a/data/files/tbl_parq1/000000_0_copy_1 b/data/files/tbl_parq1/000000_0_copy_1 new file mode 100644 index 0000000..cc0c27e Binary files /dev/null and b/data/files/tbl_parq1/000000_0_copy_1 differ diff --git a/data/files/tbl_parq1/000000_0_copy_2 b/data/files/tbl_parq1/000000_0_copy_2 new file mode 100644 index 0000000..ba04470 Binary files /dev/null and b/data/files/tbl_parq1/000000_0_copy_2 differ diff --git a/itests/hive-unit/src/test/java/org/apache/hive/jdbc/TestJdbcWithMiniLlapVectorArrowBatch.java b/itests/hive-unit/src/test/java/org/apache/hive/jdbc/TestJdbcWithMiniLlapVectorArrowBatch.java index 6b5c5df..9ed40d7 100644 --- a/itests/hive-unit/src/test/java/org/apache/hive/jdbc/TestJdbcWithMiniLlapVectorArrowBatch.java +++ b/itests/hive-unit/src/test/java/org/apache/hive/jdbc/TestJdbcWithMiniLlapVectorArrowBatch.java @@ -114,10 +114,10 @@ public class TestJdbcWithMiniLlapVectorArrowBatch extends BaseJdbcWithMiniLlap { expected.add(Lists.newArrayList("2014-02-11 07:08:09.123")); expected.add(Lists.newArrayList("1947-02-11 07:08:09.123")); expected.add(Lists.newArrayList("8200-02-11 07:08:09.123")); - expected.add(Lists.newArrayList("1012-02-21 07:15:11.123")); - expected.add(Lists.newArrayList("1014-02-11 07:15:11.123")); - expected.add(Lists.newArrayList("0947-02-11 07:15:11.123")); - expected.add(Lists.newArrayList("0200-02-11 07:15:11.123")); + expected.add(Lists.newArrayList("1012-02-21 07:08:09.123")); + expected.add(Lists.newArrayList("1014-02-11 07:08:09.123")); + expected.add(Lists.newArrayList("0947-02-11 07:08:09.123")); + expected.add(Lists.newArrayList("0200-02-11 07:08:09.123")); return expected; } @@ -140,10 +140,10 @@ public class TestJdbcWithMiniLlapVectorArrowBatch extends BaseJdbcWithMiniLlap { expected.add(Lists.newArrayList("2014-02-11 07:08:09.123")); expected.add(Lists.newArrayList("1947-02-11 07:08:09.123")); expected.add(Lists.newArrayList("8200-02-11 07:08:09.123")); - expected.add(Lists.newArrayList("1012-02-27 07:15:11.123")); - expected.add(Lists.newArrayList("1014-02-17 07:15:11.123")); - expected.add(Lists.newArrayList("0947-02-16 07:15:11.123")); - expected.add(Lists.newArrayList("0200-02-10 07:15:11.123")); + expected.add(Lists.newArrayList("1012-02-27 07:08:09.123")); + expected.add(Lists.newArrayList("1014-02-17 07:08:09.123")); + expected.add(Lists.newArrayList("0947-02-16 07:08:09.123")); + expected.add(Lists.newArrayList("0200-02-10 07:08:09.123")); return expected; } @@ -185,8 +185,8 @@ public class TestJdbcWithMiniLlapVectorArrowBatch extends BaseJdbcWithMiniLlap { final String tableName = "testOrcHybridMixedTimestamps"; executeSQL("create table " + tableName + " (d timestamp) stored as orc"); executeSQL("INSERT INTO " + tableName + " VALUES ('2012-02-21 07:08:09.123')," + "('2014-02-11 07:08:09.123')," - + "('1947-02-11 07:08:09.123')," + "('8200-02-11 07:08:09.123')," + "('1012-02-21 07:15:11.123')," - + "('1014-02-11 07:15:11.123')," + "('0947-02-11 07:15:11.123')," 
+ "('0200-02-11 07:15:11.123')"); + + "('1947-02-11 07:08:09.123')," + "('8200-02-11 07:08:09.123')," + "('1012-02-21 07:08:09.123')," + + "('1014-02-11 07:08:09.123')," + "('0947-02-11 07:08:09.123')," + "('0200-02-11 07:08:09.123')"); final String query = "select * from " + tableName; @@ -246,8 +246,8 @@ public class TestJdbcWithMiniLlapVectorArrowBatch extends BaseJdbcWithMiniLlap { final String tableName = "testParquetHybridMixedTimestamps"; executeSQL("create table " + tableName + " (ts timestamp) stored as parquet"); executeSQL("INSERT INTO " + tableName + " VALUES ('2012-02-21 07:08:09.123')," + "('2014-02-11 07:08:09.123')," - + "('1947-02-11 07:08:09.123')," + "('8200-02-11 07:08:09.123')," + "('1012-02-21 07:15:11.123')," - + "('1014-02-11 07:15:11.123')," + "('0947-02-11 07:15:11.123')," + "('0200-02-11 07:15:11.123')"); + + "('1947-02-11 07:08:09.123')," + "('8200-02-11 07:08:09.123')," + "('1012-02-21 07:08:09.123')," + + "('1014-02-11 07:08:09.123')," + "('0947-02-11 07:08:09.123')," + "('0200-02-11 07:08:09.123')"); final String query = "select * from " + tableName; @@ -309,8 +309,8 @@ public class TestJdbcWithMiniLlapVectorArrowBatch extends BaseJdbcWithMiniLlap { executeSQL("create table " + tableName + " (d timestamp) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' " + "COLLECTION ITEMS TERMINATED BY ',' MAP KEYS TERMINATED BY ':' stored as avro"); executeSQL("INSERT INTO " + tableName + " VALUES ('2012-02-21 07:08:09.123')," + "('2014-02-11 07:08:09.123')," - + "('1947-02-11 07:08:09.123')," + "('8200-02-11 07:08:09.123')," + "('1012-02-21 07:15:11.123')," - + "('1014-02-11 07:15:11.123')," + "('0947-02-11 07:15:11.123')," + "('0200-02-11 07:15:11.123')"); + + "('1947-02-11 07:08:09.123')," + "('8200-02-11 07:08:09.123')," + "('1012-02-21 07:08:09.123')," + + "('1014-02-11 07:08:09.123')," + "('0947-02-11 07:08:09.123')," + "('0200-02-11 07:08:09.123')"); final String query = "select * from " + tableName; diff --git a/itests/util/src/main/java/org/apache/hadoop/hive/ql/QTestUtil.java b/itests/util/src/main/java/org/apache/hadoop/hive/ql/QTestUtil.java index 20ee2fe..a12fa7f 100644 --- a/itests/util/src/main/java/org/apache/hadoop/hive/ql/QTestUtil.java +++ b/itests/util/src/main/java/org/apache/hadoop/hive/ql/QTestUtil.java @@ -82,6 +82,7 @@ import org.apache.hadoop.hive.ql.qoption.QTestDisabledHandler; import org.apache.hadoop.hive.ql.qoption.QTestOptionDispatcher; import org.apache.hadoop.hive.ql.qoption.QTestReplaceHandler; import org.apache.hadoop.hive.ql.qoption.QTestSysDbHandler; +import org.apache.hadoop.hive.ql.qoption.QTestTimezoneHandler; import org.apache.hadoop.hive.ql.qoption.QTestTransactional; import org.apache.hadoop.hive.ql.scheduled.QTestScheduledQueryCleaner; import org.apache.hadoop.hive.ql.scheduled.QTestScheduledQueryServiceProvider; @@ -226,6 +227,7 @@ public class QTestUtil { dispatcher.register("transactional", new QTestTransactional()); dispatcher.register("scheduledqueryservice", new QTestScheduledQueryServiceProvider(conf)); dispatcher.register("scheduledquerycleaner", new QTestScheduledQueryCleaner()); + dispatcher.register("timezone", new QTestTimezoneHandler()); dispatcher.register("authorizer", new QTestAuthorizerHandler()); dispatcher.register("disabled", new QTestDisabledHandler()); diff --git a/itests/util/src/main/java/org/apache/hadoop/hive/ql/qoption/QTestTimezoneHandler.java b/itests/util/src/main/java/org/apache/hadoop/hive/ql/qoption/QTestTimezoneHandler.java new file mode 100644 index 0000000..cccc867 --- /dev/null +++ 
b/itests/util/src/main/java/org/apache/hadoop/hive/ql/qoption/QTestTimezoneHandler.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.qoption;
+
+import com.google.common.base.Strings;
+import java.util.TimeZone;
+import org.apache.hadoop.hive.ql.QTestUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * QTest custom timezone handler
+ *
+ * Enables a custom timezone for the test.
+ *
+ * Example:
+ * --! qt:timezone:Asia/Singapore
+ *
+ */
+public class QTestTimezoneHandler implements QTestOptionHandler {
+
+  private static final Logger LOG = LoggerFactory.getLogger(QTestTimezoneHandler.class);
+  private boolean enabled = false;
+  private TimeZone originalTimeZone;
+  private TimeZone newTimeZone;
+
+  @Override
+  public void processArguments(String arguments) {
+    if (Strings.isNullOrEmpty(arguments)) {
+      throw new RuntimeException("illegal timezone arg: " + arguments);
+    }
+    originalTimeZone = TimeZone.getDefault();
+    newTimeZone = TimeZone.getTimeZone(arguments);
+    LOG.info("Enabling timezone change: {} => {}", originalTimeZone, newTimeZone);
+    enabled = true;
+  }
+
+  @Override
+  public void beforeTest(QTestUtil qt) throws Exception {
+    if (enabled) {
+      TimeZone.setDefault(newTimeZone);
+    }
+  }
+
+  @Override
+  public void afterTest(QTestUtil qt) throws Exception {
+    if (enabled) {
+      TimeZone.setDefault(originalTimeZone);
+      enabled = false;
+    }
+  }
+
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
index c52bc9d..7fff6f4 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
@@ -16,6 +16,7 @@ package org.apache.hadoop.hive.ql.io.parquet;
 import com.google.common.base.Strings;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
 import org.apache.hadoop.hive.ql.io.IOConstants;
 import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport;
 import org.apache.hadoop.hive.ql.io.parquet.read.ParquetFilterPredicateConverter;
@@ -55,6 +56,7 @@ public class ParquetRecordReaderBase {
   protected ProjectionPusher projectionPusher;
   protected boolean skipTimestampConversion = false;
   protected Boolean skipProlepticConversion;
+  protected Boolean legacyConversionEnabled;
   protected SerDeStats serDeStats;
   protected JobConf jobConf;
 
@@ -140,6 +142,8 @@ public class ParquetRecordReaderBase {
       skipProlepticConversion = HiveConf.getBoolVar(
           conf, HiveConf.ConfVars.HIVE_PARQUET_DATE_PROLEPTIC_GREGORIAN_DEFAULT);
     }
+    legacyConversionEnabled =
HiveConf.getBoolVar( + conf, ConfVars.HIVE_PARQUET_TIMESTAMP_LEGACY_CONVERSION_ENABLED); split = new ParquetInputSplit(finalPath, splitStart, diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java index 8e436bc..33acf1e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java @@ -23,6 +23,7 @@ import java.util.Optional; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.common.type.Timestamp; import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.conf.HiveConf.ConfVars; import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport; import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTime; import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTimeUtils; @@ -698,8 +699,10 @@ public enum ETypeConverter { // time zone in order to emulate time zone agnostic behavior. boolean skipConversion = Boolean.parseBoolean( metadata.get(HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_SKIP_CONVERSION.varname)); + boolean legacyConversion = Boolean.parseBoolean( + metadata.get(ConfVars.HIVE_PARQUET_TIMESTAMP_LEGACY_CONVERSION_ENABLED.varname)); Timestamp ts = NanoTimeUtils.getTimestamp(nt, skipConversion, - DataWritableReadSupport.getWriterTimeZoneId(metadata)); + DataWritableReadSupport.getWriterTimeZoneId(metadata), legacyConversion); return new TimestampWritableV2(ts); } }; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java index ba146c5..0604dcc 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java @@ -523,6 +523,12 @@ public class DataWritableReadSupport extends ReadSupport<ArrayWritable> { configuration, HiveConf.ConfVars.HIVE_PARQUET_DATE_PROLEPTIC_GREGORIAN_DEFAULT))); } + String legacyConversion = ConfVars.HIVE_PARQUET_TIMESTAMP_LEGACY_CONVERSION_ENABLED.varname; + if (!metadata.containsKey(legacyConversion)) { + metadata.put(legacyConversion, String.valueOf(HiveConf.getBoolVar( + configuration, HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_LEGACY_CONVERSION_ENABLED))); + } + return new DataWritableRecordConverter(readContext.getRequestedSchema(), metadata, hiveTypeInfo); } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/NanoTimeUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/NanoTimeUtils.java index f9d0a56..215c8cf 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/NanoTimeUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/NanoTimeUtils.java @@ -95,7 +95,7 @@ public class NanoTimeUtils { } public static Timestamp getTimestamp(NanoTime nt, boolean skipConversion) { - return getTimestamp(nt, skipConversion, null); + return getTimestamp(nt, skipConversion, null, false); } /** @@ -110,10 +110,13 @@ public class NanoTimeUtils { * For skipConversion to be true it must be set in conf AND the parquet file must NOT be written * by parquet's java library (parquet-mr). This is enforced in ParquetRecordReaderBase#getSplit. 
*/ - public static Timestamp getTimestamp(NanoTime nt, boolean skipConversion, ZoneId timeZoneId) { + public static Timestamp getTimestamp(NanoTime nt, boolean skipConversion, ZoneId timeZoneId, + boolean legacyConversionEnabled) { + boolean legacyConversion = false; if (skipConversion) { timeZoneId = ZoneOffset.UTC; } else if (timeZoneId == null) { + legacyConversion = legacyConversionEnabled; timeZoneId = TimeZone.getDefault().toZoneId(); } @@ -146,7 +149,7 @@ public class NanoTimeUtils { calendar.set(Calendar.SECOND, seconds); Timestamp ts = Timestamp.ofEpochMilli(calendar.getTimeInMillis(), (int) nanos); - ts = TimestampTZUtil.convertTimestampToZone(ts, ZoneOffset.UTC, timeZoneId); + ts = TimestampTZUtil.convertTimestampToZone(ts, ZoneOffset.UTC, timeZoneId, legacyConversion); return ts; } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/BaseVectorizedColumnReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/BaseVectorizedColumnReader.java index 8d3cb7c..e6c07ce 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/BaseVectorizedColumnReader.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/BaseVectorizedColumnReader.java @@ -56,6 +56,7 @@ public abstract class BaseVectorizedColumnReader implements VectorizedColumnRead protected boolean skipTimestampConversion = false; protected ZoneId writerTimezone = null; protected boolean skipProlepticConversion = false; + protected boolean legacyConversionEnabled = true; /** * Total number of values read. @@ -121,6 +122,7 @@ public abstract class BaseVectorizedColumnReader implements VectorizedColumnRead boolean skipTimestampConversion, ZoneId writerTimezone, boolean skipProlepticConversion, + boolean legacyConversionEnabled, Type parquetType, TypeInfo hiveType) throws IOException { this.descriptor = descriptor; this.type = parquetType; @@ -129,6 +131,7 @@ public abstract class BaseVectorizedColumnReader implements VectorizedColumnRead this.skipTimestampConversion = skipTimestampConversion; this.writerTimezone = writerTimezone; this.skipProlepticConversion = skipProlepticConversion; + this.legacyConversionEnabled = legacyConversionEnabled; this.hiveType = hiveType; DictionaryPage dictionaryPage = pageReader.readDictionaryPage(); @@ -137,7 +140,7 @@ public abstract class BaseVectorizedColumnReader implements VectorizedColumnRead this.dictionary = ParquetDataColumnReaderFactory .getDataColumnReaderByTypeOnDictionary(parquetType.asPrimitiveType(), hiveType, dictionaryPage.getEncoding().initDictionary(descriptor, dictionaryPage), - skipTimestampConversion, writerTimezone); + skipTimestampConversion, writerTimezone, legacyConversionEnabled); this.isCurrentPageDictionaryEncoded = true; } catch (IOException e) { throw new IOException("could not decode the dictionary for " + descriptor, e); @@ -189,11 +192,11 @@ public abstract class BaseVectorizedColumnReader implements VectorizedColumnRead } dataColumn = ParquetDataColumnReaderFactory.getDataColumnReaderByType(type.asPrimitiveType(), hiveType, dataEncoding.getDictionaryBasedValuesReader(descriptor, VALUES, dictionary - .getDictionary()), skipTimestampConversion, writerTimezone); + .getDictionary()), skipTimestampConversion, writerTimezone, legacyConversionEnabled); this.isCurrentPageDictionaryEncoded = true; } else { dataColumn = ParquetDataColumnReaderFactory.getDataColumnReaderByType(type.asPrimitiveType(), hiveType, - dataEncoding.getValuesReader(descriptor, VALUES), skipTimestampConversion, writerTimezone); + 
dataEncoding.getValuesReader(descriptor, VALUES), skipTimestampConversion, writerTimezone, legacyConversionEnabled); this.isCurrentPageDictionaryEncoded = false; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReaderFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReaderFactory.java index 10dfe22..4289ddd 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReaderFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/ParquetDataColumnReaderFactory.java @@ -1213,19 +1213,22 @@ public final class ParquetDataColumnReaderFactory { public static class TypesFromInt96PageReader extends DefaultParquetDataColumnReader { private boolean skipTimestampConversion = false; private ZoneId writerTimezone; + private boolean legacyConversionEnabled; public TypesFromInt96PageReader(ValuesReader realReader, int length, - boolean skipTimestampConversion, ZoneId writerTimezone) { + boolean skipTimestampConversion, ZoneId writerTimezone, boolean legacyConversionEnabled) { super(realReader, length); this.skipTimestampConversion = skipTimestampConversion; this.writerTimezone = writerTimezone; + this.legacyConversionEnabled = legacyConversionEnabled; } public TypesFromInt96PageReader(Dictionary dict, int length, boolean skipTimestampConversion, - ZoneId writerTimezone) { + ZoneId writerTimezone, boolean legacyConversionEnabled) { super(dict, length); this.skipTimestampConversion = skipTimestampConversion; this.writerTimezone = writerTimezone; + this.legacyConversionEnabled = legacyConversionEnabled; } private Timestamp convert(Binary binary) { @@ -1234,7 +1237,7 @@ public final class ParquetDataColumnReaderFactory { long timeOfDayNanos = buf.getLong(); int julianDay = buf.getInt(); NanoTime nt = new NanoTime(julianDay, timeOfDayNanos); - return NanoTimeUtils.getTimestamp(nt, skipTimestampConversion, writerTimezone); + return NanoTimeUtils.getTimestamp(nt, skipTimestampConversion, writerTimezone, legacyConversionEnabled); } @Override @@ -1844,9 +1847,9 @@ public final class ParquetDataColumnReaderFactory { TypeInfo hiveType, Dictionary dictionary, ValuesReader valuesReader, - boolean - skipTimestampConversion, - ZoneId writerTimezone) + boolean skipTimestampConversion, + ZoneId writerTimezone, + boolean legacyConversionEnabled) throws IOException { // max length for varchar and char cases int length = getVarcharLength(hiveType); @@ -1905,8 +1908,8 @@ public final class ParquetDataColumnReaderFactory { hiveScale) : new TypesFromFloatPageReader(valuesReader, length, hivePrecision, hiveScale); case INT96: return isDictionary ? new TypesFromInt96PageReader(dictionary, length, - skipTimestampConversion, writerTimezone) : new - TypesFromInt96PageReader(valuesReader, length, skipTimestampConversion, writerTimezone); + skipTimestampConversion, writerTimezone, legacyConversionEnabled) : new + TypesFromInt96PageReader(valuesReader, length, skipTimestampConversion, writerTimezone, legacyConversionEnabled); case BOOLEAN: return isDictionary ? 
new TypesFromBooleanPageReader(dictionary, length) : new TypesFromBooleanPageReader(valuesReader, length); @@ -1983,18 +1986,19 @@ public final class ParquetDataColumnReaderFactory { TypeInfo hiveType, Dictionary realReader, boolean skipTimestampConversion, - ZoneId writerTimezone) + ZoneId writerTimezone, + boolean legacyConversionEnabled) throws IOException { return getDataColumnReaderByTypeHelper(true, parquetType, hiveType, realReader, null, - skipTimestampConversion, writerTimezone); + skipTimestampConversion, writerTimezone, legacyConversionEnabled); } public static ParquetDataColumnReader getDataColumnReaderByType(PrimitiveType parquetType, TypeInfo hiveType, ValuesReader realReader, boolean skipTimestampConversion, - ZoneId writerTimezone) + ZoneId writerTimezone, boolean legacyConversionEnabled) throws IOException { return getDataColumnReaderByTypeHelper(false, parquetType, hiveType, null, realReader, - skipTimestampConversion, writerTimezone); + skipTimestampConversion, writerTimezone, legacyConversionEnabled); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java index 6136ce0..93d9509 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedListColumnReader.java @@ -50,8 +50,9 @@ public class VectorizedListColumnReader extends BaseVectorizedColumnReader { public VectorizedListColumnReader(ColumnDescriptor descriptor, PageReader pageReader, boolean skipTimestampConversion, ZoneId writerTimezone, boolean skipProlepticConversion, - Type type, TypeInfo hiveType) throws IOException { - super(descriptor, pageReader, skipTimestampConversion, writerTimezone, skipProlepticConversion, type, hiveType); + boolean legacyConversionEnabled, Type type, TypeInfo hiveType) throws IOException { + super(descriptor, pageReader, skipTimestampConversion, writerTimezone, skipProlepticConversion, + legacyConversionEnabled, type, hiveType); } @Override diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java index 5cd25d9..b8a1f81 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedParquetRecordReader.java @@ -462,13 +462,15 @@ public class VectorizedParquetRecordReader extends ParquetRecordReaderBase for (int i = 0; i < types.size(); ++i) { columnReaders[i] = buildVectorizedParquetReader(columnTypesList.get(colsToInclude.get(i)), types.get(i), - pages, requestedSchema.getColumns(), skipTimestampConversion, writerTimezone, skipProlepticConversion, 0); + pages, requestedSchema.getColumns(), skipTimestampConversion, writerTimezone, skipProlepticConversion, + legacyConversionEnabled, 0); } } } else { for (int i = 0; i < types.size(); ++i) { columnReaders[i] = buildVectorizedParquetReader(columnTypesList.get(i), types.get(i), pages, - requestedSchema.getColumns(), skipTimestampConversion, writerTimezone, skipProlepticConversion, 0); + requestedSchema.getColumns(), skipTimestampConversion, writerTimezone, skipProlepticConversion, + legacyConversionEnabled, 0); } } @@ -520,6 +522,7 @@ public class VectorizedParquetRecordReader extends ParquetRecordReaderBase boolean skipTimestampConversion, ZoneId 
writerTimezone, boolean skipProlepticConversion, + boolean legacyConversionEnabled, int depth) throws IOException { List<ColumnDescriptor> descriptors = getAllColumnDescriptorByType(depth, type, columnDescriptors); @@ -532,7 +535,7 @@ public class VectorizedParquetRecordReader extends ParquetRecordReaderBase if (fileSchema.getColumns().contains(descriptors.get(0))) { return new VectorizedPrimitiveColumnReader(descriptors.get(0), pages.getPageReader(descriptors.get(0)), skipTimestampConversion, writerTimezone, skipProlepticConversion, - type, typeInfo); + legacyConversionEnabled, type, typeInfo); } else { // Support for schema evolution return new VectorizedDummyColumnReader(); @@ -545,7 +548,7 @@ public class VectorizedParquetRecordReader extends ParquetRecordReaderBase for (int i = 0; i < fieldTypes.size(); i++) { VectorizedColumnReader r = buildVectorizedParquetReader(fieldTypes.get(i), types.get(i), pages, descriptors, - skipTimestampConversion, writerTimezone, skipProlepticConversion, depth + 1); + skipTimestampConversion, writerTimezone, skipProlepticConversion, legacyConversionEnabled, depth + 1); if (r != null) { fieldReaders.add(r); } else { @@ -564,7 +567,7 @@ public class VectorizedParquetRecordReader extends ParquetRecordReaderBase return new VectorizedListColumnReader(descriptors.get(0), pages.getPageReader(descriptors.get(0)), skipTimestampConversion, writerTimezone, skipProlepticConversion, - getElementType(type), typeInfo); + legacyConversionEnabled, getElementType(type), typeInfo); case MAP: if (columnDescriptors == null || columnDescriptors.isEmpty()) { throw new RuntimeException( @@ -596,10 +599,10 @@ public class VectorizedParquetRecordReader extends ParquetRecordReaderBase List<Type> kvTypes = groupType.getFields(); VectorizedListColumnReader keyListColumnReader = new VectorizedListColumnReader( descriptors.get(0), pages.getPageReader(descriptors.get(0)), skipTimestampConversion, - writerTimezone, skipProlepticConversion, kvTypes.get(0), typeInfo); + writerTimezone, skipProlepticConversion, legacyConversionEnabled, kvTypes.get(0), typeInfo); VectorizedListColumnReader valueListColumnReader = new VectorizedListColumnReader( descriptors.get(1), pages.getPageReader(descriptors.get(1)), skipTimestampConversion, - writerTimezone, skipProlepticConversion, kvTypes.get(1), typeInfo); + writerTimezone, skipProlepticConversion, legacyConversionEnabled, kvTypes.get(1), typeInfo); return new VectorizedMapColumnReader(keyListColumnReader, valueListColumnReader); case UNION: default: diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedPrimitiveColumnReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedPrimitiveColumnReader.java index 62a94bc..bb08c27 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedPrimitiveColumnReader.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedPrimitiveColumnReader.java @@ -51,10 +51,12 @@ public class VectorizedPrimitiveColumnReader extends BaseVectorizedColumnReader boolean skipTimestampConversion, ZoneId writerTimezone, boolean skipProlepticConversion, + boolean legacyConversionEnabled, Type type, TypeInfo hiveType) throws IOException { - super(descriptor, pageReader, skipTimestampConversion, writerTimezone, skipProlepticConversion, type, hiveType); + super(descriptor, pageReader, skipTimestampConversion, writerTimezone, skipProlepticConversion, + legacyConversionEnabled, type, hiveType); } @Override diff --git 
a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/TestGetDataColumnReaderByType.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/TestGetDataColumnReaderByType.java index bfacf2e..f5cf19e 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/TestGetDataColumnReaderByType.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/TestGetDataColumnReaderByType.java @@ -51,7 +51,7 @@ public class TestGetDataColumnReaderByType { ParquetDataColumnReaderFactory.getDataColumnReaderByType( Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(20) .as(LogicalTypeAnnotation.decimalType(2, 5)).named("value"), - hiveTypeInfo, null, true, null); + hiveTypeInfo, null, true, null, false); assertTrue(reader instanceof TypesFromDecimalPageReader); } @@ -61,7 +61,7 @@ public class TestGetDataColumnReaderByType { hiveTypeInfo.setTypeName("string"); ParquetDataColumnReader reader = ParquetDataColumnReaderFactory.getDataColumnReaderByType(Types .optional(PrimitiveTypeName.BINARY).as(LogicalTypeAnnotation.stringType()).named("value"), - hiveTypeInfo, null, true, null); + hiveTypeInfo, null, true, null, false); assertTrue(reader instanceof TypesFromStringPageReader); } @@ -71,7 +71,7 @@ public class TestGetDataColumnReaderByType { ParquetDataColumnReader reader = ParquetDataColumnReaderFactory .getDataColumnReaderByType(Types.optional(PrimitiveTypeName.BINARY) .as(LogicalTypeAnnotation.decimalType(2, 5)).named("value"), hiveTypeInfo, null, true, - null); + null, false); assertTrue(reader instanceof TypesFromDecimalPageReader); } @@ -81,7 +81,7 @@ public class TestGetDataColumnReaderByType { hiveTypeInfo.setTypeName("string"); ParquetDataColumnReader reader = ParquetDataColumnReaderFactory .getDataColumnReaderByType(Types.optional(PrimitiveTypeName.BINARY).named("value"), hiveTypeInfo, null, true, - null); + null, false); assertTrue(reader instanceof DefaultParquetDataColumnReader); } @@ -91,7 +91,7 @@ public class TestGetDataColumnReaderByType { hiveTypeInfo.setTypeName("binary"); ParquetDataColumnReader reader = ParquetDataColumnReaderFactory.getDataColumnReaderByType(Types .optional(PrimitiveTypeName.BINARY).as(LogicalTypeAnnotation.jsonType()).named("value"), - hiveTypeInfo, null, true, null); + hiveTypeInfo, null, true, null, false); assertTrue(reader instanceof DefaultParquetDataColumnReader); } @@ -102,7 +102,7 @@ public class TestGetDataColumnReaderByType { ParquetDataColumnReader reader = ParquetDataColumnReaderFactory .getDataColumnReaderByType(Types.optional(PrimitiveTypeName.INT32) .as(LogicalTypeAnnotation.intType(32, false)).named("value"), hiveTypeInfo, null, true, - null); + null, false); assertTrue(reader instanceof TypesFromInt32PageReader); } @@ -112,7 +112,7 @@ public class TestGetDataColumnReaderByType { hiveTypeInfo.setTypeName("int"); ParquetDataColumnReader reader = ParquetDataColumnReaderFactory .getDataColumnReaderByType(Types.optional(PrimitiveTypeName.INT32).named("value"), hiveTypeInfo, null, true, - null); + null, false); assertTrue(reader instanceof TypesFromInt32PageReader); } @@ -121,7 +121,7 @@ public class TestGetDataColumnReaderByType { PrimitiveTypeInfo hiveTypeInfo = new PrimitiveTypeInfo(); hiveTypeInfo.setTypeName("bigint"); ParquetDataColumnReader reader = ParquetDataColumnReaderFactory.getDataColumnReaderByType( - Types.optional(PrimitiveTypeName.INT64).named("value"), hiveTypeInfo, null, true, null); + Types.optional(PrimitiveTypeName.INT64).named("value"), hiveTypeInfo, null, true, null, false); assertTrue(reader 
instanceof TypesFromInt64PageReader); } @@ -132,7 +132,7 @@ public class TestGetDataColumnReaderByType { ParquetDataColumnReader reader = ParquetDataColumnReaderFactory .getDataColumnReaderByType(Types.optional(PrimitiveTypeName.INT64) .as(LogicalTypeAnnotation.intType(64, false)).named("value"), hiveTypeInfo, null, true, - null); + null, false); assertTrue(reader instanceof TypesFromInt64PageReader); } @@ -142,7 +142,7 @@ public class TestGetDataColumnReaderByType { hiveTypeInfo.setTypeName("float"); ParquetDataColumnReader reader = ParquetDataColumnReaderFactory .getDataColumnReaderByType(Types.optional(PrimitiveTypeName.FLOAT).named("value"), hiveTypeInfo, null, true, - null); + null, false); assertTrue(reader instanceof TypesFromFloatPageReader); } @@ -152,7 +152,7 @@ public class TestGetDataColumnReaderByType { hiveTypeInfo.setTypeName("double"); ParquetDataColumnReader reader = ParquetDataColumnReaderFactory .getDataColumnReaderByType(Types.optional(PrimitiveTypeName.DOUBLE).named("value"), hiveTypeInfo, null, true, - null); + null, false); assertTrue(reader instanceof TypesFromDoublePageReader); } @@ -162,7 +162,7 @@ public class TestGetDataColumnReaderByType { hiveTypeInfo.setTypeName("timestamp"); ParquetDataColumnReader reader = ParquetDataColumnReaderFactory .getDataColumnReaderByType(Types.optional(PrimitiveTypeName.INT96).named("value"), hiveTypeInfo, null, true, - null); + null, false); assertTrue(reader instanceof TypesFromInt96PageReader); } @@ -172,7 +172,7 @@ public class TestGetDataColumnReaderByType { hiveTypeInfo.setTypeName("boolean"); ParquetDataColumnReader reader = ParquetDataColumnReaderFactory .getDataColumnReaderByType(Types.optional(PrimitiveTypeName.BOOLEAN).named("value"), hiveTypeInfo, null, true, - null); + null, false); assertTrue(reader instanceof TypesFromBooleanPageReader); } } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestParquetTimestampUtils.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestParquetTimestampUtils.java index fd1b076..a0c8212 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestParquetTimestampUtils.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/serde/TestParquetTimestampUtils.java @@ -187,9 +187,9 @@ public class TestParquetTimestampUtils { Assert.assertEquals(n2.getTimeOfDayNanos() - n1.getTimeOfDayNanos(), 600000000009L); NanoTime n3 = new NanoTime(n1.getJulianDay() - 1, n1.getTimeOfDayNanos() + TimeUnit.DAYS.toNanos(1)); - Assert.assertEquals(ts1, NanoTimeUtils.getTimestamp(n3, false, GMT)); + Assert.assertEquals(ts1, NanoTimeUtils.getTimestamp(n3, false, GMT, false)); n3 = new NanoTime(n1.getJulianDay() + 3, n1.getTimeOfDayNanos() - TimeUnit.DAYS.toNanos(3)); - Assert.assertEquals(ts1, NanoTimeUtils.getTimestamp(n3, false, GMT)); + Assert.assertEquals(ts1, NanoTimeUtils.getTimestamp(n3, false, GMT, false)); } @Test diff --git a/ql/src/test/queries/clientpositive/avro_legacy_mixed_timestamp.q b/ql/src/test/queries/clientpositive/avro_legacy_mixed_timestamp.q index e1e6870..9f0667a 100644 --- a/ql/src/test/queries/clientpositive/avro_legacy_mixed_timestamp.q +++ b/ql/src/test/queries/clientpositive/avro_legacy_mixed_timestamp.q @@ -7,7 +7,16 @@ load data local inpath '../../data/files/avro_legacy_mixed_timestamps.avro' into select * from legacy_table; +set hive.avro.timestamp.legacy.conversion.enabled=false; + +select * from legacy_table; + set hive.avro.proleptic.gregorian.default=true; +set hive.avro.timestamp.legacy.conversion.enabled=true; + +select * from 
legacy_table; + +set hive.avro.timestamp.legacy.conversion.enabled=false; select * from legacy_table; diff --git a/ql/src/test/queries/clientpositive/avro_timestamp2.q b/ql/src/test/queries/clientpositive/avro_timestamp2.q new file mode 100644 index 0000000..604be07 --- /dev/null +++ b/ql/src/test/queries/clientpositive/avro_timestamp2.q @@ -0,0 +1,23 @@ +--! qt:timezone:Asia/Singapore + +create table legacy_table_avro1 (date_test timestamp) +stored as avro; + +load data local inpath '../../data/files/tbl_avro1/' into table legacy_table_avro1; + +select * from legacy_table_avro1; + +set hive.avro.timestamp.legacy.conversion.enabled=false; + +select * from legacy_table_avro1; + +set hive.avro.timestamp.legacy.conversion.enabled=true; +set hive.vectorized.execution.enabled=false; + +select * from legacy_table_avro1; + +set hive.avro.timestamp.legacy.conversion.enabled=false; + +select * from legacy_table_avro1; + +drop table legacy_table_avro1; diff --git a/ql/src/test/queries/clientpositive/parquet_legacy_mixed_timestamp.q b/ql/src/test/queries/clientpositive/parquet_legacy_mixed_timestamp.q index 280df40..8c1bd88 100644 --- a/ql/src/test/queries/clientpositive/parquet_legacy_mixed_timestamp.q +++ b/ql/src/test/queries/clientpositive/parquet_legacy_mixed_timestamp.q @@ -5,4 +5,8 @@ load data local inpath '../../data/files/parquet_legacy_mixed_timestamps.parq' i select * from legacy_table; -drop table legacy_table; \ No newline at end of file +set hive.parquet.timestamp.legacy.conversion.enabled=false; + +select * from legacy_table; + +drop table legacy_table; diff --git a/ql/src/test/queries/clientpositive/parquet_timestamp.q b/ql/src/test/queries/clientpositive/parquet_timestamp.q new file mode 100644 index 0000000..f1af879 --- /dev/null +++ b/ql/src/test/queries/clientpositive/parquet_timestamp.q @@ -0,0 +1,23 @@ +--! 
qt:timezone:Asia/Singapore + +create table legacy_table_parq1 (date_test timestamp) +stored as parquet; + +load data local inpath '../../data/files/tbl_parq1/' into table legacy_table_parq1; + +select * from legacy_table_parq1; + +set hive.parquet.timestamp.legacy.conversion.enabled=false; + +select * from legacy_table_parq1; + +set hive.parquet.timestamp.legacy.conversion.enabled=true; +set hive.vectorized.execution.enabled=false; + +select * from legacy_table_parq1; + +set hive.parquet.timestamp.legacy.conversion.enabled=false; + +select * from legacy_table_parq1; + +drop table legacy_table_parq1; diff --git a/ql/src/test/results/clientpositive/llap/avro_legacy_mixed_timestamp.q.out b/ql/src/test/results/clientpositive/llap/avro_legacy_mixed_timestamp.q.out index 27c6f3d..80e8c7f 100644 --- a/ql/src/test/results/clientpositive/llap/avro_legacy_mixed_timestamp.q.out +++ b/ql/src/test/results/clientpositive/llap/avro_legacy_mixed_timestamp.q.out @@ -32,6 +32,22 @@ POSTHOOK: Input: default@legacy_table 2014-02-11 07:08:09.123 1947-02-11 07:08:09.123 8200-02-11 07:08:09.123 +1012-02-21 07:08:09.123 +1014-02-11 07:08:09.123 +0947-02-11 07:08:09.123 +0200-02-11 07:08:09.123 +PREHOOK: query: select * from legacy_table +PREHOOK: type: QUERY +PREHOOK: Input: default@legacy_table +#### A masked pattern was here #### +POSTHOOK: query: select * from legacy_table +POSTHOOK: type: QUERY +POSTHOOK: Input: default@legacy_table +#### A masked pattern was here #### +2012-02-21 07:08:09.123 +2014-02-11 07:08:09.123 +1947-02-11 07:08:09.123 +8200-02-11 07:08:09.123 1012-02-21 07:15:11.123 1014-02-11 07:15:11.123 0947-02-11 07:15:11.123 @@ -48,6 +64,22 @@ POSTHOOK: Input: default@legacy_table 2014-02-11 07:08:09.123 1947-02-11 07:08:09.123 8200-02-11 07:08:09.123 +1012-02-27 07:08:09.123 +1014-02-17 07:08:09.123 +0947-02-16 07:08:09.123 +0200-02-10 07:08:09.123 +PREHOOK: query: select * from legacy_table +PREHOOK: type: QUERY +PREHOOK: Input: default@legacy_table +#### A masked pattern was here #### +POSTHOOK: query: select * from legacy_table +POSTHOOK: type: QUERY +POSTHOOK: Input: default@legacy_table +#### A masked pattern was here #### +2012-02-21 07:08:09.123 +2014-02-11 07:08:09.123 +1947-02-11 07:08:09.123 +8200-02-11 07:08:09.123 1012-02-27 07:15:11.123 1014-02-17 07:15:11.123 0947-02-16 07:15:11.123 diff --git a/ql/src/test/results/clientpositive/llap/avro_timestamp2.q.out b/ql/src/test/results/clientpositive/llap/avro_timestamp2.q.out new file mode 100644 index 0000000..6c374db --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/avro_timestamp2.q.out @@ -0,0 +1,66 @@ +PREHOOK: query: create table legacy_table_avro1 (date_test timestamp) +stored as avro +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@legacy_table_avro1 +POSTHOOK: query: create table legacy_table_avro1 (date_test timestamp) +stored as avro +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@legacy_table_avro1 +PREHOOK: query: load data local inpath '../../data/files/tbl_avro1/' into table legacy_table_avro1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@legacy_table_avro1 +POSTHOOK: query: load data local inpath '../../data/files/tbl_avro1/' into table legacy_table_avro1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@legacy_table_avro1 +PREHOOK: query: select * from legacy_table_avro1 +PREHOOK: type: QUERY +PREHOOK: Input: default@legacy_table_avro1 +#### A masked 
pattern was here #### +POSTHOOK: query: select * from legacy_table_avro1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@legacy_table_avro1 +#### A masked pattern was here #### +2020-04-08 13:17:05.215 +1900-01-01 00:00:00 +PREHOOK: query: select * from legacy_table_avro1 +PREHOOK: type: QUERY +PREHOOK: Input: default@legacy_table_avro1 +#### A masked pattern was here #### +POSTHOOK: query: select * from legacy_table_avro1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@legacy_table_avro1 +#### A masked pattern was here #### +2020-04-08 13:17:05.215 +1900-01-01 00:00:00 +PREHOOK: query: select * from legacy_table_avro1 +PREHOOK: type: QUERY +PREHOOK: Input: default@legacy_table_avro1 +#### A masked pattern was here #### +POSTHOOK: query: select * from legacy_table_avro1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@legacy_table_avro1 +#### A masked pattern was here #### +2020-04-08 13:17:05.215 +1900-01-01 00:00:00 +PREHOOK: query: select * from legacy_table_avro1 +PREHOOK: type: QUERY +PREHOOK: Input: default@legacy_table_avro1 +#### A masked pattern was here #### +POSTHOOK: query: select * from legacy_table_avro1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@legacy_table_avro1 +#### A masked pattern was here #### +2020-04-08 13:17:05.215 +1900-01-01 00:00:00 +PREHOOK: query: drop table legacy_table_avro1 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@legacy_table_avro1 +PREHOOK: Output: default@legacy_table_avro1 +POSTHOOK: query: drop table legacy_table_avro1 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@legacy_table_avro1 +POSTHOOK: Output: default@legacy_table_avro1 diff --git a/ql/src/test/results/clientpositive/llap/parquet_legacy_mixed_timestamp.q.out b/ql/src/test/results/clientpositive/llap/parquet_legacy_mixed_timestamp.q.out index 1259318..165f5b3 100644 --- a/ql/src/test/results/clientpositive/llap/parquet_legacy_mixed_timestamp.q.out +++ b/ql/src/test/results/clientpositive/llap/parquet_legacy_mixed_timestamp.q.out @@ -28,6 +28,22 @@ POSTHOOK: Input: default@legacy_table 2014-02-11 07:08:09.123 1947-02-11 07:08:09.123 8200-02-11 07:08:09.123 +1012-02-21 07:08:09.123 +1014-02-11 07:08:09.123 +0947-02-11 07:08:09.123 +0200-02-11 07:08:09.123 +PREHOOK: query: select * from legacy_table +PREHOOK: type: QUERY +PREHOOK: Input: default@legacy_table +#### A masked pattern was here #### +POSTHOOK: query: select * from legacy_table +POSTHOOK: type: QUERY +POSTHOOK: Input: default@legacy_table +#### A masked pattern was here #### +2012-02-21 07:08:09.123 +2014-02-11 07:08:09.123 +1947-02-11 07:08:09.123 +8200-02-11 07:08:09.123 1012-02-21 07:15:11.123 1014-02-11 07:15:11.123 0947-02-11 07:15:11.123 diff --git a/ql/src/test/results/clientpositive/llap/parquet_timestamp.q.out b/ql/src/test/results/clientpositive/llap/parquet_timestamp.q.out new file mode 100644 index 0000000..275e292 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/parquet_timestamp.q.out @@ -0,0 +1,78 @@ +PREHOOK: query: create table legacy_table_parq1 (date_test timestamp) +stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@legacy_table_parq1 +POSTHOOK: query: create table legacy_table_parq1 (date_test timestamp) +stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@legacy_table_parq1 +PREHOOK: query: load data local inpath '../../data/files/tbl_parq1/' into table legacy_table_parq1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: 
default@legacy_table_parq1 +POSTHOOK: query: load data local inpath '../../data/files/tbl_parq1/' into table legacy_table_parq1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@legacy_table_parq1 +PREHOOK: query: select * from legacy_table_parq1 +PREHOOK: type: QUERY +PREHOOK: Input: default@legacy_table_parq1 +#### A masked pattern was here #### +POSTHOOK: query: select * from legacy_table_parq1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@legacy_table_parq1 +#### A masked pattern was here #### +2020-04-08 13:17:05.215 +1900-01-01 00:00:00 +1900-01-01 00:00:00 +2020-04-08 13:17:05.215 +1899-12-31 22:55:25 +PREHOOK: query: select * from legacy_table_parq1 +PREHOOK: type: QUERY +PREHOOK: Input: default@legacy_table_parq1 +#### A masked pattern was here #### +POSTHOOK: query: select * from legacy_table_parq1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@legacy_table_parq1 +#### A masked pattern was here #### +2020-04-08 13:17:05.215 +1899-12-31 22:55:25 +1900-01-01 00:00:00 +2020-04-08 13:17:05.215 +1899-12-31 22:55:25 +PREHOOK: query: select * from legacy_table_parq1 +PREHOOK: type: QUERY +PREHOOK: Input: default@legacy_table_parq1 +#### A masked pattern was here #### +POSTHOOK: query: select * from legacy_table_parq1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@legacy_table_parq1 +#### A masked pattern was here #### +2020-04-08 13:17:05.215 +1900-01-01 00:00:00 +1900-01-01 00:00:00 +2020-04-08 13:17:05.215 +1899-12-31 22:55:25 +PREHOOK: query: select * from legacy_table_parq1 +PREHOOK: type: QUERY +PREHOOK: Input: default@legacy_table_parq1 +#### A masked pattern was here #### +POSTHOOK: query: select * from legacy_table_parq1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@legacy_table_parq1 +#### A masked pattern was here #### +2020-04-08 13:17:05.215 +1899-12-31 22:55:25 +1900-01-01 00:00:00 +2020-04-08 13:17:05.215 +1899-12-31 22:55:25 +PREHOOK: query: drop table legacy_table_parq1 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@legacy_table_parq1 +PREHOOK: Output: default@legacy_table_parq1 +POSTHOOK: query: drop table legacy_table_parq1 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@legacy_table_parq1 +POSTHOOK: Output: default@legacy_table_parq1 diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroDeserializer.java b/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroDeserializer.java index 27583b8..128cfa9 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroDeserializer.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroDeserializer.java @@ -355,12 +355,15 @@ class AvroDeserializer { } else { skipUTCConversion = HiveConf.ConfVars.HIVE_AVRO_TIMESTAMP_SKIP_CONVERSION.defaultBoolVal; } + boolean legacyConversion = false; ZoneId convertToTimeZone; if (writerTimezone != null) { convertToTimeZone = writerTimezone; } else if (skipUTCConversion) { convertToTimeZone = ZoneOffset.UTC; } else { + legacyConversion = configuration != null && HiveConf.getBoolVar( + configuration, HiveConf.ConfVars.HIVE_AVRO_TIMESTAMP_LEGACY_CONVERSION_ENABLED); convertToTimeZone = TimeZone.getDefault().toZoneId(); } final boolean skipProlepticConversion; @@ -375,7 +378,7 @@ class AvroDeserializer { } } Timestamp timestamp = TimestampTZUtil.convertTimestampToZone( - Timestamp.ofEpochMilli((Long) datum), ZoneOffset.UTC, convertToTimeZone); + Timestamp.ofEpochMilli((Long) datum), ZoneOffset.UTC, convertToTimeZone, legacyConversion); if (!skipProlepticConversion) { timestamp = Timestamp.ofEpochMilli( 
CalendarUtils.convertTimeToProleptic(timestamp.toEpochMilli()));
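As a closing aside, a tiny sketch of how the two flags introduced above can be checked programmatically; the class name is illustrative and it assumes hive-common plus its Hadoop dependencies on the classpath. Per their definitions in HiveConf both default to true, and as the Parquet/Avro reader changes above show, the legacy path is only taken for files whose metadata carries no writer time zone:

    import org.apache.hadoop.hive.conf.HiveConf;

    public class LegacyConversionFlagCheck {
      public static void main(String[] args) {
        HiveConf conf = new HiveConf();
        // Both flags default to true, i.e. the pre-java.time conversion rules are used
        // for files that do not record the writer's time zone in their metadata.
        System.out.println(HiveConf.getBoolVar(
            conf, HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_LEGACY_CONVERSION_ENABLED));
        System.out.println(HiveConf.getBoolVar(
            conf, HiveConf.ConfVars.HIVE_AVRO_TIMESTAMP_LEGACY_CONVERSION_ENABLED));
      }
    }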