This is an automated email from the ASF dual-hosted git repository.

zabetak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
commit ebaba1436831831e5c60227c476e5fc362012fe4
Author: Stamatis Zampetakis <zabe...@gmail.com>
AuthorDate: Mon May 30 12:18:06 2022 +0200

    HIVE-26270: Wrong timestamps when reading Hive 3.1.x Parquet files with vectorized reader (Stamatis Zampetakis, reviewed by Peter Vary)

    Closes #3338
---
 data/files/employee_hive_3_1_3_us_pacific.parquet  | Bin 0 -> 446 bytes
 .../ql/io/parquet/ParquetRecordReaderBase.java     |  9 +---
 .../io/parquet/read/DataWritableReadSupport.java   | 39 ++++++++------
 ...rquet_timestamp_int96_compatibility_hive3_1_3.q | 24 +++++++++
 ...t_timestamp_int96_compatibility_hive3_1_3.q.out | 60 +++++++++++++++++++++
 5 files changed, 110 insertions(+), 22 deletions(-)

diff --git a/data/files/employee_hive_3_1_3_us_pacific.parquet b/data/files/employee_hive_3_1_3_us_pacific.parquet
new file mode 100644
index 0000000000..2125bc688d
Binary files /dev/null and b/data/files/employee_hive_3_1_3_us_pacific.parquet differ

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
index 5235edc114..4cc32ae480 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
@@ -16,11 +16,9 @@ package org.apache.hadoop.hive.ql.io.parquet;
 import com.google.common.base.Strings;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.conf.HiveConf;
-import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
 import org.apache.hadoop.hive.ql.io.IOConstants;
 import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport;
 import org.apache.hadoop.hive.ql.io.parquet.read.ParquetFilterPredicateConverter;
-import org.apache.hadoop.hive.ql.io.parquet.write.DataWritableWriteSupport;
 import org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg;
 import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
 import org.apache.hadoop.hive.serde2.SerDeStats;
@@ -143,11 +141,8 @@ public class ParquetRecordReaderBase {
         skipProlepticConversion = HiveConf.getBoolVar(
             conf, HiveConf.ConfVars.HIVE_PARQUET_DATE_PROLEPTIC_GREGORIAN_DEFAULT);
       }
-      legacyConversionEnabled = HiveConf.getBoolVar(conf, ConfVars.HIVE_PARQUET_TIMESTAMP_LEGACY_CONVERSION_ENABLED);
-      if (fileMetaData.getKeyValueMetaData().containsKey(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY)) {
-        legacyConversionEnabled = Boolean.parseBoolean(
-            fileMetaData.getKeyValueMetaData().get(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY));
-      }
+      legacyConversionEnabled =
+          DataWritableReadSupport.getZoneConversionLegacy(fileMetaData.getKeyValueMetaData(), conf);

       split = new ParquetInputSplit(finalPath,
           splitStart,

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java
index 6aa6d2e412..ecdd155a31 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java
@@ -289,6 +289,28 @@ public class DataWritableReadSupport extends ReadSupport<ArrayWritable> {
     return null;
   }

+  /**
+   * Returns whether legacy zone conversion should be used for transforming timestamps based on file metadata and
+   * configuration.
+   *
+   * @see ConfVars#HIVE_PARQUET_TIMESTAMP_LEGACY_CONVERSION_ENABLED
+   */
+  public static boolean getZoneConversionLegacy(Map<String, String> metadata, Configuration conf) {
+    assert conf != null : "Configuration must not be null";
+    if (metadata != null) {
+      if (metadata.containsKey(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY)) {
+        return Boolean.parseBoolean(metadata.get(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY));
+      }
+      // There is no explicit metadata about the legacy conversion
+      if (metadata.containsKey(DataWritableWriteSupport.WRITER_TIMEZONE)) {
+        // There is metadata about the timezone, thus we can infer that the new APIs were used when the file was written.
+        return false;
+      }
+    }
+    // There is no (relevant) metadata in the file, so use the configuration
+    return HiveConf.getBoolVar(conf, ConfVars.HIVE_PARQUET_TIMESTAMP_LEGACY_CONVERSION_ENABLED);
+  }
+
   /**
    * Return the columns which contains required nested attribute level
    * E.g., given struct a:<x:int, y:int> while 'x' is required and 'y' is not, the method will return
@@ -509,21 +531,8 @@ public class DataWritableReadSupport extends ReadSupport<ArrayWritable> {
       }

       if (!metadata.containsKey(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY)) {
-        final String legacyConversion;
-        if(keyValueMetaData.containsKey(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY)) {
-          // If there is meta about the legacy conversion then the file should be read in the same way it was written.
-          legacyConversion = keyValueMetaData.get(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY);
-        } else if(keyValueMetaData.containsKey(DataWritableWriteSupport.WRITER_TIMEZONE)) {
-          // If there is no meta about the legacy conversion but there is meta about the timezone then we can infer the
-          // file was written with the new rules.
-          legacyConversion = "false";
-        } else {
-          // If there is no meta at all then it is not possible to determine which rules were used to write the file.
-          // Choose between old/new rules using the respective configuration property.
-          legacyConversion = String.valueOf(
-              HiveConf.getBoolVar(configuration, ConfVars.HIVE_PARQUET_TIMESTAMP_LEGACY_CONVERSION_ENABLED));
-        }
-        metadata.put(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY, legacyConversion);
+        boolean legacyConversion = getZoneConversionLegacy(keyValueMetaData, configuration);
+        metadata.put(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY, String.valueOf(legacyConversion));
       } else {
         String ctxMeta = metadata.get(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY);
         String fileMeta = keyValueMetaData.get(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY);

diff --git a/ql/src/test/queries/clientpositive/parquet_timestamp_int96_compatibility_hive3_1_3.q b/ql/src/test/queries/clientpositive/parquet_timestamp_int96_compatibility_hive3_1_3.q
new file mode 100644
index 0000000000..c14e5fe02c
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/parquet_timestamp_int96_compatibility_hive3_1_3.q
@@ -0,0 +1,24 @@
+create table employee (eid INT, birth timestamp) stored as parquet;
+
+-- The parquet file was written with Hive 3.1.3 using the new Date/Time APIs (legacy=false) to
+-- convert from US/Pacific to UTC. The presence of writer.time.zone in the metadata of the file
+-- allows us to infer that the new Date/Time APIs should be used for the conversion. The
+-- hive.parquet.timestamp.legacy.conversion.enabled property shouldn't be taken into account in this
+-- case.
+LOAD DATA LOCAL INPATH '../../data/files/employee_hive_3_1_3_us_pacific.parquet' into table employee;
+
+-- Read timestamps using the non-vectorized reader
+set hive.vectorized.execution.enabled=false;
+set hive.parquet.timestamp.legacy.conversion.enabled=true;
+select * from employee;
+set hive.parquet.timestamp.legacy.conversion.enabled=false;
+select * from employee;
+
+-- Read timestamps using the vectorized reader
+set hive.vectorized.execution.enabled=true;
+-- Disable task conversion to allow vectorization to kick in
+set hive.fetch.task.conversion=none;
+set hive.parquet.timestamp.legacy.conversion.enabled=true;
+select * from employee;
+set hive.parquet.timestamp.legacy.conversion.enabled=false;
+select * from employee;

diff --git a/ql/src/test/results/clientpositive/llap/parquet_timestamp_int96_compatibility_hive3_1_3.q.out b/ql/src/test/results/clientpositive/llap/parquet_timestamp_int96_compatibility_hive3_1_3.q.out
new file mode 100644
index 0000000000..560fc65b14
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/parquet_timestamp_int96_compatibility_hive3_1_3.q.out
@@ -0,0 +1,60 @@
+PREHOOK: query: create table employee (eid INT, birth timestamp) stored as parquet
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@employee
+POSTHOOK: query: create table employee (eid INT, birth timestamp) stored as parquet
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@employee
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/employee_hive_3_1_3_us_pacific.parquet' into table employee
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@employee
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/employee_hive_3_1_3_us_pacific.parquet' into table employee
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@employee
+PREHOOK: query: select * from employee
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select * from employee
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+1	1880-01-01 00:00:00
+2	1884-01-01 00:00:00
+3	1990-01-01 00:00:00
+PREHOOK: query: select * from employee
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select * from employee
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+1	1880-01-01 00:00:00
+2	1884-01-01 00:00:00
+3	1990-01-01 00:00:00
+PREHOOK: query: select * from employee
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select * from employee
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+1	1880-01-01 00:00:00
+2	1884-01-01 00:00:00
+3	1990-01-01 00:00:00
+PREHOOK: query: select * from employee
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select * from employee
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+1	1880-01-01 00:00:00
+2	1884-01-01 00:00:00
+3	1990-01-01 00:00:00
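
For reference, below is a minimal, self-contained sketch of the precedence that the new DataWritableReadSupport#getZoneConversionLegacy helper encodes: explicit legacy-conversion metadata in the file wins; otherwise the presence of the writer timezone key implies the new Date/Time APIs; otherwise the configured default applies. The "writer.time.zone" key string is taken from the test comments above; the exact string behind DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY is an assumption made for this illustration, and the Hive Configuration/HiveConf lookup is replaced by a plain boolean default.

    import java.util.Map;

    public class ZoneConversionLegacySketch {
      // "writer.time.zone" is named in the test comments above; the legacy-conversion
      // key string below is an assumption for illustration, not copied from Hive sources.
      static final String WRITER_TIMEZONE = "writer.time.zone";
      static final String WRITER_ZONE_CONVERSION_LEGACY = "writer.zone.conversion.legacy";

      /**
       * Mirrors the precedence of DataWritableReadSupport#getZoneConversionLegacy,
       * with confDefault standing in for hive.parquet.timestamp.legacy.conversion.enabled.
       */
      static boolean zoneConversionLegacy(Map<String, String> metadata, boolean confDefault) {
        if (metadata != null) {
          String explicit = metadata.get(WRITER_ZONE_CONVERSION_LEGACY);
          if (explicit != null) {
            // The file records how it was written; read it the same way.
            return Boolean.parseBoolean(explicit);
          }
          if (metadata.containsKey(WRITER_TIMEZONE)) {
            // Only writers using the new APIs emit the timezone key,
            // so legacy conversion must not be applied.
            return false;
          }
        }
        // No relevant metadata at all: fall back to the configuration property.
        return confDefault;
      }

      public static void main(String[] args) {
        // A Hive 3.1.3 file carrying writer.time.zone: the property is ignored.
        System.out.println(zoneConversionLegacy(Map.of(WRITER_TIMEZONE, "US/Pacific"), true)); // false
        // A file with no timestamp metadata: the property decides.
        System.out.println(zoneConversionLegacy(Map.of(), true)); // true
      }
    }

Under this precedence, the q-file above exercises the second branch: both settings of hive.parquet.timestamp.legacy.conversion.enabled produce the same rows in the .q.out because writer.time.zone is present in the test file.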