This is an automated email from the ASF dual-hosted git repository.

zabetak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
commit ebaba1436831831e5c60227c476e5fc362012fe4
Author: Stamatis Zampetakis <zabe...@gmail.com>
AuthorDate: Mon May 30 12:18:06 2022 +0200

    HIVE-26270: Wrong timestamps when reading Hive 3.1.x Parquet files with vectorized reader (Stamatis Zampetakis, reviewed by Peter Vary)

    Closes #3338
---
 data/files/employee_hive_3_1_3_us_pacific.parquet  | Bin 0 -> 446 bytes
 .../ql/io/parquet/ParquetRecordReaderBase.java     |  9 +---
 .../io/parquet/read/DataWritableReadSupport.java   | 39 ++++++++------
 ...rquet_timestamp_int96_compatibility_hive3_1_3.q | 24 +++++++++
 ...t_timestamp_int96_compatibility_hive3_1_3.q.out | 60 +++++++++++++++++++++
 5 files changed, 110 insertions(+), 22 deletions(-)

diff --git a/data/files/employee_hive_3_1_3_us_pacific.parquet b/data/files/employee_hive_3_1_3_us_pacific.parquet
new file mode 100644
index 0000000000..2125bc688d
Binary files /dev/null and b/data/files/employee_hive_3_1_3_us_pacific.parquet differ

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
index 5235edc114..4cc32ae480 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
@@ -16,11 +16,9 @@ package org.apache.hadoop.hive.ql.io.parquet;
 import com.google.common.base.Strings;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.conf.HiveConf;
-import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
 import org.apache.hadoop.hive.ql.io.IOConstants;
 import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport;
 import org.apache.hadoop.hive.ql.io.parquet.read.ParquetFilterPredicateConverter;
-import org.apache.hadoop.hive.ql.io.parquet.write.DataWritableWriteSupport;
 import org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg;
 import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
 import org.apache.hadoop.hive.serde2.SerDeStats;
@@ -143,11 +141,8 @@ public class ParquetRecordReaderBase {
         skipProlepticConversion = HiveConf.getBoolVar(
             conf, HiveConf.ConfVars.HIVE_PARQUET_DATE_PROLEPTIC_GREGORIAN_DEFAULT);
       }
-      legacyConversionEnabled = HiveConf.getBoolVar(conf, ConfVars.HIVE_PARQUET_TIMESTAMP_LEGACY_CONVERSION_ENABLED);
-      if (fileMetaData.getKeyValueMetaData().containsKey(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY)) {
-        legacyConversionEnabled = Boolean.parseBoolean(
-            fileMetaData.getKeyValueMetaData().get(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY));
-      }
+      legacyConversionEnabled =
+          DataWritableReadSupport.getZoneConversionLegacy(fileMetaData.getKeyValueMetaData(), conf);

       split = new ParquetInputSplit(finalPath,
           splitStart,

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java
index 6aa6d2e412..ecdd155a31 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/DataWritableReadSupport.java
@@ -289,6 +289,28 @@ public class DataWritableReadSupport extends ReadSupport<ArrayWritable> {
     return null;
   }

+  /**
+   * Returns whether legacy zone conversion should be used for transforming timestamps based on file metadata and
+   * configuration.
+   *
+   * @see ConfVars#HIVE_PARQUET_TIMESTAMP_LEGACY_CONVERSION_ENABLED
+   */
+  public static boolean getZoneConversionLegacy(Map<String, String> metadata, Configuration conf) {
+    assert conf != null : "Configuration must not be null";
+    if (metadata != null) {
+      if (metadata.containsKey(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY)) {
+        return Boolean.parseBoolean(metadata.get(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY));
+      }
+      // There is no explicit metadata about the legacy conversion
+      if (metadata.containsKey(DataWritableWriteSupport.WRITER_TIMEZONE)) {
+        // There is metadata about the timezone, thus we can infer that the new APIs were used when the file was written.
+        return false;
+      }
+    }
+    // There is no (relevant) metadata in the file, so use the configuration
+    return HiveConf.getBoolVar(conf, ConfVars.HIVE_PARQUET_TIMESTAMP_LEGACY_CONVERSION_ENABLED);
+  }
+
   /**
    * Return the columns which contains required nested attribute level
    * E.g., given struct a:<x:int, y:int> while 'x' is required and 'y' is not, the method will return
@@ -509,21 +531,8 @@ public class DataWritableReadSupport extends ReadSupport<ArrayWritable> {
       }

       if (!metadata.containsKey(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY)) {
-        final String legacyConversion;
-        if(keyValueMetaData.containsKey(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY)) {
-          // If there is meta about the legacy conversion then the file should be read in the same way it was written.
-          legacyConversion = keyValueMetaData.get(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY);
-        } else if(keyValueMetaData.containsKey(DataWritableWriteSupport.WRITER_TIMEZONE)) {
-          // If there is no meta about the legacy conversion but there is meta about the timezone then we can infer the
-          // file was written with the new rules.
-          legacyConversion = "false";
-        } else {
-          // If there is no meta at all then it is not possible to determine which rules were used to write the file.
-          // Choose between old/new rules using the respective configuration property.
-          legacyConversion = String.valueOf(
-              HiveConf.getBoolVar(configuration, ConfVars.HIVE_PARQUET_TIMESTAMP_LEGACY_CONVERSION_ENABLED));
-        }
-        metadata.put(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY, legacyConversion);
+        boolean legacyConversion = getZoneConversionLegacy(keyValueMetaData, configuration);
+        metadata.put(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY, String.valueOf(legacyConversion));
       } else {
         String ctxMeta = metadata.get(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY);
         String fileMeta = keyValueMetaData.get(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY);

diff --git a/ql/src/test/queries/clientpositive/parquet_timestamp_int96_compatibility_hive3_1_3.q b/ql/src/test/queries/clientpositive/parquet_timestamp_int96_compatibility_hive3_1_3.q
new file mode 100644
index 0000000000..c14e5fe02c
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/parquet_timestamp_int96_compatibility_hive3_1_3.q
@@ -0,0 +1,24 @@
+create table employee (eid INT, birth timestamp) stored as parquet;
+
+-- The parquet file was written with Hive 3.1.3 using the new Date/Time APIs (legacy=false) to
+-- convert from US/Pacific to UTC. The presence of writer.time.zone in the metadata of the file
+-- allows us to infer that the new Date/Time APIs should be used for the conversion. The
+-- hive.parquet.timestamp.legacy.conversion.enabled property shouldn't be taken into account in this
+-- case.
+LOAD DATA LOCAL INPATH '../../data/files/employee_hive_3_1_3_us_pacific.parquet' into table employee;
+
+-- Read timestamps using the non-vectorized reader
+set hive.vectorized.execution.enabled=false;
+set hive.parquet.timestamp.legacy.conversion.enabled=true;
+select * from employee;
+set hive.parquet.timestamp.legacy.conversion.enabled=false;
+select * from employee;
+
+-- Read timestamps using the vectorized reader
+set hive.vectorized.execution.enabled=true;
+-- Disable task conversion to allow vectorization to kick in
+set hive.fetch.task.conversion=none;
+set hive.parquet.timestamp.legacy.conversion.enabled=true;
+select * from employee;
+set hive.parquet.timestamp.legacy.conversion.enabled=false;
+select * from employee;

diff --git a/ql/src/test/results/clientpositive/llap/parquet_timestamp_int96_compatibility_hive3_1_3.q.out b/ql/src/test/results/clientpositive/llap/parquet_timestamp_int96_compatibility_hive3_1_3.q.out
new file mode 100644
index 0000000000..560fc65b14
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/parquet_timestamp_int96_compatibility_hive3_1_3.q.out
@@ -0,0 +1,60 @@
+PREHOOK: query: create table employee (eid INT, birth timestamp) stored as parquet
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@employee
+POSTHOOK: query: create table employee (eid INT, birth timestamp) stored as parquet
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@employee
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/employee_hive_3_1_3_us_pacific.parquet' into table employee
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@employee
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/employee_hive_3_1_3_us_pacific.parquet' into table employee
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@employee
+PREHOOK: query: select * from employee
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select * from employee
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+1	1880-01-01 00:00:00
+2	1884-01-01 00:00:00
+3	1990-01-01 00:00:00
+PREHOOK: query: select * from employee
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select * from employee
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+1	1880-01-01 00:00:00
+2	1884-01-01 00:00:00
+3	1990-01-01 00:00:00
+PREHOOK: query: select * from employee
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select * from employee
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+1	1880-01-01 00:00:00
+2	1884-01-01 00:00:00
+3	1990-01-01 00:00:00
+PREHOOK: query: select * from employee
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select * from employee
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+1	1880-01-01 00:00:00
+2	1884-01-01 00:00:00
+3	1990-01-01 00:00:00
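
For reference, below is a minimal, self-contained sketch of the precedence that the new DataWritableReadSupport#getZoneConversionLegacy helper encodes: explicit legacy-conversion metadata in the file wins; otherwise the presence of the writer timezone key implies the new Date/Time APIs; otherwise the configured default applies. The "writer.time.zone" key string is taken from the test comments above; the exact string behind DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY is an assumption made for this illustration, and the Hive Configuration/HiveConf lookup is replaced by a plain boolean default.

    import java.util.Map;

    public class ZoneConversionLegacySketch {
      // "writer.time.zone" is named in the test comments above; the legacy-conversion
      // key string below is an assumption for illustration, not copied from Hive sources.
      static final String WRITER_TIMEZONE = "writer.time.zone";
      static final String WRITER_ZONE_CONVERSION_LEGACY = "writer.zone.conversion.legacy";

      /**
       * Mirrors the precedence of DataWritableReadSupport#getZoneConversionLegacy,
       * with confDefault standing in for hive.parquet.timestamp.legacy.conversion.enabled.
       */
      static boolean zoneConversionLegacy(Map<String, String> metadata, boolean confDefault) {
        if (metadata != null) {
          String explicit = metadata.get(WRITER_ZONE_CONVERSION_LEGACY);
          if (explicit != null) {
            // The file records how it was written; read it the same way.
            return Boolean.parseBoolean(explicit);
          }
          if (metadata.containsKey(WRITER_TIMEZONE)) {
            // Only writers using the new APIs emit the timezone key,
            // so legacy conversion must not be applied.
            return false;
          }
        }
        // No relevant metadata at all: fall back to the configuration property.
        return confDefault;
      }

      public static void main(String[] args) {
        // A Hive 3.1.3 file carrying writer.time.zone: the property is ignored.
        System.out.println(zoneConversionLegacy(Map.of(WRITER_TIMEZONE, "US/Pacific"), true)); // false
        // A file with no timestamp metadata: the property decides.
        System.out.println(zoneConversionLegacy(Map.of(), true)); // true
      }
    }

Under this precedence, the q-file above exercises the second branch: both settings of hive.parquet.timestamp.legacy.conversion.enabled produce the same rows in the .q.out because writer.time.zone is present in the test file.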