hive git commit: HIVE-16231: Parquet timestamp may be stored differently since HIVE-12767 (Barna Zsombor Klara, reviewed by Sergio Pena)

spena Tue, 28 Mar 2017 12:08:49 -0700

Repository: hive
Updated Branches:
  refs/heads/branch-2 122350053 -> 8b866562b



HIVE-16231: Parquet timestamp may be stored differently since HIVE-12767 (Barna Zsombor Klara, reviewed by Sergio Pena)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/8b866562
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/8b866562
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/8b866562

Branch: refs/heads/branch-2
Commit: 8b866562b16a2b10880a4296fe133ef007a85c77
Parents: 1223500
Author: Barna Zsombor Klara <zsombor.kl...@cloudera.com>
Authored: Tue Mar 28 12:03:02 2017 -0700
Committer: Sergio Pena <sergio.p...@cloudera.com>
Committed: Tue Mar 28 12:04:32 2017 -0700

----------------------------------------------------------------------
 .../ql/io/parquet/MapredParquetOutputFormat.java     | 10 ++++------
 .../hive/ql/io/parquet/ParquetRecordReaderBase.java  | 14 +++++---------
 .../hive/ql/io/parquet/timestamp/NanoTimeUtils.java  | 15 ++++++++++++++-
 .../ql/io/parquet/timestamp/TestNanoTimeUtils.java   | 13 +++++++++++++
 .../queries/clientpositive/parquet_int96_timestamp.q |  2 +-
 5 files changed, 37 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/8b866562/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetOutputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetOutputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetOutputFormat.java
index 26f1e75..a7bb5ee 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetOutputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetOutputFormat.java
@@ -21,6 +21,7 @@ import java.util.Properties;
 import java.util.TimeZone;
 
 import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetTableUtils;
+import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTimeUtils;
 import org.apache.parquet.Strings;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -139,14 +140,11 @@ public class MapredParquetOutputFormat extends FileOutputFormat<NullWritable, Pa
     String timeZoneID =
         tableProperties.getProperty(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY);
     if (!Strings.isNullOrEmpty(timeZoneID)) {
-      if (!Arrays.asList(TimeZone.getAvailableIDs()).contains(timeZoneID)) {
-        throw new IllegalStateException("Unexpected timezone id found for parquet int96 conversion: " + timeZoneID);
-      }
+
+      NanoTimeUtils.validateTimeZone(timeZoneID);
       return TimeZone.getTimeZone(timeZoneID);
     }
 
-    // If no timezone is defined in table properties, then adjust timestamps using
-    // PARQUET_INT96_NO_ADJUSTMENT_ZONE timezone
-    return TimeZone.getTimeZone(ParquetTableUtils.PARQUET_INT96_NO_ADJUSTMENT_ZONE);
+    return TimeZone.getDefault();
   }
 }

http://git-wip-us.apache.org/repos/asf/hive/blob/8b866562/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
index 8e33b7d..2954601 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
@@ -20,6 +20,7 @@ import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport;
 import org.apache.hadoop.hive.ql.io.parquet.read.ParquetFilterPredicateConverter;
 import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetTableUtils;
+import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTimeUtils;
 import org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg;
 import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
 import org.apache.hadoop.hive.serde2.SerDeStats;
@@ -44,7 +45,6 @@ import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.List;
 import java.util.TimeZone;
 
@@ -170,7 +170,7 @@ public class ParquetRecordReaderBase {
     boolean skipConversion = HiveConf.getBoolVar(configuration,
         HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_SKIP_CONVERSION);
     FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
-    if (!Strings.nullToEmpty(fileMetaData.getCreatedBy()).startsWith("parquet-mr") ||
+    if (!Strings.nullToEmpty(fileMetaData.getCreatedBy()).startsWith("parquet-mr") &&
         skipConversion) {
       // Impala writes timestamp values using GMT only. We should not try to convert Impala
       // files to other type of timezones.
@@ -179,16 +179,12 @@ public class ParquetRecordReaderBase {
       // TABLE_PARQUET_INT96_TIMEZONE is a table property used to detect what timezone conversion
       // to use when reading Parquet timestamps.
       timeZoneID = configuration.get(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY,
-          ParquetTableUtils.PARQUET_INT96_NO_ADJUSTMENT_ZONE);
-
-      if (!Arrays.asList(TimeZone.getAvailableIDs()).contains(timeZoneID)) {
-          throw new IllegalStateException("Unexpected timezone id found for parquet int96 conversion: " + timeZoneID);
-      }
+          TimeZone.getDefault().getID());
+      NanoTimeUtils.validateTimeZone(timeZoneID);
     }
 
     // 'timeZoneID' should be valid, since we did not throw exception above
-    configuration.set(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY,
-        TimeZone.getTimeZone(timeZoneID).getID());
+    configuration.set(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY,timeZoneID);
   }
 
   public FilterCompat.Filter setFilter(final JobConf conf, MessageType schema) {

http://git-wip-us.apache.org/repos/asf/hive/blob/8b866562/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/NanoTimeUtils.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/NanoTimeUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/NanoTimeUtils.java
index 5dc8088..dbd6fb3 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/NanoTimeUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/NanoTimeUtils.java
@@ -152,13 +152,26 @@ public class NanoTimeUtils {
 
     calendar.setTimeInMillis(utcCalendar.getTimeInMillis());
 
-    Calendar adjusterCalendar = copyToCalendarWithTZ(calendar, Calendar.getInstance());
+    Calendar adjusterCalendar = copyToCalendarWithTZ(calendar, getLocalCalendar());
 
     Timestamp ts = new Timestamp(adjusterCalendar.getTimeInMillis());
     ts.setNanos((int) nanos);
     return ts;
   }
 
+  /**
+   * Check if the string id is a valid java TimeZone id.
+   * TimeZone#getTimeZone will return "GMT" if the id cannot be understood.
+   * @param timeZoneID
+   */
+  public static void validateTimeZone(String timeZoneID) {
+    if (TimeZone.getTimeZone(timeZoneID).getID().equals("GMT")
+        && !"GMT".equals(timeZoneID)) {
+      throw new IllegalStateException(
+          "Unexpected timezone id found for parquet int96 conversion: " + timeZoneID);
+    }
+  }
+
   private static Calendar copyToCalendarWithTZ(Calendar from, Calendar to) {
     if(from.getTimeZone().getID().equals(to.getTimeZone().getID())) {
       return from;

http://git-wip-us.apache.org/repos/asf/hive/blob/8b866562/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/timestamp/TestNanoTimeUtils.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/timestamp/TestNanoTimeUtils.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/timestamp/TestNanoTimeUtils.java
index 37cf0e2..1e10dbf 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/timestamp/TestNanoTimeUtils.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/timestamp/TestNanoTimeUtils.java
@@ -230,4 +230,17 @@ public class TestNanoTimeUtils {
     Assert.assertEquals(newNTUTC.getJulianDay(), depNTUTC.getJulianDay());
     Assert.assertEquals(newNTUTC.getTimeOfDayNanos(), depNTUTC.getTimeOfDayNanos());
   }
+
+  @Test
+  public void testTimeZoneValidationWithCorrectZoneId() {
+    NanoTimeUtils.validateTimeZone("GMT");
+    NanoTimeUtils.validateTimeZone("UTC");
+    NanoTimeUtils.validateTimeZone("GMT+10");
+    NanoTimeUtils.validateTimeZone("Europe/Budapest");
+  }
+
+  @Test(expected = IllegalStateException.class)
+  public void testTimeZoneValidationWithIncorrectZoneId() {
+    NanoTimeUtils.validateTimeZone("UCC");
+  }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/hive/blob/8b866562/ql/src/test/queries/clientpositive/parquet_int96_timestamp.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/parquet_int96_timestamp.q b/ql/src/test/queries/clientpositive/parquet_int96_timestamp.q
index 5de2c3f..6eadd1b 100644
--- a/ql/src/test/queries/clientpositive/parquet_int96_timestamp.q
+++ b/ql/src/test/queries/clientpositive/parquet_int96_timestamp.q
@@ -2,7 +2,7 @@ create table dummy (id int);
 insert into table dummy values (1);
 
 set hive.parquet.mr.int96.enable.utc.write.zone=true;
-set hive.parquet.timestamp.skip.conversion=false;
+set hive.parquet.timestamp.skip.conversion=true;
 
 -- read/write timestamps using UTC as default write zone
 create table timestamps (ts timestamp) stored as parquet;

hive git commit: HIVE-16231: Parquet timestamp may be stored differently since HIVE-12767 (Barna Zsombor Klara, reviewed by Sergio Pena)

Reply via email to