Repository: hive Updated Branches: refs/heads/branch-2 122350053 -> 8b866562b
HIVE-16231: Parquet timestamp may be stored differently since HIVE-12767 (Barna Zsombor Klara, reviewed by Sergio Pena) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/8b866562 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/8b866562 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/8b866562 Branch: refs/heads/branch-2 Commit: 8b866562b16a2b10880a4296fe133ef007a85c77 Parents: 1223500 Author: Barna Zsombor Klara <zsombor.kl...@cloudera.com> Authored: Tue Mar 28 12:03:02 2017 -0700 Committer: Sergio Pena <sergio.p...@cloudera.com> Committed: Tue Mar 28 12:04:32 2017 -0700 ---------------------------------------------------------------------- .../ql/io/parquet/MapredParquetOutputFormat.java | 10 ++++------ .../hive/ql/io/parquet/ParquetRecordReaderBase.java | 14 +++++--------- .../hive/ql/io/parquet/timestamp/NanoTimeUtils.java | 15 ++++++++++++++- .../ql/io/parquet/timestamp/TestNanoTimeUtils.java | 13 +++++++++++++ .../queries/clientpositive/parquet_int96_timestamp.q | 2 +- 5 files changed, 37 insertions(+), 17 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/8b866562/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetOutputFormat.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetOutputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetOutputFormat.java index 26f1e75..a7bb5ee 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetOutputFormat.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetOutputFormat.java @@ -21,6 +21,7 @@ import java.util.Properties; import java.util.TimeZone; import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetTableUtils; +import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTimeUtils; import org.apache.parquet.Strings; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -139,14 +140,11 @@ public class MapredParquetOutputFormat extends FileOutputFormat<NullWritable, Pa String timeZoneID = tableProperties.getProperty(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY); if (!Strings.isNullOrEmpty(timeZoneID)) { - if (!Arrays.asList(TimeZone.getAvailableIDs()).contains(timeZoneID)) { - throw new IllegalStateException("Unexpected timezone id found for parquet int96 conversion: " + timeZoneID); - } + + NanoTimeUtils.validateTimeZone(timeZoneID); return TimeZone.getTimeZone(timeZoneID); } - // If no timezone is defined in table properties, then adjust timestamps using - // PARQUET_INT96_NO_ADJUSTMENT_ZONE timezone - return TimeZone.getTimeZone(ParquetTableUtils.PARQUET_INT96_NO_ADJUSTMENT_ZONE); + return TimeZone.getDefault(); } } http://git-wip-us.apache.org/repos/asf/hive/blob/8b866562/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java index 8e33b7d..2954601 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java @@ -20,6 +20,7 @@ import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport; import org.apache.hadoop.hive.ql.io.parquet.read.ParquetFilterPredicateConverter; import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetTableUtils; +import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTimeUtils; import org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; import org.apache.hadoop.hive.serde2.SerDeStats; @@ -44,7 +45,6 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; import java.util.TimeZone; @@ -170,7 +170,7 @@ public class ParquetRecordReaderBase { boolean skipConversion = HiveConf.getBoolVar(configuration, HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_SKIP_CONVERSION); FileMetaData fileMetaData = parquetMetadata.getFileMetaData(); - if (!Strings.nullToEmpty(fileMetaData.getCreatedBy()).startsWith("parquet-mr") || + if (!Strings.nullToEmpty(fileMetaData.getCreatedBy()).startsWith("parquet-mr") && skipConversion) { // Impala writes timestamp values using GMT only. We should not try to convert Impala // files to other type of timezones. @@ -179,16 +179,12 @@ public class ParquetRecordReaderBase { // TABLE_PARQUET_INT96_TIMEZONE is a table property used to detect what timezone conversion // to use when reading Parquet timestamps. timeZoneID = configuration.get(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY, - ParquetTableUtils.PARQUET_INT96_NO_ADJUSTMENT_ZONE); - - if (!Arrays.asList(TimeZone.getAvailableIDs()).contains(timeZoneID)) { - throw new IllegalStateException("Unexpected timezone id found for parquet int96 conversion: " + timeZoneID); - } + TimeZone.getDefault().getID()); + NanoTimeUtils.validateTimeZone(timeZoneID); } // 'timeZoneID' should be valid, since we did not throw exception above - configuration.set(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY, - TimeZone.getTimeZone(timeZoneID).getID()); + configuration.set(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY,timeZoneID); } public FilterCompat.Filter setFilter(final JobConf conf, MessageType schema) { http://git-wip-us.apache.org/repos/asf/hive/blob/8b866562/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/NanoTimeUtils.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/NanoTimeUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/NanoTimeUtils.java index 5dc8088..dbd6fb3 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/NanoTimeUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/timestamp/NanoTimeUtils.java @@ -152,13 +152,26 @@ public class NanoTimeUtils { calendar.setTimeInMillis(utcCalendar.getTimeInMillis()); - Calendar adjusterCalendar = copyToCalendarWithTZ(calendar, Calendar.getInstance()); + Calendar adjusterCalendar = copyToCalendarWithTZ(calendar, getLocalCalendar()); Timestamp ts = new Timestamp(adjusterCalendar.getTimeInMillis()); ts.setNanos((int) nanos); return ts; } + /** + * Check if the string id is a valid java TimeZone id. + * TimeZone#getTimeZone will return "GMT" if the id cannot be understood. + * @param timeZoneID + */ + public static void validateTimeZone(String timeZoneID) { + if (TimeZone.getTimeZone(timeZoneID).getID().equals("GMT") + && !"GMT".equals(timeZoneID)) { + throw new IllegalStateException( + "Unexpected timezone id found for parquet int96 conversion: " + timeZoneID); + } + } + private static Calendar copyToCalendarWithTZ(Calendar from, Calendar to) { if(from.getTimeZone().getID().equals(to.getTimeZone().getID())) { return from; http://git-wip-us.apache.org/repos/asf/hive/blob/8b866562/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/timestamp/TestNanoTimeUtils.java ---------------------------------------------------------------------- diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/timestamp/TestNanoTimeUtils.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/timestamp/TestNanoTimeUtils.java index 37cf0e2..1e10dbf 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/timestamp/TestNanoTimeUtils.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/timestamp/TestNanoTimeUtils.java @@ -230,4 +230,17 @@ public class TestNanoTimeUtils { Assert.assertEquals(newNTUTC.getJulianDay(), depNTUTC.getJulianDay()); Assert.assertEquals(newNTUTC.getTimeOfDayNanos(), depNTUTC.getTimeOfDayNanos()); } + + @Test + public void testTimeZoneValidationWithCorrectZoneId() { + NanoTimeUtils.validateTimeZone("GMT"); + NanoTimeUtils.validateTimeZone("UTC"); + NanoTimeUtils.validateTimeZone("GMT+10"); + NanoTimeUtils.validateTimeZone("Europe/Budapest"); + } + + @Test(expected = IllegalStateException.class) + public void testTimeZoneValidationWithIncorrectZoneId() { + NanoTimeUtils.validateTimeZone("UCC"); + } } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/hive/blob/8b866562/ql/src/test/queries/clientpositive/parquet_int96_timestamp.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/parquet_int96_timestamp.q b/ql/src/test/queries/clientpositive/parquet_int96_timestamp.q index 5de2c3f..6eadd1b 100644 --- a/ql/src/test/queries/clientpositive/parquet_int96_timestamp.q +++ b/ql/src/test/queries/clientpositive/parquet_int96_timestamp.q @@ -2,7 +2,7 @@ create table dummy (id int); insert into table dummy values (1); set hive.parquet.mr.int96.enable.utc.write.zone=true; -set hive.parquet.timestamp.skip.conversion=false; +set hive.parquet.timestamp.skip.conversion=true; -- read/write timestamps using UTC as default write zone create table timestamps (ts timestamp) stored as parquet;