This is an automated email from the ASF dual-hosted git repository.

hashutosh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
     new 659e28d  HIVE-23345: Enable Parquet timestamps types (INT64 and INT96) conversion to Hive BIGINT type Adding test cases (Panos G via Ashutosh Chauhan)
659e28d is described below

commit 659e28de0b609d114e20e6294348abb74f49f6e0
Author: Panos Garefalakis <pga...@cloudera.com>
AuthorDate: Fri May 1 14:24:39 2020 +0100

    HIVE-23345: Enable Parquet timestamps types (INT64 and INT96) conversion to Hive BIGINT type

    Adding test cases (Panos G via Ashutosh Chauhan)

    Change-Id: I8666a95cc7ff7495a86b960c2ea173cd875bfa4f
    Signed-off-by: Ashutosh Chauhan <hashut...@apache.org>
---
 .../test/resources/testconfiguration.properties    |  1 +
 .../hive/ql/io/parquet/convert/ETypeConverter.java | 40 +++++++++++++--
 .../ql/io/parquet/convert/TestETypeConverter.java  | 29 +++++++++++
 .../clientpositive/parquet_timestampt_to_bigint.q  | 25 +++++++++
 .../llap/parquet_timestampt_to_bigint.q.out        | 60 ++++++++++++++++++++++
 5 files changed, 152 insertions(+), 3 deletions(-)

diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index b639718..5468728 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -695,6 +695,7 @@ minillaplocal.query.files=\
   parquet_legacy_mixed_date.q,\
   parquet_legacy_mixed_timestamp.q,\
   parquet_proleptic_mixed_date.q,\
+  parquet_timestampt_to_bigint.q,\
   partition_ctas.q,\
   partition_multilevels.q,\
   partition_shared_scan.q,\
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java
index 6082321..8e436bc 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java
@@ -14,6 +14,8 @@
 package org.apache.hadoop.hive.ql.io.parquet.convert;
 
 import java.math.BigDecimal;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 import java.util.ArrayList;
 import java.util.Map;
 import java.util.Optional;
@@ -43,6 +45,7 @@
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
+import org.apache.parquet.Preconditions;
 import org.apache.parquet.column.Dictionary;
 import org.apache.parquet.io.api.Binary;
 import org.apache.parquet.io.api.PrimitiveConverter;
@@ -662,9 +665,25 @@ public enum ETypeConverter {
       };
     }
   },
-  ETIMESTAMP_CONVERTER(TimestampWritableV2.class) {
+  EINT96_TIMESTAMP_CONVERTER(TimestampWritableV2.class) {
     @Override
     PrimitiveConverter getConverter(final PrimitiveType type, final int index, final ConverterParent parent, TypeInfo hiveTypeInfo) {
+      if (hiveTypeInfo != null) {
+        String typeName = TypeInfoUtils.getBaseName(hiveTypeInfo.getTypeName());
+        switch (typeName) {
+        case serdeConstants.BIGINT_TYPE_NAME:
+          return new BinaryConverter<LongWritable>(type, parent, index) {
+            @Override
+            protected LongWritable convert(Binary binary) {
+              Preconditions.checkArgument(binary.length() == 12, "Must be 12 bytes");
+              ByteBuffer buf = binary.toByteBuffer();
+              buf.order(ByteOrder.LITTLE_ENDIAN);
+              long longVal = buf.getLong();
+              return new LongWritable(longVal);
+            }
+          };
+        }
+      }
       return new BinaryConverter<TimestampWritableV2>(type, parent, index) {
         @Override
         protected TimestampWritableV2 convert(Binary binary) {
@@ -690,6 +709,22 @@ public enum ETypeConverter {
     @Override
     PrimitiveConverter getConverter(final PrimitiveType type, final int index, final ConverterParent parent, TypeInfo hiveTypeInfo) {
+      if (hiveTypeInfo != null) {
+        String typeName = TypeInfoUtils.getBaseName(hiveTypeInfo.getTypeName());
+        switch (typeName) {
+        case serdeConstants.BIGINT_TYPE_NAME:
+          return new BinaryConverter<LongWritable>(type, parent, index) {
+            @Override
+            protected LongWritable convert(Binary binary) {
+              Preconditions.checkArgument(binary.length() == 8, "Must be 8 bytes");
+              ByteBuffer buf = binary.toByteBuffer();
+              buf.order(ByteOrder.LITTLE_ENDIAN);
+              long longVal = buf.getLong();
+              return new LongWritable(longVal);
+            }
+          };
+        }
+      }
       return new PrimitiveConverter() {
         @Override
         public void addLong(final long value) {
@@ -735,8 +770,7 @@
   public static PrimitiveConverter getNewConverter(final PrimitiveType type, final int index,
       final ConverterParent parent, final TypeInfo hiveTypeInfo) {
     if (type.isPrimitive() && (type.asPrimitiveType().getPrimitiveTypeName().equals(PrimitiveType.PrimitiveTypeName.INT96))) {
-      //TODO- cleanup once parquet support Timestamp type annotation.
-      return ETypeConverter.ETIMESTAMP_CONVERTER.getConverter(type, index, parent, hiveTypeInfo);
+      return EINT96_TIMESTAMP_CONVERTER.getConverter(type, index, parent, hiveTypeInfo);
     }
     if (type.getLogicalTypeAnnotation() != null) {
       Optional<PrimitiveConverter> converter = type.getLogicalTypeAnnotation()
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/TestETypeConverter.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/TestETypeConverter.java
index be4c880..74e2495 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/TestETypeConverter.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/TestETypeConverter.java
@@ -21,6 +21,8 @@ package org.apache.hadoop.hive.ql.io.parquet.convert;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 import java.time.ZoneId;
 
 import org.apache.hadoop.hive.common.type.Timestamp;
@@ -109,6 +111,33 @@ public class TestETypeConverter {
   }
 
   @Test
+  public void testGetSmallBigIntConverter() {
+    Timestamp timestamp = Timestamp.valueOf("1998-10-03 09:58:31.231");
+    long msTime = timestamp.toEpochMilli();
+    ByteBuffer buf = ByteBuffer.allocate(12);
+    buf.order(ByteOrder.LITTLE_ENDIAN);
+    buf.putLong(msTime);
+    buf.flip();
+    // Need TimeStamp logicalType annotation here
+    PrimitiveType primitiveType = createInt64TimestampType(false, TimeUnit.MILLIS);
+    Writable writable = getWritableFromBinaryConverter(createHiveTypeInfo("bigint"), primitiveType, Binary.fromByteBuffer(buf));
+    // Retrieve as BigInt
+    LongWritable longWritable = (LongWritable) writable;
+    assertEquals(msTime, longWritable.get());
+  }
+
+  @Test
+  public void testGetBigIntConverter() {
+    Timestamp timestamp = Timestamp.valueOf("1998-10-03 09:58:31.231");
+    NanoTime nanoTime = NanoTimeUtils.getNanoTime(timestamp, true);
+    PrimitiveType primitiveType = Types.optional(PrimitiveTypeName.INT96).named("value");
+    Writable writable = getWritableFromBinaryConverter(createHiveTypeInfo("bigint"), primitiveType, nanoTime.toBinary());
+    // Retrieve as BigInt
+    LongWritable longWritable = (LongWritable) writable;
+    assertEquals(nanoTime.getTimeOfDayNanos(), longWritable.get());
+  }
+
+  @Test
   public void testGetTimestampConverter() throws Exception {
     Timestamp timestamp = Timestamp.valueOf("2018-06-15 15:12:20.0");
     NanoTime nanoTime = NanoTimeUtils.getNanoTime(timestamp, true);
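For reference: the BIGINT fast path above decodes Parquet's legacy INT96
timestamp layout, where the 12-byte value is an 8-byte nanoseconds-within-day
field followed by a 4-byte Julian day number, both little endian. A minimal
self-contained Java sketch of that layout (class and variable names here are
illustrative only, not from the patch):

    import java.nio.ByteBuffer;
    import java.nio.ByteOrder;

    public class Int96Layout {
      public static void main(String[] args) {
        // Build a 12-byte INT96 value the way Hive's NanoTime lays it out:
        // bytes 0-7 = nanoseconds within the day, bytes 8-11 = Julian day.
        ByteBuffer buf = ByteBuffer.allocate(12).order(ByteOrder.LITTLE_ENDIAN);
        buf.putLong(61111231000000L); // nanos-of-day, i.e. 16:58:31.231
        buf.putInt(2451090);          // Julian day number for 1998-10-03
        buf.flip();
        // The BIGINT converter reads only the first long:
        long nanosOfDay = buf.getLong(); // 61111231000000
        int julianDay = buf.getInt();    // dropped by the BIGINT path
        System.out.println(nanosOfDay + " ns into Julian day " + julianDay);
      }
    }

This is also why testGetBigIntConverter below asserts against
nanoTime.getTimeOfDayNanos() rather than a full epoch value.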
diff --git a/ql/src/test/queries/clientpositive/parquet_timestampt_to_bigint.q b/ql/src/test/queries/clientpositive/parquet_timestampt_to_bigint.q
new file mode 100644
index 0000000..5aa4ab1
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/parquet_timestampt_to_bigint.q
@@ -0,0 +1,25 @@
+set hive.vectorized.execution.enabled=false;
+set parquet.column.index.access=true;
+
+-- Test Parquet table with TIMESTAMP column to BIGINT conversion
+dfs ${system:test.dfs.mkdir} ${system:test.tmp.dir}/parquet_format_ts;
+
+DROP TABLE ts_pq;
+
+CREATE EXTERNAL TABLE ts_pq (ts1 TIMESTAMP)
+  STORED AS PARQUET
+  LOCATION '${system:test.tmp.dir}/parquet_format_ts';
+
+INSERT INTO ts_pq VALUES ('1998-10-03 09:58:31.231');
+
+SELECT * FROM ts_pq;
+
+-- Now read the same data through another table that declares the TS column as BIGINT
+
+CREATE EXTERNAL TABLE ts_pq_2 (ts2 BIGINT)
+  STORED AS PARQUET
+  LOCATION '${system:test.tmp.dir}/parquet_format_ts';
+
+SELECT * FROM ts_pq_2;
+
+dfs -rmr ${system:test.tmp.dir}/parquet_format_ts;
\ No newline at end of file
diff --git a/ql/src/test/results/clientpositive/llap/parquet_timestampt_to_bigint.q.out b/ql/src/test/results/clientpositive/llap/parquet_timestampt_to_bigint.q.out
new file mode 100644
index 0000000..63af5b8
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/parquet_timestampt_to_bigint.q.out
@@ -0,0 +1,60 @@
+PREHOOK: query: DROP TABLE ts_pq
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE ts_pq
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE EXTERNAL TABLE ts_pq (ts1 TIMESTAMP)
+  STORED AS PARQUET
+#### A masked pattern was here ####
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+PREHOOK: Output: database:default
+PREHOOK: Output: default@ts_pq
+POSTHOOK: query: CREATE EXTERNAL TABLE ts_pq (ts1 TIMESTAMP)
+  STORED AS PARQUET
+#### A masked pattern was here ####
+POSTHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@ts_pq
+PREHOOK: query: INSERT INTO ts_pq VALUES ('1998-10-03 09:58:31.231')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@ts_pq
+POSTHOOK: query: INSERT INTO ts_pq VALUES ('1998-10-03 09:58:31.231')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@ts_pq
+POSTHOOK: Lineage: ts_pq.ts1 SCRIPT []
+PREHOOK: query: SELECT * FROM ts_pq
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ts_pq
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM ts_pq
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ts_pq
+#### A masked pattern was here ####
+1998-10-03 09:58:31.231
+PREHOOK: query: CREATE EXTERNAL TABLE ts_pq_2 (ts2 BIGINT)
+  STORED AS PARQUET
+#### A masked pattern was here ####
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+PREHOOK: Output: database:default
+PREHOOK: Output: default@ts_pq_2
+POSTHOOK: query: CREATE EXTERNAL TABLE ts_pq_2 (ts2 BIGINT)
+  STORED AS PARQUET
+#### A masked pattern was here ####
+POSTHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@ts_pq_2
+PREHOOK: query: SELECT * FROM ts_pq_2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ts_pq_2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM ts_pq_2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ts_pq_2
+#### A masked pattern was here ####
+61111231000000
+#### A masked pattern was here ####
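For reference: the value 61111231000000 in the new q.out is the
nanoseconds-within-day field of the stored INT96, as returned by the BIGINT
converter. Assuming the q-test JVM runs in the usual US/Pacific test time
zone, the inserted local timestamp 1998-10-03 09:58:31.231 (UTC-7 on that
date) is stored as 16:58:31.231 UTC, and (16*3600 + 58*60 + 31) s + 0.231 s
= 61111.231 s = 61111231000000 ns, matching the output above.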