This is an automated email from the ASF dual-hosted git repository. zabetak pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new 5519bb37fb5 HIVE-26658: INT64 Parquet timestamps cannot be mapped to most Hive numeric types (Stamatis Zampetakis reviewed by Chris Nauroth, Steve Carlin, Ayush Saxena) 5519bb37fb5 is described below commit 5519bb37fb5189004804435f09ca9227bd6d9d6b Author: Stamatis Zampetakis <zabe...@gmail.com> AuthorDate: Mon Oct 24 11:50:42 2022 +0200 HIVE-26658: INT64 Parquet timestamps cannot be mapped to most Hive numeric types (Stamatis Zampetakis reviewed by Chris Nauroth, Steve Carlin, Ayush Saxena) Closes #3698 --- .../hive/ql/io/parquet/convert/ETypeConverter.java | 53 +++---- .../ql/io/parquet/convert/TestETypeConverter.java | 85 ++++++++++- .../parquet_int64_timestamp_to_numeric.q | 37 +++++ .../llap/parquet_int64_timestamp_to_numeric.q.out | 162 +++++++++++++++++++++ 4 files changed, 295 insertions(+), 42 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java index 4c3ab70958e..91f19418356 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java @@ -45,6 +45,7 @@ import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.HiveDecimalUtils; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; import org.apache.hadoop.io.BooleanWritable; @@ -448,6 +449,21 @@ public enum ETypeConverter { } } }; + case serdeConstants.TIMESTAMP_TYPE_NAME: + case serdeConstants.TIMESTAMPLOCALTZ_TYPE_NAME: + if (type.getLogicalTypeAnnotation() instanceof TimestampLogicalTypeAnnotation) { + TimestampLogicalTypeAnnotation logicalType = + (TimestampLogicalTypeAnnotation) type.getLogicalTypeAnnotation(); + return new PrimitiveConverter() { + @Override + public void addLong(final long value) { + Timestamp timestamp = + ParquetTimestampUtils.getTimestamp(value, logicalType.getUnit(), logicalType.isAdjustedToUTC()); + parent.set(index, new TimestampWritableV2(timestamp)); + } + }; + } + throw new IllegalStateException("Cannot reliably convert INT64 value to timestamp without type annotation"); default: return new PrimitiveConverter() { @Override @@ -743,40 +759,6 @@ public enum ETypeConverter { }; } }, - EINT64_TIMESTAMP_CONVERTER(TimestampWritableV2.class) { - @Override - PrimitiveConverter getConverter(final PrimitiveType type, final int index, final ConverterParent parent, - TypeInfo hiveTypeInfo) { - if (hiveTypeInfo != null) { - String typeName = TypeInfoUtils.getBaseName(hiveTypeInfo.getTypeName()); - final long min = getMinValue(type, typeName, Long.MIN_VALUE); - final long max = getMaxValue(typeName, Long.MAX_VALUE); - - switch (typeName) { - case serdeConstants.BIGINT_TYPE_NAME: - return new PrimitiveConverter() { - @Override - public void addLong(long value) { - if ((value >= min) && (value <= max)) { - parent.set(index, new LongWritable(value)); - } else { - parent.set(index, null); - } - } - }; - } - } - return new PrimitiveConverter() { - @Override - public void addLong(final long value) { - TimestampLogicalTypeAnnotation logicalType = (TimestampLogicalTypeAnnotation) type.getLogicalTypeAnnotation(); - Timestamp timestamp = - ParquetTimestampUtils.getTimestamp(value, logicalType.getUnit(), logicalType.isAdjustedToUTC()); - parent.set(index, new TimestampWritableV2(timestamp)); - } - }; - } - }, EDATE_CONVERTER(DateWritableV2.class) { @Override PrimitiveConverter getConverter(final PrimitiveType type, final int index, final ConverterParent parent, TypeInfo hiveTypeInfo) { @@ -833,7 +815,8 @@ public enum ETypeConverter { @Override public Optional<PrimitiveConverter> visit(TimestampLogicalTypeAnnotation logicalTypeAnnotation) { - return Optional.of(EINT64_TIMESTAMP_CONVERTER.getConverter(type, index, parent, hiveTypeInfo)); + TypeInfo info = hiveTypeInfo == null ? TypeInfoFactory.timestampTypeInfo : hiveTypeInfo; + return Optional.of(EINT64_CONVERTER.getConverter(type, index, parent, info)); } }); diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/TestETypeConverter.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/TestETypeConverter.java index cf6444c9c04..3173d2db900 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/TestETypeConverter.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/TestETypeConverter.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.io.parquet.convert; +import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.getPrimitiveTypeInfo; import static org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory.stringTypeInfo; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -27,6 +28,7 @@ import java.nio.ByteOrder; import java.time.ZoneId; import java.time.ZoneOffset; +import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.common.type.Timestamp; import org.apache.hadoop.hive.ql.io.parquet.convert.ETypeConverter.BinaryConverter; import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTime; @@ -39,6 +41,7 @@ import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; import org.apache.hadoop.io.BooleanWritable; import org.apache.hadoop.io.BytesWritable; @@ -115,16 +118,84 @@ public class TestETypeConverter { assertEquals(22, (int) doubleWritable.get()); } + @Test + public void testGetInt64TimestampConverterTinyIntHiveType() { + testGetInt64TimestampConverterNumericHiveType("1970-01-01 00:00:00.005", "tinyint", 5); + } + + @Test + public void testGetInt64TimestampConverterSmallIntHiveType() { + testGetInt64TimestampConverterNumericHiveType("1970-01-01 00:00:00.005", "smallint", 5); + } + + @Test + public void testGetInt64TimestampConverterIntHiveType() { + testGetInt64TimestampConverterNumericHiveType("1970-01-01 00:00:00.005", "int", 5); + } + @Test public void testGetInt64TimestampConverterBigIntHiveType() { - Timestamp timestamp = Timestamp.valueOf("1998-10-03 09:58:31.231"); - long msTime = timestamp.toEpochMilli(); - // Need TimeStamp logicalType annotation here + testGetInt64TimestampConverterNumericHiveType("1998-10-03 09:58:31.231", "bigint", 907408711231L); + } + + @Test + public void testGetInt64TimestampConverterFloatHiveType() { + testGetInt64TimestampConverterNumericHiveType("1970-01-01 00:00:00.005", "float", 5.0f); + } + + @Test + public void testGetInt64TimestampConverterDoubleHiveType() { + testGetInt64TimestampConverterNumericHiveType("1970-01-01 00:00:00.005", "double", 5.0d); + } + + @Test + public void testGetInt64TimestampConverterDecimalHiveType() { + testGetInt64TimestampConverterNumericHiveType("1970-01-01 00:00:00.005", "decimal(1,0)", HiveDecimal.create(5)); + } + + @Test + public void testGetInt64TimestampConverterNoHiveType() { + Timestamp ts = Timestamp.valueOf("2022-10-24 11:35:00.005"); PrimitiveType primitiveType = createInt64TimestampType(false, TimeUnit.MILLIS); - Writable writable = getWritableFromPrimitiveConverter(createHiveTypeInfo("bigint"), primitiveType, msTime); - // Retrieve as BigInt - LongWritable longWritable = (LongWritable) writable; - assertEquals(msTime, longWritable.get()); + Writable writable = getWritableFromPrimitiveConverter(null, primitiveType, ts.toEpochMilli()); + assertEquals("2022-10-24 11:35:00.005", ((TimestampWritableV2) writable).getTimestamp().toString()); + } + + @Test(expected = IllegalStateException.class) + public void testGetInt64NoLogicalAnnotationTimestampHiveType() { + Timestamp ts = Timestamp.valueOf("2022-10-24 11:43:00.005"); + PrimitiveType primitiveType = Types.optional(PrimitiveTypeName.INT64).named("int64"); + getWritableFromPrimitiveConverter(TypeInfoFactory.timestampTypeInfo, primitiveType, ts.toEpochMilli()); + } + + private void testGetInt64TimestampConverterNumericHiveType(String timestamp, String type, Object expected) { + Timestamp ts = Timestamp.valueOf(timestamp); + PrimitiveType primitiveType = createInt64TimestampType(false, TimeUnit.MILLIS); + PrimitiveTypeInfo info = getPrimitiveTypeInfo(type); + Writable writable = getWritableFromPrimitiveConverter(info, primitiveType, ts.toEpochMilli()); + final Object actual; + switch (info.getPrimitiveCategory()) { + case BYTE: + case SHORT: + case INT: + actual = ((IntWritable) writable).get(); + break; + case LONG: + actual = ((LongWritable) writable).get(); + break; + case FLOAT: + actual = ((FloatWritable) writable).get(); + break; + case DOUBLE: + actual = ((DoubleWritable) writable).get(); + break; + case DECIMAL: + actual = ((HiveDecimalWritable) writable).getHiveDecimal(); + break; + default: + throw new IllegalStateException(info.toString()); + } + assertEquals(expected, actual); } @Test diff --git a/ql/src/test/queries/clientpositive/parquet_int64_timestamp_to_numeric.q b/ql/src/test/queries/clientpositive/parquet_int64_timestamp_to_numeric.q new file mode 100644 index 00000000000..24b3336028d --- /dev/null +++ b/ql/src/test/queries/clientpositive/parquet_int64_timestamp_to_numeric.q @@ -0,0 +1,37 @@ +set hive.parquet.write.int64.timestamp=true; +set hive.parquet.timestamp.time.unit=micros; +CREATE TABLE hive_26658_table (ts TIMESTAMP) STORED AS PARQUET; + +INSERT INTO hive_26658_table VALUES ('2022-10-21 15:58:32'); +INSERT INTO hive_26658_table VALUES ('1970-01-01 00:00:00.000009'); + +SELECT * FROM hive_26658_table; + +set metastore.disallow.incompatible.col.type.changes=false; +ALTER TABLE hive_26658_table CHANGE ts ts TINYINT; + +SELECT * FROM hive_26658_table; + +ALTER TABLE hive_26658_table CHANGE ts ts SMALLINT; + +SELECT * FROM hive_26658_table; + +ALTER TABLE hive_26658_table CHANGE ts ts INT; + +SELECT * FROM hive_26658_table; + +ALTER TABLE hive_26658_table CHANGE ts ts BIGINT; + +SELECT * FROM hive_26658_table; + +ALTER TABLE hive_26658_table CHANGE ts ts DOUBLE; + +SELECT * FROM hive_26658_table; + +ALTER TABLE hive_26658_table CHANGE ts ts FLOAT; + +SELECT * FROM hive_26658_table; + +ALTER TABLE hive_26658_table CHANGE ts ts Decimal; + +SELECT * FROM hive_26658_table; diff --git a/ql/src/test/results/clientpositive/llap/parquet_int64_timestamp_to_numeric.q.out b/ql/src/test/results/clientpositive/llap/parquet_int64_timestamp_to_numeric.q.out new file mode 100644 index 00000000000..dc2be4032a4 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/parquet_int64_timestamp_to_numeric.q.out @@ -0,0 +1,162 @@ +PREHOOK: query: CREATE TABLE hive_26658_table (ts TIMESTAMP) STORED AS PARQUET +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@hive_26658_table +POSTHOOK: query: CREATE TABLE hive_26658_table (ts TIMESTAMP) STORED AS PARQUET +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@hive_26658_table +PREHOOK: query: INSERT INTO hive_26658_table VALUES ('2022-10-21 15:58:32') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@hive_26658_table +POSTHOOK: query: INSERT INTO hive_26658_table VALUES ('2022-10-21 15:58:32') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@hive_26658_table +POSTHOOK: Lineage: hive_26658_table.ts SCRIPT [] +PREHOOK: query: INSERT INTO hive_26658_table VALUES ('1970-01-01 00:00:00.000009') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@hive_26658_table +POSTHOOK: query: INSERT INTO hive_26658_table VALUES ('1970-01-01 00:00:00.000009') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@hive_26658_table +POSTHOOK: Lineage: hive_26658_table.ts SCRIPT [] +PREHOOK: query: SELECT * FROM hive_26658_table +PREHOOK: type: QUERY +PREHOOK: Input: default@hive_26658_table +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM hive_26658_table +POSTHOOK: type: QUERY +POSTHOOK: Input: default@hive_26658_table +#### A masked pattern was here #### +2022-10-21 15:58:32 +1970-01-01 00:00:00.000009 +PREHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts TINYINT +PREHOOK: type: ALTERTABLE_RENAMECOL +PREHOOK: Input: default@hive_26658_table +PREHOOK: Output: default@hive_26658_table +POSTHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts TINYINT +POSTHOOK: type: ALTERTABLE_RENAMECOL +POSTHOOK: Input: default@hive_26658_table +POSTHOOK: Output: default@hive_26658_table +PREHOOK: query: SELECT * FROM hive_26658_table +PREHOOK: type: QUERY +PREHOOK: Input: default@hive_26658_table +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM hive_26658_table +POSTHOOK: type: QUERY +POSTHOOK: Input: default@hive_26658_table +#### A masked pattern was here #### +NULL +9 +PREHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts SMALLINT +PREHOOK: type: ALTERTABLE_RENAMECOL +PREHOOK: Input: default@hive_26658_table +PREHOOK: Output: default@hive_26658_table +POSTHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts SMALLINT +POSTHOOK: type: ALTERTABLE_RENAMECOL +POSTHOOK: Input: default@hive_26658_table +POSTHOOK: Output: default@hive_26658_table +PREHOOK: query: SELECT * FROM hive_26658_table +PREHOOK: type: QUERY +PREHOOK: Input: default@hive_26658_table +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM hive_26658_table +POSTHOOK: type: QUERY +POSTHOOK: Input: default@hive_26658_table +#### A masked pattern was here #### +NULL +9 +PREHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts INT +PREHOOK: type: ALTERTABLE_RENAMECOL +PREHOOK: Input: default@hive_26658_table +PREHOOK: Output: default@hive_26658_table +POSTHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts INT +POSTHOOK: type: ALTERTABLE_RENAMECOL +POSTHOOK: Input: default@hive_26658_table +POSTHOOK: Output: default@hive_26658_table +PREHOOK: query: SELECT * FROM hive_26658_table +PREHOOK: type: QUERY +PREHOOK: Input: default@hive_26658_table +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM hive_26658_table +POSTHOOK: type: QUERY +POSTHOOK: Input: default@hive_26658_table +#### A masked pattern was here #### +NULL +9 +PREHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts BIGINT +PREHOOK: type: ALTERTABLE_RENAMECOL +PREHOOK: Input: default@hive_26658_table +PREHOOK: Output: default@hive_26658_table +POSTHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts BIGINT +POSTHOOK: type: ALTERTABLE_RENAMECOL +POSTHOOK: Input: default@hive_26658_table +POSTHOOK: Output: default@hive_26658_table +PREHOOK: query: SELECT * FROM hive_26658_table +PREHOOK: type: QUERY +PREHOOK: Input: default@hive_26658_table +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM hive_26658_table +POSTHOOK: type: QUERY +POSTHOOK: Input: default@hive_26658_table +#### A masked pattern was here #### +1666367912000000 +9 +PREHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts DOUBLE +PREHOOK: type: ALTERTABLE_RENAMECOL +PREHOOK: Input: default@hive_26658_table +PREHOOK: Output: default@hive_26658_table +POSTHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts DOUBLE +POSTHOOK: type: ALTERTABLE_RENAMECOL +POSTHOOK: Input: default@hive_26658_table +POSTHOOK: Output: default@hive_26658_table +PREHOOK: query: SELECT * FROM hive_26658_table +PREHOOK: type: QUERY +PREHOOK: Input: default@hive_26658_table +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM hive_26658_table +POSTHOOK: type: QUERY +POSTHOOK: Input: default@hive_26658_table +#### A masked pattern was here #### +1.666367912E15 +9.0 +PREHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts FLOAT +PREHOOK: type: ALTERTABLE_RENAMECOL +PREHOOK: Input: default@hive_26658_table +PREHOOK: Output: default@hive_26658_table +POSTHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts FLOAT +POSTHOOK: type: ALTERTABLE_RENAMECOL +POSTHOOK: Input: default@hive_26658_table +POSTHOOK: Output: default@hive_26658_table +PREHOOK: query: SELECT * FROM hive_26658_table +PREHOOK: type: QUERY +PREHOOK: Input: default@hive_26658_table +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM hive_26658_table +POSTHOOK: type: QUERY +POSTHOOK: Input: default@hive_26658_table +#### A masked pattern was here #### +1.66636785E15 +9.0 +PREHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts Decimal +PREHOOK: type: ALTERTABLE_RENAMECOL +PREHOOK: Input: default@hive_26658_table +PREHOOK: Output: default@hive_26658_table +POSTHOOK: query: ALTER TABLE hive_26658_table CHANGE ts ts Decimal +POSTHOOK: type: ALTERTABLE_RENAMECOL +POSTHOOK: Input: default@hive_26658_table +POSTHOOK: Output: default@hive_26658_table +PREHOOK: query: SELECT * FROM hive_26658_table +PREHOOK: type: QUERY +PREHOOK: Input: default@hive_26658_table +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM hive_26658_table +POSTHOOK: type: QUERY +POSTHOOK: Input: default@hive_26658_table +#### A masked pattern was here #### +NULL +9