DRILL-5971: Fix INT64, INT32 logical types in complex parquet reader. Added the following types: ENUM (Binary annotated as ENUM), INT96 (Dictionary encoded). Fixed issue with reading Dictionary encoded fixed width reader. Added test file generator.
This closes #1049 Project: http://git-wip-us.apache.org/repos/asf/drill/repo Commit: http://git-wip-us.apache.org/repos/asf/drill/commit/90eb23ba Tree: http://git-wip-us.apache.org/repos/asf/drill/tree/90eb23ba Diff: http://git-wip-us.apache.org/repos/asf/drill/diff/90eb23ba Branch: refs/heads/master Commit: 90eb23baa6a1a205fd0821c7af708969fcb98c5c Parents: c5af3ae Author: Parth Chandra <par...@apache.org> Authored: Tue Aug 29 11:20:47 2017 -0700 Committer: Parth Chandra <par...@apache.org> Committed: Thu Jan 11 17:13:47 2018 -0800 ---------------------------------------------------------------------- .../columnreaders/ColumnReaderFactory.java | 11 +- .../ParquetFixedWidthDictionaryReaders.java | 2 +- .../ParquetToDrillTypeConverter.java | 4 + .../parquet2/DrillParquetGroupConverter.java | 18 + .../parquet/ParquetSimpleTestFileGenerator.java | 461 +++++++++++++++++++ .../exec/store/parquet/TestParquetComplex.java | 177 +++++++ .../store/parquet2/TestDrillParquetReader.java | 173 +++++++ .../test/resources/parquet/logical_int.parquet | Bin 0 -> 1133 bytes .../parquet_logical_types_simple.parquet | Bin 0 -> 3366 bytes ...arquet_logical_types_simple_nullable.parquet | Bin 0 -> 3518 bytes .../parquet/complex/logical_int_complex.parquet | Bin 0 -> 1377 bytes .../parquet_logical_types_complex.parquet | Bin 0 -> 3908 bytes ...rquet_logical_types_complex_nullable.parquet | Bin 0 -> 4081 bytes 13 files changed, 843 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/drill/blob/90eb23ba/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ColumnReaderFactory.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ColumnReaderFactory.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ColumnReaderFactory.java index 
495f70b..09cdc5d 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ColumnReaderFactory.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ColumnReaderFactory.java @@ -82,8 +82,9 @@ public class ColumnReaderFactory { if (columnChunkMetaData.getType() == PrimitiveType.PrimitiveTypeName.BOOLEAN){ return new BitReader(recordReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (BitVector) v, schemaElement); - } else if (columnChunkMetaData.getType() == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY || - columnChunkMetaData.getType() == PrimitiveType.PrimitiveTypeName.INT96) { + } else if (!columnChunkMetaData.getEncodings().contains(Encoding.PLAIN_DICTIONARY) && ( + columnChunkMetaData.getType() == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY + || columnChunkMetaData.getType() == PrimitiveType.PrimitiveTypeName.INT96)) { if (convertedType == ConvertedType.DECIMAL){ int length = schemaElement.type_length; if (length <= 12) { @@ -125,6 +126,7 @@ public class ColumnReaderFactory { return new ParquetFixedWidthDictionaryReaders.DictionaryTimeReader(recordReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (TimeVector) v, schemaElement); case INT_8: case INT_16: + case INT_32: return new ParquetFixedWidthDictionaryReaders.DictionaryIntReader(recordReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (IntVector) v, schemaElement); case UINT_8: case UINT_16: @@ -138,6 +140,8 @@ public class ColumnReaderFactory { return new ParquetFixedWidthDictionaryReaders.DictionaryBigIntReader(recordReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (BigIntVector) v, schemaElement); } switch (convertedType) { + case INT_64: + return new ParquetFixedWidthDictionaryReaders.DictionaryBigIntReader(recordReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (BigIntVector) v, schemaElement); case UINT_64: return new 
ParquetFixedWidthDictionaryReaders.DictionaryUInt8Reader(recordReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (UInt8Vector) v, schemaElement); case DECIMAL: @@ -152,6 +156,7 @@ public class ColumnReaderFactory { case DOUBLE: return new ParquetFixedWidthDictionaryReaders.DictionaryFloat8Reader(recordReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (Float8Vector) v, schemaElement); case FIXED_LEN_BYTE_ARRAY: + case INT96: return new ParquetFixedWidthDictionaryReaders.DictionaryFixedBinaryReader(recordReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement); default: throw new ExecutionSetupException("Unsupported dictionary column type " + descriptor.getType().name() ); @@ -213,6 +218,7 @@ public class ColumnReaderFactory { } switch (convertedType) { case UTF8: + case ENUM: return new VarLengthColumnReaders.VarCharColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarCharVector) v, schemaElement); case DECIMAL: if (v instanceof Decimal28SparseVector) { @@ -230,6 +236,7 @@ public class ColumnReaderFactory { switch (convertedType) { case UTF8: + case ENUM: return new VarLengthColumnReaders.NullableVarCharColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (NullableVarCharVector) v, schemaElement); case DECIMAL: if (v instanceof NullableDecimal28SparseVector) { http://git-wip-us.apache.org/repos/asf/drill/blob/90eb23ba/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ParquetFixedWidthDictionaryReaders.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ParquetFixedWidthDictionaryReaders.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ParquetFixedWidthDictionaryReaders.java index 53a68ab..5fbac20 100644 --- 
a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ParquetFixedWidthDictionaryReaders.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ParquetFixedWidthDictionaryReaders.java @@ -114,7 +114,7 @@ public class ParquetFixedWidthDictionaryReaders { Binary currDictValToWrite = null; for (int i = 0; i < recordsReadInThisIteration; i++){ currDictValToWrite = pageReader.dictionaryValueReader.readBytes(); - mutator.setSafe(valuesReadInCurrentPass + i, currDictValToWrite.toByteBuffer(), 0, + mutator.setSafe(valuesReadInCurrentPass + i, currDictValToWrite.toByteBuffer().slice(), 0, currDictValToWrite.length()); } // Set the write Index. The next page that gets read might be a page that does not use dictionary encoding http://git-wip-us.apache.org/repos/asf/drill/blob/90eb23ba/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ParquetToDrillTypeConverter.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ParquetToDrillTypeConverter.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ParquetToDrillTypeConverter.java index 3f5f3b2..ad1c4bf 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ParquetToDrillTypeConverter.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/columnreaders/ParquetToDrillTypeConverter.java @@ -49,6 +49,7 @@ public class ParquetToDrillTypeConverter { } switch (convertedType) { case UTF8: + case ENUM: return (TypeProtos.MinorType.VARCHAR); case DECIMAL: ParquetReaderUtility.checkDecimalTypeEnabled(options); @@ -61,6 +62,8 @@ public class ParquetToDrillTypeConverter { return (TypeProtos.MinorType.BIGINT); } switch(convertedType) { + case INT_64: + return TypeProtos.MinorType.BIGINT; case UINT_64: return TypeProtos.MinorType.UINT8; case DECIMAL: @@ 
-85,6 +88,7 @@ public class ParquetToDrillTypeConverter { return TypeProtos.MinorType.UINT4; case INT_8: case INT_16: + case INT_32: return TypeProtos.MinorType.INT; case DECIMAL: ParquetReaderUtility.checkDecimalTypeEnabled(options); http://git-wip-us.apache.org/repos/asf/drill/blob/90eb23ba/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet2/DrillParquetGroupConverter.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet2/DrillParquetGroupConverter.java b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet2/DrillParquetGroupConverter.java index 5c7c8e1..7f50d2d 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet2/DrillParquetGroupConverter.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet2/DrillParquetGroupConverter.java @@ -178,6 +178,15 @@ public class DrillParquetGroupConverter extends GroupConverter { return new DrillIntConverter(writer); } switch(type.getOriginalType()) { + case UINT_8 : + case UINT_16: + case UINT_32: + case INT_8 : + case INT_16 : + case INT_32 : { + IntWriter writer = type.getRepetition() == Repetition.REPEATED ? mapWriter.list(name).integer() : mapWriter.integer(name); + return new DrillIntConverter(writer); + } case DECIMAL: { ParquetReaderUtility.checkDecimalTypeEnabled(options); Decimal9Writer writer = type.getRepetition() == Repetition.REPEATED @@ -216,6 +225,11 @@ public class DrillParquetGroupConverter extends GroupConverter { return new DrillBigIntConverter(writer); } switch(type.getOriginalType()) { + case UINT_64: + case INT_64 : { + BigIntWriter writer = type.getRepetition() == Repetition.REPEATED ? 
mapWriter.list(name).bigInt() : mapWriter.bigInt(name); + return new DrillBigIntConverter(writer); + } case DECIMAL: { ParquetReaderUtility.checkDecimalTypeEnabled(options); Decimal18Writer writer = type.getRepetition() == Repetition.REPEATED @@ -267,6 +281,10 @@ public class DrillParquetGroupConverter extends GroupConverter { VarCharWriter writer = type.getRepetition() == Repetition.REPEATED ? mapWriter.list(name).varChar() : mapWriter.varChar(name); return new DrillVarCharConverter(writer, mutator.getManagedBuffer()); } + case ENUM: { + VarCharWriter writer = type.getRepetition() == Repetition.REPEATED ? mapWriter.list(name).varChar() : mapWriter.varChar(name); + return new DrillVarCharConverter(writer, mutator.getManagedBuffer()); + } //TODO not sure if BINARY/DECIMAL is actually supported case DECIMAL: { ParquetReaderUtility.checkDecimalTypeEnabled(options); http://git-wip-us.apache.org/repos/asf/drill/blob/90eb23ba/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/ParquetSimpleTestFileGenerator.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/ParquetSimpleTestFileGenerator.java b/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/ParquetSimpleTestFileGenerator.java new file mode 100644 index 0000000..720498b --- /dev/null +++ b/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/ParquetSimpleTestFileGenerator.java @@ -0,0 +1,461 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.drill.exec.store.parquet; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.column.ParquetProperties; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.example.data.GroupFactory; +import org.apache.parquet.example.data.simple.SimpleGroupFactory; +import org.apache.parquet.hadoop.ParquetFileWriter; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.example.GroupWriteSupport; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.MessageTypeParser; + +import java.io.IOException; +import java.util.Arrays; + +import static org.apache.drill.exec.store.parquet.ParquetSimpleTestFileGenerator.EnumType.MAX_VALUE; +import static org.apache.drill.exec.store.parquet.ParquetSimpleTestFileGenerator.EnumType.MIN_VALUE; +import static org.apache.drill.exec.store.parquet.ParquetSimpleTestFileGenerator.EnumType.RANDOM_VALUE; + + +/** + * Use the Parquet examples to build a simple to use Parquet test file generator. + * Files currently generated by the main program were renamed to be used in the unit tests for DRILL-5971 + * and cover all logical types defined in <a href="https://github.com/apache/parquet-format/blob/master/LogicalTypes.md">Parquet Logical Types </a> + * that are supported by Drill. 
Embedded types specified in the Parquet specification are not covered by the + * examples but can be added. + * To create a new parquet file, define a schema, create a GroupWriter based on the schema, then add values + * for individual records to the GroupWriter. + * @see org.apache.drill.exec.store.parquet.TestFileGenerator TestFileGenerator + * @see org.apache.parquet.hadoop.example.GroupWriteSupport GroupWriteSupport + * @see org.apache.parquet.example.Paper Dremel Example + */ +public class ParquetSimpleTestFileGenerator { + + public enum EnumType { + RANDOM_VALUE, MAX_VALUE, MIN_VALUE; + } + + public static Path root = new Path("file:/tmp/parquet/"); + public static Configuration conf = new Configuration(); + + public static String simpleSchemaMsg = + "message ParquetLogicalDataTypes { \n" + + " required int32 rowKey; \n" + + " required binary _UTF8 ( UTF8 ) ; \n" + + " required binary _Enum ( ENUM ) ; \n" + + // " required binary _UUID ( UUID ) ; \n" + + " required int32 _INT32_RAW ; \n" + + " required int32 _INT_8 ( INT_8 ) ; \n" + + " required int32 _INT_16 ( INT_16 ) ; \n" + + " required int32 _INT_32 ( INT_32 ) ; \n" + + " required int32 _UINT_8 ( UINT_8 ) ; \n" + + " required int32 _UINT_16 ( UINT_16 ) ; \n" + + " required int32 _UINT_32 ( UINT_32 ) ; \n" + + " required int32 _DECIMAL_decimal9 ( DECIMAL (9,2) ) ; \n" + + " required int64 _INT64_RAW ; \n" + + " required int64 _INT_64 ( INT_64 ) ; \n" + + " required int64 _UINT_64 ( UINT_64 ) ; \n" + + " required int64 _DECIMAL_decimal18 ( DECIMAL (18,2) ) ; \n" + + " required fixed_len_byte_array(20) _DECIMAL_fixed_n ( DECIMAL (20, 2) ) ; \n" + + " required binary _DECIMAL_unlimited ( DECIMAL (30,2) ) ; \n" + + " required int32 _DATE_int32 ( DATE ) ; \n" + + " required int32 _TIME_MILLIS_int32 ( TIME_MILLIS ) ; \n" + + // " required int64 _TIME_MICROS_int64 ( TIME_MICROS ) ; \n" + + " required int64 _TIMESTAMP_MILLIS_int64 ( TIMESTAMP_MILLIS ) ; \n" + + // " required int64 _TIMESTAMP_MICROS_int64 ( 
TIMESTAMP_MICROS ) ; \n" + + " required fixed_len_byte_array(12) _INTERVAL_fixed_len_byte_array_12 ( INTERVAL ) ; \n" + + " required int96 _INT96_RAW ; \n" + + "} \n" ; + public static String simpleNullableSchemaMsg = + "message ParquetLogicalDataTypes { \n" + + " required int32 rowKey; \n" + + " optional binary _UTF8 ( UTF8 ) ; \n" + + " optional binary _Enum ( ENUM ) ; \n" + + // " optional binary _UUID ( UUID ) ; \n" + + " optional int32 _INT32_RAW ; \n" + + " optional int32 _INT_8 ( INT_8 ) ; \n" + + " optional int32 _INT_16 ( INT_16 ) ; \n" + + " optional int32 _INT_32 ( INT_32 ) ; \n" + + " optional int32 _UINT_8 ( UINT_8 ) ; \n" + + " optional int32 _UINT_16 ( UINT_16 ) ; \n" + + " optional int32 _UINT_32 ( UINT_32 ) ; \n" + + " optional int32 _DECIMAL_decimal9 ( DECIMAL (9,2) ) ; \n" + + " optional int64 _INT64_RAW ; \n" + + " optional int64 _INT_64 ( INT_64 ) ; \n" + + " optional int64 _UINT_64 ( UINT_64 ) ; \n" + + " optional int64 _DECIMAL_decimal18 ( DECIMAL (18,2) ) ; \n" + + " optional fixed_len_byte_array(20) _DECIMAL_fixed_n ( DECIMAL (20, 2) ) ; \n" + + " optional binary _DECIMAL_unlimited ( DECIMAL (30,2) ) ; \n" + + " optional int32 _DATE_int32 ( DATE ) ; \n" + + " optional int32 _TIME_MILLIS_int32 ( TIME_MILLIS ) ; \n" + + // " optional int64 _TIME_MICROS_int64 ( TIME_MICROS ) ; \n" + + " optional int64 _TIMESTAMP_MILLIS_int64 ( TIMESTAMP_MILLIS ) ; \n" + + // " optional int64 _TIMESTAMP_MICROS_int64 ( TIMESTAMP_MICROS ) ; \n" + + " optional fixed_len_byte_array(12) _INTERVAL_fixed_len_byte_array_12 ( INTERVAL ) ; \n" + + " optional int96 _INT96_RAW ; \n" + + "} \n" ; + + public static String complexSchemaMsg = + "message ParquetLogicalDataTypes { \n" + + " required int32 rowKey; \n" + + " required group StringTypes { \n" + + " required binary _UTF8 ( UTF8 ) ; \n" + + " required binary _Enum ( ENUM ) ; \n" + + // " required binary _UUID ( UUID ) ; \n" + + " } \n" + + " required group NumericTypes { \n" + + " required group Int32 { \n" + + " 
required int32 _INT32_RAW ; \n" + + " required int32 _INT_8 ( INT_8 ) ; \n" + + " required int32 _INT_16 ( INT_16 ) ; \n" + + " required int32 _INT_32 ( INT_32 ) ; \n" + + " required int32 _UINT_8 ( UINT_8 ) ; \n" + + " required int32 _UINT_16 ( UINT_16 ) ; \n" + + " required int32 _UINT_32 ( UINT_32 ) ; \n" + + " required int32 _DECIMAL_decimal9 ( DECIMAL (9,2) ) ; \n" + + " } \n" + + " required group Int64 { \n" + + " required int64 _INT64_RAW ; \n" + + " required int64 _INT_64 ( INT_64 ) ; \n" + + " required int64 _UINT_64 ( UINT_64 ) ; \n" + + " required int64 _DECIMAL_decimal18 ( DECIMAL (18,2) ) ; \n" + + " } \n" + + " required group FixedLen { \n" + + " required fixed_len_byte_array(20) _DECIMAL_fixed_n ( DECIMAL (20, 2) ) ; \n" + + " } \n" + + " required group Binary { \n" + + " required binary _DECIMAL_unlimited ( DECIMAL (30,2) ) ; \n" + + " } \n" + + " required group DateTimeTypes { \n" + + " required int32 _DATE_int32 ( DATE ) ; \n" + + " required int32 _TIME_MILLIS_int32 ( TIME_MILLIS ) ; \n" + + // " required int64 _TIME_MICROS_int64 ( TIME_MICROS ) ; \n" + + " required int64 _TIMESTAMP_MILLIS_int64 ( TIMESTAMP_MILLIS ) ; \n" + + // " required int64 _TIMESTAMP_MICROS_int64 ( TIMESTAMP_MICROS ) ; \n" + + " required fixed_len_byte_array(12) _INTERVAL_fixed_len_byte_array_12 ( INTERVAL ) ; \n" + + " } \n" + + " required group Int96 { \n" + + " required int96 _INT96_RAW ; \n" + + " } \n" + + " } \n" + + "} \n" ; + public static String complexNullableSchemaMsg = + "message ParquetLogicalDataTypes { \n" + + " required int32 rowKey; \n" + + " optional group StringTypes { \n" + + " optional binary _UTF8 ( UTF8 ) ; \n" + + " optional binary _Enum ( ENUM ) ; \n" + + // " optional binary _UUID ( UUID ) ; \n" + + " } \n" + + " optional group NumericTypes { \n" + + " optional group Int32 { \n" + + " optional int32 _INT32_RAW ; \n" + + " optional int32 _INT_8 ( INT_8 ) ; \n" + + " optional int32 _INT_16 ( INT_16 ) ; \n" + + " optional int32 _INT_32 ( INT_32 ) ; \n" 
+ + " optional int32 _UINT_8 ( UINT_8 ) ; \n" + + " optional int32 _UINT_16 ( UINT_16 ) ; \n" + + " optional int32 _UINT_32 ( UINT_32 ) ; \n" + + " optional int32 _DECIMAL_decimal9 ( DECIMAL (9,2) ) ; \n" + + " } \n" + + " optional group Int64 { \n" + + " optional int64 _INT64_RAW ; \n" + + " optional int64 _INT_64 ( INT_64 ) ; \n" + + " optional int64 _UINT_64 ( UINT_64 ) ; \n" + + " optional int64 _DECIMAL_decimal18 ( DECIMAL (18,2) ) ; \n" + + " } \n" + + " optional group FixedLen { \n" + + " optional fixed_len_byte_array(20) _DECIMAL_fixed_n ( DECIMAL (20, 2) ) ; \n" + + " } \n" + + " optional group Binary { \n" + + " optional binary _DECIMAL_unlimited ( DECIMAL (30,2) ) ; \n" + + " } \n" + + " optional group DateTimeTypes { \n" + + " optional int32 _DATE_int32 ( DATE ) ; \n" + + " optional int32 _TIME_MILLIS_int32 ( TIME_MILLIS ) ; \n" + + // " optional int64 _TIME_MICROS_int64 ( TIME_MICROS ) ; \n" + + " optional int64 _TIMESTAMP_MILLIS_int64 ( TIMESTAMP_MILLIS ) ; \n" + + // " optional int64 _TIMESTAMP_MICROS_int64 ( TIMESTAMP_MICROS ) ; \n" + + " optional fixed_len_byte_array(12) _INTERVAL_fixed_len_byte_array_12 ( INTERVAL ) ; \n" + + " } \n" + + " optional group Int96 { \n" + + " optional int96 _INT96_RAW ; \n" + + " } \n" + + " } \n" + + "} \n" ; + + public static MessageType simpleSchema = MessageTypeParser.parseMessageType(simpleSchemaMsg); + public static MessageType complexSchema = MessageTypeParser.parseMessageType(complexSchemaMsg); + public static MessageType simpleNullableSchema = MessageTypeParser.parseMessageType(simpleNullableSchemaMsg); + public static MessageType complexNullableSchema = MessageTypeParser.parseMessageType(complexNullableSchemaMsg); + + + public static Path initFile(String fileName) { + return new Path(root, fileName); + } + + public static ParquetWriter<Group> initWriter(MessageType schema, String fileName) throws IOException { + + GroupWriteSupport.setSchema(schema, conf); + + ParquetWriter<Group> writer = + new 
ParquetWriter<Group>(initFile(fileName), + ParquetFileWriter.Mode.OVERWRITE, + new GroupWriteSupport(), + CompressionCodecName.SNAPPY, + 1024, + 1024, + 512, + true, // enable dictionary encoding, + false, + ParquetProperties.WriterVersion.PARQUET_1_0, conf + ); + + return writer; + } + + public static void writeComplexValues(GroupFactory gf, ParquetWriter<Group> complexWriter, boolean writeNulls) throws IOException { + int rowKey = 0; + byte[] bytes12 = {'1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'a', 'b' }; + // Write complex values + { + Group complexGroup = gf.newGroup(); + complexGroup.add("rowKey", ++rowKey); + complexGroup.addGroup("StringTypes").append("_UTF8", "UTF8 string" + rowKey).append("_Enum", RANDOM_VALUE + .toString()); + // .append("_UUID", "00112233445566778899aabbccddeeff"); + Group numeric = complexGroup.addGroup("NumericTypes"); + numeric.addGroup("Int32") + .append("_INT32_RAW", 1234567) + .append("_INT_8", 123) + .append("_INT_16", 12345) + .append("_INT_32", 1234567) + .append("_UINT_8", 123) + .append("_UINT_16", 1234) + .append("_UINT_32", 1234567) + .append("_DECIMAL_decimal9", 1234567); + numeric.addGroup("Int64") + .append("_INT64_RAW", 1234567890123456L) + .append("_INT_64", 1234567890123456L) + .append("_UINT_64", 1234567890123456L) + .append("_DECIMAL_decimal18", 1234567890123456L); + numeric.addGroup("FixedLen").append("_DECIMAL_fixed_n", "12345678901234567890"); + numeric.addGroup("Binary").append("_DECIMAL_unlimited", "123456789012345678901234567890"); + numeric.addGroup("DateTimeTypes") + .append("_DATE_int32", 1234567) + .append("_TIME_MILLIS_int32", 1234567) + .append("_TIMESTAMP_MILLIS_int64", 123456789012L) + .append("_INTERVAL_fixed_len_byte_array_12", Binary.fromConstantByteArray(bytes12, 0, 12)); + numeric.addGroup("Int96").append("_INT96_RAW", Binary.fromConstantByteArray(bytes12, 0, 12)); + complexWriter.write(complexGroup); + } + { + Group complexGroup = gf.newGroup(); + complexGroup.add("rowKey", ++rowKey); + 
complexGroup.addGroup("StringTypes").append("_UTF8", "UTF8 string" + rowKey).append("_Enum", MAX_VALUE + .toString()); + // .append("_UUID", "00112233445566778899aabbccddeeff"); + Group numeric = complexGroup.addGroup("NumericTypes"); + numeric.addGroup("Int32") + .append("_INT32_RAW", 0x7FFFFFFF) + .append("_INT_8", 0x7F) + .append("_INT_16", 0x7FFF) + .append("_INT_32", 0x7FFFFFFF) + .append("_UINT_8", 0xFF) + .append("_UINT_16", 0xFFFF) + .append("_UINT_32", 0xFFFFFFFF) + .append("_DECIMAL_decimal9", 0xFFFFFFFF); + numeric.addGroup("Int64") + .append("_INT64_RAW", 0x7FFFFFFFFFFFFFFFL) + .append("_INT_64", 0x7FFFFFFFFFFFFFFFL) + .append("_UINT_64", 0xFFFFFFFFFFFFFFFFL) + .append("_DECIMAL_decimal18", 0xFFFFFFFFFFFFFFFFL); + byte[] bytes = new byte[30]; Arrays.fill(bytes, (byte)1); + numeric.addGroup("FixedLen").append("_DECIMAL_fixed_n", Binary.fromConstantByteArray(bytes, 0, 20)); + numeric.addGroup("Binary").append("_DECIMAL_unlimited", Binary.fromConstantByteArray(bytes, 0, 30)); + numeric.addGroup("DateTimeTypes") + .append("_DATE_int32", 0xFFFFFFFF) + .append("_TIME_MILLIS_int32", 0xFFFFFFFF) + .append("_TIMESTAMP_MILLIS_int64", 0x1F3FFFFFFFFL) + .append("_INTERVAL_fixed_len_byte_array_12", Binary.fromConstantByteArray(bytes, 0, 12)); + numeric.addGroup("Int96").append("_INT96_RAW", Binary.fromConstantByteArray(bytes, 0, 12)); + complexWriter.write(complexGroup); + } + { + Group complexGroup = gf.newGroup(); + complexGroup.add("rowKey", ++rowKey); + complexGroup.addGroup("StringTypes").append("_UTF8", "UTF8 string" + rowKey).append("_Enum", MIN_VALUE + .toString()); + // .append("_UUID", "00112233445566778899aabbccddeeff"); + Group numeric = complexGroup.addGroup("NumericTypes"); + numeric.addGroup("Int32") + .append("_INT32_RAW", 0x80000000) + .append("_INT_8", 0xFFFFFF80) + .append("_INT_16", 0xFFFF8000) + .append("_INT_32", 0x80000000) + .append("_UINT_8", 0x0) + .append("_UINT_16", 0x0) + .append("_UINT_32", 0x0) + .append("_DECIMAL_decimal9", 0x0); + 
numeric.addGroup("Int64") + .append("_INT64_RAW", 0x8000000000000000L) + .append("_INT_64", 0x8000000000000000L) + .append("_UINT_64", 0x0L) + .append("_DECIMAL_decimal18", 0x0L); + numeric.addGroup("FixedLen").append("_DECIMAL_fixed_n", Binary.fromConstantByteArray(new byte[20], 0, 20)); + numeric.addGroup("Binary").append("_DECIMAL_unlimited", Binary.fromConstantByteArray(new byte[30], 0, 30)); + numeric.addGroup("DateTimeTypes") + .append("_DATE_int32", 0x0) + .append("_TIME_MILLIS_int32", 0x0) + .append("_TIMESTAMP_MILLIS_int64", 0x0L) + .append("_INTERVAL_fixed_len_byte_array_12", Binary.fromConstantByteArray( new byte[12], 0, 12)); + numeric.addGroup("Int96").append("_INT96_RAW", Binary.fromConstantByteArray( new byte[12], 0, 12)); + complexWriter.write(complexGroup); + } + if (writeNulls) { + Group simpleGroup = gf.newGroup(); + simpleGroup.append("rowKey", ++rowKey); + complexWriter.write(simpleGroup); + } + + } + + + public static void writeSimpleValues(SimpleGroupFactory sgf, ParquetWriter<Group> simpleWriter, boolean writeNulls) throws IOException { + int rowKey = 0; + // Write simple values + { + byte[] bytes12 = {'1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'a', 'b' }; + Group simpleGroup = sgf.newGroup(); + simpleGroup.append("rowKey", ++rowKey); + simpleGroup.append("_UTF8", "UTF8 string" + rowKey).append("_Enum", RANDOM_VALUE.toString()) + // .append("_UUID", "00112233445566778899aabbccddeeff"); + .append("_INT32_RAW", 1234567) + .append("_INT_8", 123) + .append("_INT_16", 12345) + .append("_INT_32", 1234567) + .append("_UINT_8", 123) + .append("_UINT_16", 1234) + .append("_UINT_32", 1234567) + .append("_DECIMAL_decimal9", 1234567) + .append("_INT64_RAW", 1234567890123456L) + .append("_INT_64", 1234567890123456L) + .append("_UINT_64", 1234567890123456L) + .append("_DECIMAL_decimal18", 1234567890123456L) + .append("_DECIMAL_fixed_n", "12345678901234567890") + .append("_DECIMAL_unlimited", "123456789012345678901234567890") + 
.append("_DATE_int32", 1234567) + .append("_TIME_MILLIS_int32", 1234567) + .append("_TIMESTAMP_MILLIS_int64", 123456789012L) + .append("_INTERVAL_fixed_len_byte_array_12", Binary.fromConstantByteArray(bytes12, 0, 12)) + .append("_INT96_RAW", Binary.fromConstantByteArray(bytes12, 0, 12)); + simpleWriter.write(simpleGroup); + } + { + Group simpleGroup = sgf.newGroup(); + byte[] bytes = new byte[30]; Arrays.fill(bytes, (byte)1); + simpleGroup.append("rowKey", ++rowKey); + simpleGroup.append("_UTF8", "UTF8 string" + rowKey) + .append("_Enum", MAX_VALUE.toString()) + // .append("_UUID", "00112233445566778899aabbccddeeff"); + .append("_INT32_RAW", 0x7FFFFFFF) + .append("_INT_8", 0x7F) + .append("_INT_16", 0x7FFF) + .append("_INT_32", 0x7FFFFFFF) + .append("_UINT_8", 0xFF) + .append("_UINT_16", 0xFFFF) + .append("_UINT_32", 0xFFFFFFFF) + .append("_DECIMAL_decimal9", 0xFFFFFFFF) + .append("_INT64_RAW", 0x7FFFFFFFFFFFFFFFL) + .append("_INT_64", 0x7FFFFFFFFFFFFFFFL) + .append("_UINT_64", 0xFFFFFFFFFFFFFFFFL) + .append("_DECIMAL_decimal18", 0xFFFFFFFFFFFFFFFFL) + .append("_DECIMAL_fixed_n", Binary.fromConstantByteArray(bytes, 0, 20)) + .append("_DECIMAL_unlimited", Binary.fromConstantByteArray(bytes, 0, 30)) + .append("_DATE_int32", 0xFFFFFFFF) + .append("_TIME_MILLIS_int32", 0xFFFFFFFF) + .append("_TIMESTAMP_MILLIS_int64", 0x1F3FFFFFFFFL) + .append("_INTERVAL_fixed_len_byte_array_12", Binary.fromConstantByteArray(bytes, 0, 12)) + .append("_INT96_RAW", Binary.fromConstantByteArray(bytes, 0, 12)); + simpleWriter.write(simpleGroup); + } + { + Group simpleGroup = sgf.newGroup(); + simpleGroup.append("rowKey", ++rowKey); + simpleGroup.append("_UTF8", "UTF8 string" + rowKey).append("_Enum", MIN_VALUE.toString()) + // .append("_UUID", "00112233445566778899aabbccddeeff"); + .append("_INT32_RAW", 0x80000000) + .append("_INT_8", 0xFFFFFF80) + .append("_INT_16", 0xFFFF8000) + .append("_INT_32", 0x80000000) + .append("_UINT_8", 0x0) + .append("_UINT_16", 0x0) + .append("_UINT_32", 0x0) 
+ .append("_DECIMAL_decimal9", 0x0) + .append("_INT64_RAW", 0x8000000000000000L) + .append("_INT_64", 0x8000000000000000L) + .append("_UINT_64", 0x0L) + .append("_DECIMAL_decimal18", 0x0L) + .append("_DECIMAL_fixed_n", Binary.fromConstantByteArray(new byte[20], 0, 20)) + .append("_DECIMAL_unlimited", Binary.fromConstantByteArray(new byte[30], 0, 30)) + .append("_DATE_int32", 0x0) + .append("_TIME_MILLIS_int32", 0x0) + .append("_TIMESTAMP_MILLIS_int64", 0x0L) + .append("_INTERVAL_fixed_len_byte_array_12", Binary.fromConstantByteArray( new byte[12], 0, 12)) + .append("_INT96_RAW", Binary.fromConstantByteArray( new byte[12], 0, 12)); + simpleWriter.write(simpleGroup); + } + if (writeNulls) { + Group simpleGroup = sgf.newGroup(); + simpleGroup.append("rowKey", ++rowKey); + simpleWriter.write(simpleGroup); + } + } + + public static void main(String[] args) throws IOException { + + SimpleGroupFactory sgf = new SimpleGroupFactory(simpleSchema); + GroupFactory gf = new SimpleGroupFactory(complexSchema); + SimpleGroupFactory sngf = new SimpleGroupFactory(simpleNullableSchema); + GroupFactory ngf = new SimpleGroupFactory(complexNullableSchema); + + ParquetWriter<Group> simpleWriter = initWriter(simpleSchema, "drill/parquet_test_file_simple"); + ParquetWriter<Group> complexWriter = initWriter(complexSchema, "drill/parquet_test_file_complex"); + ParquetWriter<Group> simpleNullableWriter = initWriter(simpleNullableSchema, "drill/parquet_test_file_simple_nullable"); + ParquetWriter<Group> complexNullableWriter = initWriter(complexNullableSchema, "drill/parquet_test_file_complex_nullable"); + + ParquetSimpleTestFileGenerator.writeSimpleValues(sgf, simpleWriter, false); + ParquetSimpleTestFileGenerator.writeSimpleValues(sngf, simpleNullableWriter, true); + ParquetSimpleTestFileGenerator.writeComplexValues(gf, complexWriter, false); + ParquetSimpleTestFileGenerator.writeComplexValues(ngf, complexNullableWriter, true); + + simpleWriter.close(); + complexWriter.close(); + 
simpleNullableWriter.close(); + complexNullableWriter.close(); + + } + +} http://git-wip-us.apache.org/repos/asf/drill/blob/90eb23ba/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestParquetComplex.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestParquetComplex.java b/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestParquetComplex.java index 7c148cb..8252834 100644 --- a/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestParquetComplex.java +++ b/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet/TestParquetComplex.java @@ -18,8 +18,15 @@ package org.apache.drill.exec.store.parquet; import org.apache.drill.test.BaseTestQuery; +import org.joda.time.DateTime; +import org.joda.time.DateTimeZone; +import org.joda.time.Period; import org.junit.Test; +import java.util.Arrays; + +import static org.apache.drill.test.TestBuilder.mapOf; + public class TestParquetComplex extends BaseTestQuery { private static final String DATAFILE = "cp.`store/parquet/complex/complex.parquet`"; @@ -193,4 +200,174 @@ public class TestParquetComplex extends BaseTestQuery { .run(); } + @Test //DRILL-5971 + public void testComplexLogicalIntTypes() throws Exception { + String query = String.format("select t.complextype as complextype, " + + "t.uint_64 as uint_64, t.uint_32 as uint_32, t.uint_16 as uint_16, t.uint_8 as uint_8, " + + "t.int_64 as int_64, t.int_32 as int_32, t.int_16 as int_16, t.int_8 as int_8 " + + "from cp.`store/parquet/complex/logical_int_complex.parquet` t" ); + String[] columns = {"complextype", "uint_64", "uint_32", "uint_16", "uint_8", "int_64", "int_32", "int_16", "int_8" }; + testBuilder() + .sqlQuery(query) + .unOrdered() + .baselineColumns(columns) + .baselineValues(mapOf("a","a","b","b") , 0L , 0 , 0 , 0 , 0L , 0 , 0 ,0 ) + .baselineValues(mapOf("a","a","b","b") , -1L , -1 , -1 , -1 , -1L , -1 
, -1 , -1 ) + .baselineValues(mapOf("a","a","b","b") , 1L , 1 , 1 , 1 , -9223372036854775808L , 1 , 1 , 1 ) + .baselineValues(mapOf("a","a","b","b") , 9223372036854775807L , 2147483647 , 65535 , 255 , 9223372036854775807L , -2147483648 , -32768 , -128 ) + .build() + .run(); + } + + @Test //DRILL-5971 + public void testComplexLogicalIntTypes2() throws Exception { + byte[] bytes12 = {'1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'a', 'b' }; + byte[] bytesOnes = new byte[12]; + byte[] bytesZeros = new byte[12]; + Arrays.fill(bytesOnes, (byte) 1); + String query = String.format( + " select " + + " t.rowKey as rowKey, " + + " t.StringTypes._UTF8 as _UTF8, " + + " t.StringTypes._Enum as _Enum, " + + " t.NumericTypes.Int32._INT32_RAW as _INT32_RAW, " + + " t.NumericTypes.Int32._INT_8 as _INT_8, " + + " t.NumericTypes.Int32._INT_16 as _INT_16, " + + " t.NumericTypes.Int32._INT_32 as _INT_32, " + + " t.NumericTypes.Int32._UINT_8 as _UINT_8, " + + " t.NumericTypes.Int32._UINT_16 as _UINT_16, " + + " t.NumericTypes.Int32._UINT_32 as _UINT_32, " + + " t.NumericTypes.Int64._INT64_RAW as _INT64_RAW, " + + " t.NumericTypes.Int64._INT_64 as _INT_64, " + + " t.NumericTypes.Int64._UINT_64 as _UINT_64, " + + " t.NumericTypes.DateTimeTypes._DATE_int32 as _DATE_int32, " + + " t.NumericTypes.DateTimeTypes._TIME_MILLIS_int32 as _TIME_MILLIS_int32, " + + " t.NumericTypes.DateTimeTypes._TIMESTAMP_MILLIS_int64 as _TIMESTAMP_MILLIS_int64, " + + " t.NumericTypes.DateTimeTypes._INTERVAL_fixed_len_byte_array_12 as _INTERVAL_fixed_len_byte_array_12, " + + " t.NumericTypes.Int96._INT96_RAW as _INT96_RAW " + + " from " + + " cp.`store/parquet/complex/parquet_logical_types_complex.parquet` t " + + " order by t.rowKey " + ); + String[] columns = { + "rowKey " , + "_UTF8" , + "_Enum" , + "_INT32_RAW" , + "_INT_8" , + "_INT_16" , + "_INT_32" , + "_UINT_8" , + "_UINT_16" , + "_UINT_32" , + "_INT64_RAW" , + "_INT_64" , + "_UINT_64" , + "_DATE_int32" , + "_TIME_MILLIS_int32" , + 
"_TIMESTAMP_MILLIS_int64" , + "_INTERVAL_fixed_len_byte_array_12" , + "_INT96_RAW" + + }; + testBuilder() + .sqlQuery(query) + .ordered() + .baselineColumns(columns) + .baselineValues(1, "UTF8 string1", "RANDOM_VALUE", 1234567, 123, 12345, 1234567, 123, 1234, 1234567, + 1234567890123456L, 1234567890123456L, 1234567890123456L, new DateTime("5350-02-17"), + new DateTime(1234567, DateTimeZone.UTC).withZoneRetainFields(DateTimeZone.getDefault()), + new DateTime("1973-11-29T21:33:09.012"), + new Period().plusMonths(875770417).plusDays(943142453).plusMillis(1650536505), + bytes12) + .baselineValues(2, "UTF8 string2", "MAX_VALUE", 2147483647, 127, 32767, 2147483647, 255, 65535, -1, + 9223372036854775807L, 9223372036854775807L, -1L, new DateTime("1969-12-31"), + new DateTime(0xFFFFFFFF, DateTimeZone.UTC).withZoneRetainFields(DateTimeZone.getDefault()), + new DateTime("2038-01-19T03:14:07.999"), + new Period().plusMonths(16843009).plusDays(16843009).plusMillis(16843009), + bytesOnes) + .baselineValues(3, "UTF8 string3", "MIN_VALUE", -2147483648, -128, -32768, -2147483648, 0, 0, 0, + -9223372036854775808L, -9223372036854775808L, 0L, new DateTime("1970-01-01"), + new DateTime(0, DateTimeZone.UTC).withZoneRetainFields(DateTimeZone.getDefault()), + new DateTime("1970-01-01T00:00:00.0"), new Period("PT0S"), bytesZeros) + .build() + .run(); + } + + @Test //DRILL-5971 + public void testComplexLogicalIntTypes3() throws Exception { + byte[] bytes12 = {'1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'a', 'b' }; + byte[] bytesOnes = new byte[12]; + byte[] bytesZeros = new byte[12]; + Arrays.fill(bytesOnes, (byte) 1); + String query = String.format( + " select " + + " t.rowKey as rowKey, " + + " t.StringTypes._UTF8 as _UTF8, " + + " t.StringTypes._Enum as _Enum, " + + " t.NumericTypes.Int32._INT32_RAW as _INT32_RAW, " + + " t.NumericTypes.Int32._INT_8 as _INT_8, " + + " t.NumericTypes.Int32._INT_16 as _INT_16, " + + " t.NumericTypes.Int32._INT_32 as _INT_32, " + + " 
t.NumericTypes.Int32._UINT_8 as _UINT_8, " + + " t.NumericTypes.Int32._UINT_16 as _UINT_16, " + + " t.NumericTypes.Int32._UINT_32 as _UINT_32, " + + " t.NumericTypes.Int64._INT64_RAW as _INT64_RAW, " + + " t.NumericTypes.Int64._INT_64 as _INT_64, " + + " t.NumericTypes.Int64._UINT_64 as _UINT_64, " + + " t.NumericTypes.DateTimeTypes._DATE_int32 as _DATE_int32, " + + " t.NumericTypes.DateTimeTypes._TIME_MILLIS_int32 as _TIME_MILLIS_int32, " + + " t.NumericTypes.DateTimeTypes._TIMESTAMP_MILLIS_int64 as _TIMESTAMP_MILLIS_int64, " + + " t.NumericTypes.DateTimeTypes._INTERVAL_fixed_len_byte_array_12 as _INTERVAL_fixed_len_byte_array_12, " + + " t.NumericTypes.Int96._INT96_RAW as _INT96_RAW " + + " from " + + " cp.`store/parquet/complex/parquet_logical_types_complex_nullable.parquet` t " + + " order by t.rowKey " + ); + String[] columns = { + "rowKey " , + "_UTF8" , + "_Enum" , + "_INT32_RAW" , + "_INT_8" , + "_INT_16" , + "_INT_32" , + "_UINT_8" , + "_UINT_16" , + "_UINT_32" , + "_INT64_RAW" , + "_INT_64" , + "_UINT_64" , + "_DATE_int32" , + "_TIME_MILLIS_int32" , + "_TIMESTAMP_MILLIS_int64" , + "_INTERVAL_fixed_len_byte_array_12" , + "_INT96_RAW" + + }; + testBuilder() + .sqlQuery(query) + .ordered() + .baselineColumns(columns) + .baselineValues(1, "UTF8 string1", "RANDOM_VALUE", 1234567, 123, 12345, 1234567, 123, 1234, 1234567, + 1234567890123456L, 1234567890123456L, 1234567890123456L, new DateTime("5350-02-17"), + new DateTime(1234567, DateTimeZone.UTC).withZoneRetainFields(DateTimeZone.getDefault()), + new DateTime("1973-11-29T21:33:09.012"), + new Period().plusMonths(875770417).plusDays(943142453).plusMillis(1650536505), + bytes12) + .baselineValues(2, "UTF8 string2", "MAX_VALUE", 2147483647, 127, 32767, 2147483647, 255, 65535, -1, + 9223372036854775807L, 9223372036854775807L, -1L, new DateTime("1969-12-31"), + new DateTime(0xFFFFFFFF, DateTimeZone.UTC).withZoneRetainFields(DateTimeZone.getDefault()), + new DateTime("2038-01-19T03:14:07.999"), + new 
Period().plusMonths(16843009).plusDays(16843009).plusMillis(16843009), + bytesOnes) + .baselineValues(3, "UTF8 string3", "MIN_VALUE", -2147483648, -128, -32768, -2147483648, 0, 0, 0, + -9223372036854775808L, -9223372036854775808L, 0L, new DateTime("1970-01-01"), + new DateTime(0, DateTimeZone.UTC).withZoneRetainFields(DateTimeZone.getDefault()), + new DateTime("1970-01-01T00:00:00.0"), new Period("PT0S"), bytesZeros) + .baselineValues(4, null, null, null, null, null, null, null, null, null, null, null, null, null, + null, null, null, null) + .build().run(); + } + } http://git-wip-us.apache.org/repos/asf/drill/blob/90eb23ba/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet2/TestDrillParquetReader.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet2/TestDrillParquetReader.java b/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet2/TestDrillParquetReader.java index ec2651d..38f0fa0 100644 --- a/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet2/TestDrillParquetReader.java +++ b/exec/java-exec/src/test/java/org/apache/drill/exec/store/parquet2/TestDrillParquetReader.java @@ -19,11 +19,15 @@ package org.apache.drill.exec.store.parquet2; import org.apache.drill.test.BaseTestQuery; import org.apache.drill.exec.planner.physical.PlannerSettings; +import org.joda.time.DateTime; +import org.joda.time.DateTimeZone; +import org.joda.time.Period; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; import java.math.BigDecimal; +import java.util.Arrays; public class TestDrillParquetReader extends BaseTestQuery { // enable decimal data type @@ -98,4 +102,173 @@ public class TestDrillParquetReader extends BaseTestQuery { .baselineValues(null, null, null, null, null, null, 0, 0, 0, 0L, 0, 0) .go(); } + + @Test + public void testLogicalIntTypes() throws Exception { + String query = String.format("select " + + 
"t.uint_64 as uint_64, t.uint_32 as uint_32, t.uint_16 as uint_16, t.uint_8 as uint_8, " + + "t.int_64 as int_64, t.int_32 as int_32, t.int_16 as int_16, t.int_8 as int_8 " + + "from cp.`parquet/logical_int.parquet` t" ); + String[] columns = {"uint_64", "uint_32", "uint_16", "uint_8", "int_64", "int_32", "int_16", "int_8" }; + testBuilder() + .sqlQuery(query) + .unOrdered() + .baselineColumns(columns) + .baselineValues( 0L , 0 , 0 , 0 , 0L , 0 , 0 ,0 ) + .baselineValues( -1L , -1 , -1 , -1 , -1L , -1 , -1 , -1 ) + .baselineValues( 1L , 1 , 1 , 1 , -9223372036854775808L , 1 , 1 , 1 ) + .baselineValues( 9223372036854775807L , 2147483647 , 65535 , 255 , 9223372036854775807L , -2147483648 , -32768 , -128 ) + .build() + .run(); + } + + @Test //DRILL-5971 + public void testLogicalIntTypes2() throws Exception { + byte[] bytes12 = {'1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'a', 'b' }; + byte[] bytesOnes = new byte[12]; Arrays.fill(bytesOnes, (byte)1); + byte[] bytesZeros = new byte[12]; + String query = String.format( + " select " + + " t.rowKey as rowKey, " + + " t._UTF8 as _UTF8, " + + " t._Enum as _Enum, " + + " t._INT32_RAW as _INT32_RAW, " + + " t._INT_8 as _INT_8, " + + " t._INT_16 as _INT_16, " + + " t._INT_32 as _INT_32, " + + " t._UINT_8 as _UINT_8, " + + " t._UINT_16 as _UINT_16, " + + " t._UINT_32 as _UINT_32, " + + " t._INT64_RAW as _INT64_RAW, " + + " t._INT_64 as _INT_64, " + + " t._UINT_64 as _UINT_64, " + + " t._DATE_int32 as _DATE_int32, " + + " t._TIME_MILLIS_int32 as _TIME_MILLIS_int32, " + + " t._TIMESTAMP_MILLIS_int64 as _TIMESTAMP_MILLIS_int64, " + + " t._INTERVAL_fixed_len_byte_array_12 as _INTERVAL_fixed_len_byte_array_12, " + + " t._INT96_RAW as _INT96_RAW " + + " from " + + " cp.`parquet/parquet_logical_types_simple.parquet` t " + + " order by t.rowKey " + ); + String[] columns = { + "rowKey " , + "_UTF8" , + "_Enum" , + "_INT32_RAW" , + "_INT_8" , + "_INT_16" , + "_INT_32" , + "_UINT_8" , + "_UINT_16" , + "_UINT_32" , + "_INT64_RAW" , + 
"_INT_64" , + "_UINT_64" , + "_DATE_int32" , + "_TIME_MILLIS_int32" , + "_TIMESTAMP_MILLIS_int64" , + "_INTERVAL_fixed_len_byte_array_12" , + "_INT96_RAW" + + }; + testBuilder() + .sqlQuery(query) + .ordered() + .baselineColumns(columns) + .baselineValues(1, "UTF8 string1", "RANDOM_VALUE", 1234567, 123, 12345, 1234567, 123, 1234, 1234567, + 1234567890123456L, 1234567890123456L, 1234567890123456L, new DateTime("5350-02-17"), + new DateTime(1234567, DateTimeZone.UTC).withZoneRetainFields(DateTimeZone.getDefault()), + new DateTime("1973-11-29T21:33:09.012"), + new Period().plusMonths(875770417).plusDays(943142453).plusMillis(1650536505), + bytes12) + .baselineValues(2, "UTF8 string2", "MAX_VALUE", 2147483647, 127, 32767, 2147483647, 255, 65535, -1, + 9223372036854775807L, 9223372036854775807L, -1L, new DateTime("1969-12-31"), + new DateTime(0xFFFFFFFF, DateTimeZone.UTC).withZoneRetainFields(DateTimeZone.getDefault()), + new DateTime("2038-01-19T03:14:07.999"), + new Period().plusMonths(16843009).plusDays(16843009).plusMillis(16843009), + bytesOnes) + .baselineValues(3, "UTF8 string3", "MIN_VALUE", -2147483648, -128, -32768, -2147483648, 0, 0, 0, + -9223372036854775808L, -9223372036854775808L, 0L, new DateTime("1970-01-01"), + new DateTime(0, DateTimeZone.UTC).withZoneRetainFields(DateTimeZone.getDefault()), + new DateTime("1970-01-01T00:00:00.0"), new Period("PT0S"), bytesZeros) + .build() + .run(); + } + + @Test //DRILL-5971 + public void testLogicalIntTypes3() throws Exception { + byte[] bytes12 = {'1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'a', 'b' }; + byte[] bytesOnes = new byte[12]; Arrays.fill(bytesOnes, (byte)1); + byte[] bytesZeros = new byte[12]; + String query = String.format( + " select " + + " t.rowKey as rowKey, " + + " t._UTF8 as _UTF8, " + + " t._Enum as _Enum, " + + " t._INT32_RAW as _INT32_RAW, " + + " t._INT_8 as _INT_8, " + + " t._INT_16 as _INT_16, " + + " t._INT_32 as _INT_32, " + + " t._UINT_8 as _UINT_8, " + + " t._UINT_16 as _UINT_16, " 
+ + " t._UINT_32 as _UINT_32, " + + " t._INT64_RAW as _INT64_RAW, " + + " t._INT_64 as _INT_64, " + + " t._UINT_64 as _UINT_64, " + + " t._DATE_int32 as _DATE_int32, " + + " t._TIME_MILLIS_int32 as _TIME_MILLIS_int32, " + + " t._TIMESTAMP_MILLIS_int64 as _TIMESTAMP_MILLIS_int64, " + + " t._INTERVAL_fixed_len_byte_array_12 as _INTERVAL_fixed_len_byte_array_12, " + + " t._INT96_RAW as _INT96_RAW " + + " from " + + " cp.`parquet/parquet_logical_types_simple_nullable.parquet` t " + + " order by t.rowKey " + ); + String[] columns = { + "rowKey " , + "_UTF8" , + "_Enum" , + "_INT32_RAW" , + "_INT_8" , + "_INT_16" , + "_INT_32" , + "_UINT_8" , + "_UINT_16" , + "_UINT_32" , + "_INT64_RAW" , + "_INT_64" , + "_UINT_64" , + "_DATE_int32" , + "_TIME_MILLIS_int32" , + "_TIMESTAMP_MILLIS_int64" , + "_INTERVAL_fixed_len_byte_array_12" , + "_INT96_RAW" + + }; + testBuilder() + .sqlQuery(query) + .ordered() + .baselineColumns(columns) + .baselineValues(1, "UTF8 string1", "RANDOM_VALUE", 1234567, 123, 12345, 1234567, 123, 1234, 1234567, + 1234567890123456L, 1234567890123456L, 1234567890123456L, new DateTime("5350-02-17"), + new DateTime(1234567, DateTimeZone.UTC).withZoneRetainFields(DateTimeZone.getDefault()), + new DateTime("1973-11-29T21:33:09.012"), + new Period().plusMonths(875770417).plusDays(943142453).plusMillis(1650536505), + bytes12) + .baselineValues(2, "UTF8 string2", "MAX_VALUE", 2147483647, 127, 32767, 2147483647, 255, 65535, -1, + 9223372036854775807L, 9223372036854775807L, -1L, new DateTime("1969-12-31"), + new DateTime(0xFFFFFFFF, DateTimeZone.UTC).withZoneRetainFields(DateTimeZone.getDefault()), + new DateTime("2038-01-19T03:14:07.999"), + new Period().plusMonths(16843009).plusDays(16843009).plusMillis(16843009), + bytesOnes) + .baselineValues(3, "UTF8 string3", "MIN_VALUE", -2147483648, -128, -32768, -2147483648, 0, 0, 0, + -9223372036854775808L, -9223372036854775808L, 0L, new DateTime("1970-01-01"), + new DateTime(0, 
DateTimeZone.UTC).withZoneRetainFields(DateTimeZone.getDefault()), + new DateTime("1970-01-01T00:00:00.0"), new Period("PT0S"), bytesZeros) + .baselineValues(4, null, null, null, null, null, null, null, null, null, null, null, null, null, + null, null, null, null) + .build().run(); + } + } http://git-wip-us.apache.org/repos/asf/drill/blob/90eb23ba/exec/java-exec/src/test/resources/parquet/logical_int.parquet ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/resources/parquet/logical_int.parquet b/exec/java-exec/src/test/resources/parquet/logical_int.parquet new file mode 100644 index 0000000..288150c Binary files /dev/null and b/exec/java-exec/src/test/resources/parquet/logical_int.parquet differ http://git-wip-us.apache.org/repos/asf/drill/blob/90eb23ba/exec/java-exec/src/test/resources/parquet/parquet_logical_types_simple.parquet ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/resources/parquet/parquet_logical_types_simple.parquet b/exec/java-exec/src/test/resources/parquet/parquet_logical_types_simple.parquet new file mode 100644 index 0000000..edae0e5 Binary files /dev/null and b/exec/java-exec/src/test/resources/parquet/parquet_logical_types_simple.parquet differ http://git-wip-us.apache.org/repos/asf/drill/blob/90eb23ba/exec/java-exec/src/test/resources/parquet/parquet_logical_types_simple_nullable.parquet ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/resources/parquet/parquet_logical_types_simple_nullable.parquet b/exec/java-exec/src/test/resources/parquet/parquet_logical_types_simple_nullable.parquet new file mode 100644 index 0000000..4c97da9 Binary files /dev/null and b/exec/java-exec/src/test/resources/parquet/parquet_logical_types_simple_nullable.parquet differ 
http://git-wip-us.apache.org/repos/asf/drill/blob/90eb23ba/exec/java-exec/src/test/resources/store/parquet/complex/logical_int_complex.parquet ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/resources/store/parquet/complex/logical_int_complex.parquet b/exec/java-exec/src/test/resources/store/parquet/complex/logical_int_complex.parquet new file mode 100644 index 0000000..5d73799 Binary files /dev/null and b/exec/java-exec/src/test/resources/store/parquet/complex/logical_int_complex.parquet differ http://git-wip-us.apache.org/repos/asf/drill/blob/90eb23ba/exec/java-exec/src/test/resources/store/parquet/complex/parquet_logical_types_complex.parquet ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/resources/store/parquet/complex/parquet_logical_types_complex.parquet b/exec/java-exec/src/test/resources/store/parquet/complex/parquet_logical_types_complex.parquet new file mode 100644 index 0000000..ec797e0 Binary files /dev/null and b/exec/java-exec/src/test/resources/store/parquet/complex/parquet_logical_types_complex.parquet differ http://git-wip-us.apache.org/repos/asf/drill/blob/90eb23ba/exec/java-exec/src/test/resources/store/parquet/complex/parquet_logical_types_complex_nullable.parquet ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/resources/store/parquet/complex/parquet_logical_types_complex_nullable.parquet b/exec/java-exec/src/test/resources/store/parquet/complex/parquet_logical_types_complex_nullable.parquet new file mode 100644 index 0000000..59b7a75 Binary files /dev/null and b/exec/java-exec/src/test/resources/store/parquet/complex/parquet_logical_types_complex_nullable.parquet differ