>From <[email protected]>: [email protected] has uploaded this change for review. ( https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/19559 )
Change subject: [ASTERIXDB-3392] Add type hierarchy in schema inference for copy to parquet ...................................................................... [ASTERIXDB-3392] Add type hierarchy in schema inference for copy to parquet Details: Implemented a numeric type hierarchy: double > float > int64 > int32. Schema inference will now assign the widest numeric type encountered for a given field. Ext-ref: MB-65895 Change-Id: Iabc5e2d8e68bf17f4c080ddaf6cc41141a0c3345 --- M asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/SchemaCheckerLazyVisitor.java M asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaTree.java M asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml A asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.05.adm A asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.05.query.sqlpp A asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.01.ddl.sqlpp A asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.03.update.sqlpp A asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.02.update.sqlpp M asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/AsterixParquetTypeMap.java M asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaLazyVisitor.java A asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.04.update.sqlpp 11 files changed, 242 insertions(+), 18 deletions(-) git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/59/19559/1 diff 
--git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.01.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.01.ddl.sqlpp new file mode 100644 index 0000000..6ab6331 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.01.ddl.sqlpp @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +DROP DATAVERSE test if exists; +CREATE DATAVERSE test; +USE test; + +CREATE TYPE ColumnType1 AS { + id: int + }; + + +CREATE COLLECTION TestCollection(ColumnType1) PRIMARY KEY id; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.02.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.02.update.sqlpp new file mode 100644 index 0000000..9a6f3c4 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.02.update.sqlpp @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +use test; + + +insert into TestCollection({"id":2}); +insert into TestCollection({"id":5,"rating" :1 , "ratings" : [] }); +insert into TestCollection({"id":8,"rating" :2 , "ratings" : [ 1 ] }); +insert into TestCollection({"id":10,"rating" :3.0 , "ratings" : [ 1, 2, 3] }); +insert into TestCollection({"id":12,"rating" :4.3 , "ratings" : [1,2,3.0,4,5]}); +insert into TestCollection({"id":15,"rating" :4.7 , "ratings" : [1.0,2.0,3.0,4.0,5.0]}); +insert into TestCollection({"id":17,"rating" :4.22222 , "ratings" : [1.1111,2.222222,3.3333,4.44444,5.555555]}); +insert into TestCollection({"id":20,"rating" :5.455555555 , "ratings" : [1,2,3,4,5]}); +insert into TestCollection({"id":21,"rating" :1 , "ratings" : [0.0,6.7]}); +insert into TestCollection({"id":27,"rating" :8 , "ratings" : [1]}); +insert into TestCollection({"id":28,"rating" :3 , "ratings" : []}); + + + + + + diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.03.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.03.update.sqlpp new file mode 100644 index 0000000..b4a7b18 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.03.update.sqlpp @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +USE test; + +COPY ( +select c.* from TestCollection c + ) toWriter +TO %adapter% +PATH (%pathprefix% "copy-to-result", "parquet-type-hierarchy") +WITH { + %template_colons%, + %additionalProperties% + "format":"parquet", + "max-schemas" : "1" + }; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.04.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.04.update.sqlpp new file mode 100644 index 0000000..dad1090 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.04.update.sqlpp @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +USE test; + + +CREATE TYPE ColumnType2 AS { + }; + + + +CREATE EXTERNAL DATASET TestDataset(ColumnType2) USING %adapter% +( + %template%, + %additional_Properties%, + ("definition"="%path_prefix%copy-to-result/parquet-type-hierarchy/"), + ("include"="*.parquet"), + ("requireVersionChangeDetection"="false"), + ("format" = "parquet") +); \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.05.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.05.query.sqlpp new file mode 100644 index 0000000..3d26c18 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.05.query.sqlpp @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +USE test; + + +SELECT c.* +FROM TestDataset c +ORDER BY c.id; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.05.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.05.adm new file mode 100644 index 0000000..5c4c334 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.05.adm @@ -0,0 +1,11 @@ +{ "id": 2 } +{ "ratings": [ ], "rating": 1.0, "id": 5 } +{ "ratings": [ 1 ], "rating": 2, "id": 8 } +{ "ratings": [ 1, 2, 3 ], "rating": 3, "id": 10 } +{ "ratings": [ 1.0, 2.0, 3.0, 4.0, 5.0 ], "rating": 4.3, "id": 12 } +{ "ratings": [ 1.0, 2.0, 3.0, 4.0, 5.0 ], "rating": 4.7, "id": 15 } +{ "ratings": [ 1.1111, 2.222222, 3.3333, 4.44444, 5.555555 ], "rating": 4.22222, "id": 17 } +{ "ratings": [ 1.0, 2.0, 3.0, 4.0, 5.0 ], "rating": 5.455555555, "id": 20 } +{ "ratings": [ 0.0, 6.7 ], "rating": 1.0, "id": 21 } +{ "ratings": [ 1.0 ], "rating": 8.0, "id": 27 } +{ "ratings": [ ], "rating": 3, "id": 28 } diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml index a76f9de..3851bc0 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml +++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml @@ -125,6 +125,16 @@ </compilation-unit> </test-case> <test-case FilePath="copy-to"> + <compilation-unit name="parquet-type-hierarchy"> + <placeholder name="adapter" value="S3" /> + <placeholder name="pathprefix" value="" /> + <placeholder name="path_prefix" value="" /> + <placeholder name="additionalProperties" value='"container":"playground",' /> + <placeholder name="additional_Properties" value='("container"="playground")' /> + 
<output-dir compare="Text">parquet-type-hierarchy</output-dir> + </compilation-unit> + </test-case> + <test-case FilePath="copy-to"> <compilation-unit name="empty-path"> <output-dir compare="Text">empty-path</output-dir> </compilation-unit> diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/AsterixParquetTypeMap.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/AsterixParquetTypeMap.java index 1eb8e84..6bdc586 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/AsterixParquetTypeMap.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/AsterixParquetTypeMap.java @@ -26,6 +26,14 @@ public class AsterixParquetTypeMap { + public static final Map<ATypeTag, Integer> HIERARCHIAL_TYPES = Map.ofEntries(Map.entry(ATypeTag.TINYINT, 1), + Map.entry(ATypeTag.SMALLINT, 1), Map.entry(ATypeTag.INTEGER, 1), Map.entry(ATypeTag.BIGINT, 2), + Map.entry(ATypeTag.FLOAT, 3), Map.entry(ATypeTag.DOUBLE, 4)); + + public static ATypeTag maxHierarchicalType(ATypeTag a, ATypeTag b) { + return HIERARCHIAL_TYPES.get(a) > HIERARCHIAL_TYPES.get(b) ? 
a : b; + } + public static final Map<ATypeTag, PrimitiveType.PrimitiveTypeName> PRIMITIVE_TYPE_NAME_MAP = Map.ofEntries(Map.entry(ATypeTag.BOOLEAN, PrimitiveType.PrimitiveTypeName.BOOLEAN), Map.entry(ATypeTag.STRING, PrimitiveType.PrimitiveTypeName.BINARY), diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaLazyVisitor.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaLazyVisitor.java index b591175..39e4e74 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaLazyVisitor.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaLazyVisitor.java @@ -109,21 +109,20 @@ if (!AsterixParquetTypeMap.PRIMITIVE_TYPE_NAME_MAP.containsKey(pointable.getTypeTag())) { throw RuntimeDataException.create(TYPE_UNSUPPORTED_PARQUET_WRITE, pointable.getTypeTag()); } - schemaNode.setType(new ParquetSchemaTree.FlatType( - AsterixParquetTypeMap.PRIMITIVE_TYPE_NAME_MAP.get(pointable.getTypeTag()), - AsterixParquetTypeMap.LOGICAL_TYPE_ANNOTATION_MAP.get(pointable.getTypeTag()))); + schemaNode.setType(new ParquetSchemaTree.FlatType(pointable.getTypeTag())); return null; } if (!(schemaNode.getType() instanceof ParquetSchemaTree.FlatType)) { throw new HyracksDataException(ErrorCode.RESULT_DOES_NOT_FOLLOW_SCHEMA); } ParquetSchemaTree.FlatType flatType = (ParquetSchemaTree.FlatType) schemaNode.getType(); - if (!(flatType.getPrimitiveTypeName() == AsterixParquetTypeMap.PRIMITIVE_TYPE_NAME_MAP - .get(pointable.getTypeTag())) - || !(flatType.getLogicalTypeAnnotation() == AsterixParquetTypeMap.LOGICAL_TYPE_ANNOTATION_MAP - .get(pointable.getTypeTag()))) { + + if (!flatType.isCompatibleWith(pointable.getTypeTag())) { throw new HyracksDataException(ErrorCode.RESULT_DOES_NOT_FOLLOW_SCHEMA); } + + 
flatType.coalesce(pointable.getTypeTag()); + return null; } diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaTree.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaTree.java index ff512df..35d8e22 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaTree.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaTree.java @@ -25,6 +25,7 @@ import java.util.HashMap; import java.util.Map; +import org.apache.asterix.om.types.ATypeTag; import org.apache.hyracks.api.exceptions.ErrorCode; import org.apache.hyracks.api.exceptions.HyracksDataException; import org.apache.parquet.schema.LogicalTypeAnnotation; @@ -73,21 +74,44 @@ } static class FlatType extends AbstractType { - private final PrimitiveType.PrimitiveTypeName primitiveTypeName; - private final LogicalTypeAnnotation logicalTypeAnnotation; + // private final PrimitiveType.PrimitiveTypeName primitiveTypeName; + // private final LogicalTypeAnnotation logicalTypeAnnotation; + // + // public FlatType(PrimitiveType.PrimitiveTypeName primitiveTypeName, + // LogicalTypeAnnotation logicalTypeAnnotation) { + // this.primitiveTypeName = primitiveTypeName; + // this.logicalTypeAnnotation = logicalTypeAnnotation; + // } - public FlatType(PrimitiveType.PrimitiveTypeName primitiveTypeName, - LogicalTypeAnnotation logicalTypeAnnotation) { - this.primitiveTypeName = primitiveTypeName; - this.logicalTypeAnnotation = logicalTypeAnnotation; + private ATypeTag typeTag; + private boolean isHierarchical; + + public FlatType(ATypeTag typeTag) { + this.typeTag = typeTag; + isHierarchical = AsterixParquetTypeMap.HIERARCHIAL_TYPES.containsKey(typeTag); + } + + public boolean isCompatibleWith(ATypeTag typeTag) { + if (isHierarchical) { + return 
AsterixParquetTypeMap.HIERARCHIAL_TYPES.containsKey(typeTag); + } else { + return this.typeTag == typeTag; + } + } + + public void coalesce(ATypeTag typeTag) { + if (!isCompatibleWith(typeTag) || !isHierarchical) { + return; + } + this.typeTag = AsterixParquetTypeMap.maxHierarchicalType(this.typeTag, typeTag); } public LogicalTypeAnnotation getLogicalTypeAnnotation() { - return logicalTypeAnnotation; + return AsterixParquetTypeMap.LOGICAL_TYPE_ANNOTATION_MAP.get(typeTag); } public PrimitiveType.PrimitiveTypeName getPrimitiveTypeName() { - return primitiveTypeName; + return AsterixParquetTypeMap.PRIMITIVE_TYPE_NAME_MAP.get(typeTag); } } diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/SchemaCheckerLazyVisitor.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/SchemaCheckerLazyVisitor.java index 44cd5b2..d9fff6d 100644 --- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/SchemaCheckerLazyVisitor.java +++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/SchemaCheckerLazyVisitor.java @@ -113,9 +113,7 @@ ParquetSchemaTree.FlatType flatType = (ParquetSchemaTree.FlatType) schemaNode.getType(); - if (flatType.getPrimitiveTypeName() != AsterixParquetTypeMap.PRIMITIVE_TYPE_NAME_MAP.get(pointable.getTypeTag()) - || !(flatType.getLogicalTypeAnnotation() == AsterixParquetTypeMap.LOGICAL_TYPE_ANNOTATION_MAP - .get(pointable.getTypeTag()))) { + if (!flatType.isCompatibleWith(pointable.getTypeTag())) { return ISchemaChecker.SchemaComparisonType.CONFLICTING; } -- To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/19559 To unsubscribe, or for help writing mail filters, visit https://asterix-gerrit.ics.uci.edu/settings Gerrit-Project: asterixdb Gerrit-Branch: ionic Gerrit-Change-Id: Iabc5e2d8e68bf17f4c080ddaf6cc41141a0c3345 Gerrit-Change-Number: 19559 
Gerrit-PatchSet: 1 Gerrit-Owner: [email protected] Gerrit-MessageType: newchange
