>From Preetham Poluparthi <[email protected]>: Preetham Poluparthi has uploaded this change for review. ( https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/21139?usp=email )
Change subject: [ASTERIXDB-3755][EXT] Improve parquet row group filters while querying external collections ...................................................................... [ASTERIXDB-3755][EXT] Improve parquet row group filters while querying external collections - user model changes: no - storage format changes: no - interface changes: no Ext-ref: MB-70714 Change-Id: I04c20ddb68b9bee8e3b92d4f90796bc57c965840 --- A asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-nested-fields/parquet-nested-fields.01.ddl.sqlpp A asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-nested-fields/parquet-nested-fields.02.update.sqlpp A asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-nested-fields/parquet-nested-fields.03.ddl.sqlpp A asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-nested-fields/parquet-nested-fields.04.query.sqlpp A asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-nested-fields/parquet-nested-fields.05.query.sqlpp A asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-nested-fields/parquet-nested-fields.04.adm A asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-nested-fields/parquet-nested-fields.05.adm M asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml M asterixdb/asterix-metadata/src/main/java/org/apache/asterix/metadata/utils/filter/ParquetFilterBuilder.java 9 files changed, 258 insertions(+), 13 deletions(-) git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/39/21139/1 diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-nested-fields/parquet-nested-fields.01.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-nested-fields/parquet-nested-fields.01.ddl.sqlpp new file mode 100644 index 0000000..abff4f0 --- /dev/null 
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-nested-fields/parquet-nested-fields.01.ddl.sqlpp @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +DROP DATAVERSE test if exists; +CREATE DATAVERSE test; +USE test; + +CREATE TYPE ColumnType2 AS { +}; + + diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-nested-fields/parquet-nested-fields.02.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-nested-fields/parquet-nested-fields.02.update.sqlpp new file mode 100644 index 0000000..28d6def --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-nested-fields/parquet-nested-fields.02.update.sqlpp @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +USE test; + + +COPY ( +select c.* + from + [ + { + "id" : 1, + "name" : "Alice", + "address" : { "street" : "100 MG Road", "city" : "Bangalore", "zip" : "560001" }, + "phones" : ["9876543210", "9123456780"], + "orders" : [ + { "order_id" : 101, "amount" : 250.0, "items" : ["laptop", "mouse"] }, + { "order_id" : 102, "amount" : 30.5, "items" : ["keyboard"] } + ] + }, + { + "id" : 2, + "name" : "Bob", + "address" : { "street" : "200 Wall St", "city" : "New York", "zip" : "10005" }, + "phones" : ["2125551234"], + "orders" : [ + { "order_id" : 201, "amount" : 99.99, "items" : ["monitor"] } + ] + }, + { + "id" : 3, + "name" : "Charlie", + "address" : { "street" : "50 Indiranagar", "city" : "Bangalore", "zip" : "560038" }, + "phones" : [], + "orders" : [ + { "order_id" : 301, "amount" : 500.0, "items" : ["phone", "case", "charger"] }, + { "order_id" : 302, "amount" : 15.0, "items" : ["cable"] }, + { "order_id" : 303, "amount" : 75.0, "items" : ["headphones", "adapter"] } + ] + }, + { + "id" : 4, + "name" : "Diana", + "address" : { "street" : "10 Downing St", "city" : "London", "zip" : "SW1A 2AA" }, + "phones" : ["4420712345", "4420798765"], + "orders" : [] + } + ] c + + + ) toWriter +TO %adapter% +PATH (%pathprefix% "copy-to-result", "parquet-nested-fields") +WITH { + %template_colons%, + %additionalProperties% + "format":"parquet", + "version" : "2" +}; \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-nested-fields/parquet-nested-fields.03.ddl.sqlpp 
b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-nested-fields/parquet-nested-fields.03.ddl.sqlpp new file mode 100644 index 0000000..826d052 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-nested-fields/parquet-nested-fields.03.ddl.sqlpp @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +USE test; + + +CREATE EXTERNAL DATASET DatasetCopy(ColumnType2) USING %adapter% +( + %template%, + %additional_Properties% + ("definition"="%path_prefix%copy-to-result/parquet-nested-fields"), + ("format" = "parquet"), + ("requireVersionChangeDetection"="false"), + ("include"="*.parquet") +); \ No newline at end of file diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-nested-fields/parquet-nested-fields.04.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-nested-fields/parquet-nested-fields.04.query.sqlpp new file mode 100644 index 0000000..5707636 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-nested-fields/parquet-nested-fields.04.query.sqlpp @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +USE test; + + +/* + * This test is to make sure + * row group filters are applied correctly while reading parquet files. 
+ * Logs should not contain "Error creating Parquet row-group filter expression" + */ + +SELECT c.* +FROM DatasetCopy c +where c.address.city = "Bangalore" +and "560001" = c.zip +order by c.id ; + diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-nested-fields/parquet-nested-fields.05.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-nested-fields/parquet-nested-fields.05.query.sqlpp new file mode 100644 index 0000000..06c297a --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-nested-fields/parquet-nested-fields.05.query.sqlpp @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +USE test; + + +/* + * This test is to make sure + * row group filters are applied correctly while reading parquet files. 
+ * Logs should not contain "Error creating Parquet row-group filter expression" + * + */ + + + +SELECT c.id, c.name , o.order_id, o.amount, o.items +FROM DatasetCopy c +UNNEST c.orders o +where c.address.city = "Bangalore" and o.order_id = 101 +order by c.id , o.order_id; + diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-nested-fields/parquet-nested-fields.04.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-nested-fields/parquet-nested-fields.04.adm new file mode 100644 index 0000000..5a1508e --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-nested-fields/parquet-nested-fields.04.adm @@ -0,0 +1,2 @@ +{ "address": { "zip": "560001", "city": "Bangalore", "street": "100 MG Road" }, "name": "Alice", "phones": [ "9876543210", "9123456780" ], "orders": [ { "amount": 250.0, "order_id": 101, "items": [ "laptop", "mouse" ] }, { "amount": 30.5, "order_id": 102, "items": [ "keyboard" ] } ], "id": 1 } +{ "address": { "zip": "560038", "city": "Bangalore", "street": "50 Indiranagar" }, "name": "Charlie", "phones": [ ], "orders": [ { "amount": 500.0, "order_id": 301, "items": [ "phone", "case", "charger" ] }, { "amount": 15.0, "order_id": 302, "items": [ "cable" ] }, { "amount": 75.0, "order_id": 303, "items": [ "headphones", "adapter" ] } ], "id": 3 } diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-nested-fields/parquet-nested-fields.05.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-nested-fields/parquet-nested-fields.05.adm new file mode 100644 index 0000000..0a463a2 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-nested-fields/parquet-nested-fields.05.adm @@ -0,0 +1 @@ +{ "id": 1, "name": "Alice", "order_id": 101, "amount": 250.0, "items": [ "laptop", "mouse" ] } \ No newline at end of file diff --git 
a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml index 09743fe..6023fd7 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml +++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml @@ -80,6 +80,16 @@ </compilation-unit> </test-case> <test-case FilePath="copy-to"> + <compilation-unit name="parquet-nested-fields"> + <placeholder name="adapter" value="S3" /> + <placeholder name="pathprefix" value="" /> + <placeholder name="path_prefix" value="" /> + <placeholder name="additionalProperties" value='"container":"playground",' /> + <placeholder name="additional_Properties" value='("container"="playground"),' /> + <output-dir compare="Text">parquet-nested-fields</output-dir> + </compilation-unit> + </test-case> + <test-case FilePath="copy-to"> <compilation-unit name="parquet-null1"> <placeholder name="adapter" value="S3" /> <placeholder name="pathprefix" value="" /> diff --git a/asterixdb/asterix-metadata/src/main/java/org/apache/asterix/metadata/utils/filter/ParquetFilterBuilder.java b/asterixdb/asterix-metadata/src/main/java/org/apache/asterix/metadata/utils/filter/ParquetFilterBuilder.java index 6cdef59..0130756 100644 --- a/asterixdb/asterix-metadata/src/main/java/org/apache/asterix/metadata/utils/filter/ParquetFilterBuilder.java +++ b/asterixdb/asterix-metadata/src/main/java/org/apache/asterix/metadata/utils/filter/ParquetFilterBuilder.java @@ -74,9 +74,20 @@ return parquetFilterPredicate; } - private FilterPredicate createComparisonExpression(ILogicalExpression columnName, ILogicalExpression constValue, + private FilterPredicate createComparisonExpression(ILogicalExpression arg1, ILogicalExpression arg2, FunctionIdentifier fid) throws AlgebricksException { - ConstantExpression constExpr = (ConstantExpression) constValue; + ILogicalExpression columnName; + 
ConstantExpression constExpr; + if (arg1.getExpressionTag().equals(LogicalExpressionTag.CONSTANT)) { + constExpr = (ConstantExpression) arg1; + columnName = arg2; + } else if (arg2.getExpressionTag().equals(LogicalExpressionTag.CONSTANT)) { + constExpr = (ConstantExpression) arg2; + columnName = constExpr; + } else { + throw new RuntimeException("Unsupported filter expression type"); + } + if (constExpr.getValue().isNull() || constExpr.getValue().isMissing()) { throw new RuntimeException("Unsupported literal type: " + constExpr.getValue()); } @@ -136,7 +147,7 @@ } List<Mutable<ILogicalExpression>> args = funcExpr.getArguments(); if (fid.equals(AlgebricksBuiltinFunctions.AND) || fid.equals(AlgebricksBuiltinFunctions.OR)) { - return createAndOrPredicate(fid, args, 0); + return createAndOrPredicate(fid, args, 0, args.size()); } else { return createComparisonExpression(args.get(0).getValue(), args.get(1).getValue(), fid); } @@ -192,23 +203,40 @@ // Converts or(pred1, pred2, pred3) to or(pred1, or(pred2, pred3)) private FilterPredicate createAndOrPredicate(FunctionIdentifier function, List<Mutable<ILogicalExpression>> args, - int index) throws AlgebricksException { - if (index == args.size() - 2) { + int leftInclusive, int rightExclusive) throws AlgebricksException { + if (rightExclusive - leftInclusive == 1) { + return createLeafFilterPredicate(args.get(leftInclusive)); + } else if (rightExclusive - leftInclusive == 2) { + FilterPredicate left = createLeafFilterPredicate(args.get(leftInclusive)), + right = createLeafFilterPredicate(args.get(leftInclusive + 1)); if (function.equals(AlgebricksBuiltinFunctions.AND)) { - return FilterApi.and(createFilterExpression(args.get(0).getValue()), - createFilterExpression(args.get(1).getValue())); + return FilterApi.and(left, right); } else { - return FilterApi.or(createFilterExpression(args.get(0).getValue()), - createFilterExpression(args.get(1).getValue())); + return FilterApi.or(left, right); } } else { + int middle = 
(leftInclusive + rightExclusive) / 2; + FilterPredicate left = createAndOrPredicate(function, args, leftInclusive, middle), + right = createAndOrPredicate(function, args, middle, rightExclusive); if (function.equals(AlgebricksBuiltinFunctions.AND)) { - return FilterApi.and(createFilterExpression(args.get(index).getValue()), - createAndOrPredicate(function, args, index + 1)); + return FilterApi.and(left, right); } else { - return FilterApi.or(createFilterExpression(args.get(index).getValue()), - createAndOrPredicate(function, args, index + 1)); + return FilterApi.or(left, right); } } } + + private FilterPredicate createLeafFilterPredicate(Mutable<ILogicalExpression> expression) + throws AlgebricksException { + if (expression.get().getExpressionTag() == LogicalExpressionTag.FUNCTION_CALL) { + AbstractFunctionCallExpression functionCall = (AbstractFunctionCallExpression) expression.get(); + if (functionCall.getArguments().size() != 2) { + throw new RuntimeException("Error creating Filter for functions with arguments size other than 2"); + } + return createComparisonExpression(functionCall.getArguments().get(0).get(), + functionCall.getArguments().get(1).get(), functionCall.getFunctionIdentifier()); + } else { + throw new RuntimeException("Unsupported expression: " + expression.get()); + } + } } -- To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/21139?usp=email To unsubscribe, or for help writing mail filters, visit https://asterix-gerrit.ics.uci.edu/settings?usp=email Gerrit-MessageType: newchange Gerrit-Project: asterixdb Gerrit-Branch: lumina Gerrit-Change-Id: I04c20ddb68b9bee8e3b92d4f90796bc57c965840 Gerrit-Change-Number: 21139 Gerrit-PatchSet: 1 Gerrit-Owner: Preetham Poluparthi <[email protected]>
