[NO ISSUE][COMP][RT] Enable multiway similarity joins - Enable the FuzzyJoinRule that transforms a nested-loop-similarity-join plan to a three-stage-similarity join. - Modify FuzzyJoinRuleCollections. - Add the ExtractCommonExpressionRule to extract common expressions in the star-like multiple similarity join substitutions. - Add the InlineSubplanInputForNestedTupleSourceRule to translate the generated subplan from the similarity function-derived substitution into join in case of nested schemas. - Use similarity-jaccard-prefix to enable the pp+ join strategy. - Use the right side to build the heavy hash join on the prefix tokens from both sides. - Add RemoveAssign/Variables/AggRules to iteratively remove unused assign/vars once FuzzyJoinRule is applied in each round. - Add three new optimization cases for multiway similarity joins. - link-like multiway similarity joins - star-like multiway similarity joins - hybrid multiway similarity joins with the both styles of similarity joins. - Add a check whether a similarity function is on a select over an existing similarity join. - Change the inverted-index-based similarity join to the three-stage-similarity join due to efficiency considerations.
Change-Id: I8736f104905eeda763d39709e002c2b9629278cc Reviewed-on: https://asterix-gerrit.ics.uci.edu/1076 Sonar-Qube: Jenkins <jenk...@fulliautomatix.ics.uci.edu> Tested-by: Jenkins <jenk...@fulliautomatix.ics.uci.edu> Contrib: Jenkins <jenk...@fulliautomatix.ics.uci.edu> Integration-Tests: Jenkins <jenk...@fulliautomatix.ics.uci.edu> Reviewed-by: Dmitry Lychagin <dmitry.lycha...@couchbase.com> Reviewed-by: Taewoo Kim <wangs...@gmail.com> Project: http://git-wip-us.apache.org/repos/asf/asterixdb/repo Commit: http://git-wip-us.apache.org/repos/asf/asterixdb/commit/d906bd89 Tree: http://git-wip-us.apache.org/repos/asf/asterixdb/tree/d906bd89 Diff: http://git-wip-us.apache.org/repos/asf/asterixdb/diff/d906bd89 Branch: refs/heads/master Commit: d906bd89e48351156262c1c22096d23269bf0f0a Parents: e7fa4b3 Author: Taewoo Kim <wangs...@yahoo.com> Authored: Mon Oct 8 18:01:23 2018 -0700 Committer: Taewoo Kim <wangs...@gmail.com> Committed: Tue Oct 9 17:58:47 2018 -0700 ---------------------------------------------------------------------- .../provider/DefaultRuleSetFactory.java | 8 +- .../asterix/optimizer/base/FuzzyUtils.java | 12 + .../asterix/optimizer/base/RuleCollections.java | 12 +- .../asterix/optimizer/rules/FuzzyJoinRule.java | 508 ++-- .../data/dblp-small/csx-small-multi-id.txt | 100 + .../data/dblp-small/dblp-small-multi-id.txt | 100 + .../data/pub-small/csx-small-multi-id.txt | 100 + .../asterix-app/data/pub-small/csxauthors.adm | 196 ++ .../data/pub-small/dblp-small-multi-id.txt | 100 + .../asterix-app/data/pub-small/dblpauthors.adm | 194 ++ .../src/test/resources/optimizerts/ignore.txt | 1 - .../optimizerts/queries/fj-dblp-csx-hybrid.aql | 69 + .../queries/fj-dblp-csx-selflink.aql | 66 + .../optimizerts/queries/fj-dblp-csx-simple.aql | 54 + .../optimizerts/queries/fj-dblp-csx-star.aql | 69 + .../optimizerts/queries/fj-dblp-csx.aql | 101 +- .../jaccard-similarity-join-dual-order.aql | 109 + .../jaccard-similarity-join-right-ahead.aql | 89 + .../optimizerts/results/fj-dblp-csx-hybrid.plan | 613 +++++ .../results/fj-dblp-csx-selflink.plan | 2436 +++++++++++++++++ .../optimizerts/results/fj-dblp-csx-simple.plan | 146 ++ .../optimizerts/results/fj-dblp-csx-star.plan | 2481 ++++++++++++++++++ .../optimizerts/results/fj-dblp-csx.plan | 134 + .../ngram-jaccard-inline.plan | 170 +- .../word-jaccard-inline.plan | 170 +- .../results/inverted-index-join/issue741.plan | 155 +- ...obe-pidx-with-join-jaccard-check-idx_01.plan | 166 +- .../ngram-fuzzyeq-jaccard_01.plan | 157 +- .../ngram-fuzzyeq-jaccard_02.plan | 157 +- .../ngram-fuzzyeq-jaccard_03.plan | 158 +- .../ngram-jaccard-check_01.plan | 157 +- .../ngram-jaccard-check_02.plan | 157 +- .../ngram-jaccard-check_03.plan | 158 +- .../ngram-jaccard-check_04.plan | 160 +- .../inverted-index-join/ngram-jaccard_01.plan | 157 +- .../inverted-index-join/ngram-jaccard_02.plan | 157 +- .../inverted-index-join/ngram-jaccard_03.plan | 158 +- .../inverted-index-join/ngram-jaccard_04.plan | 160 +- .../word-fuzzyeq-jaccard_01.plan | 157 +- .../word-fuzzyeq-jaccard_02.plan | 157 +- .../word-fuzzyeq-jaccard_03.plan | 158 +- .../word-jaccard-check-after-btree-access.plan | 178 +- .../word-jaccard-check_01.plan | 157 +- .../word-jaccard-check_02.plan | 157 +- .../word-jaccard-check_03.plan | 158 +- .../word-jaccard-check_04.plan | 160 +- .../inverted-index-join/word-jaccard_01.plan | 157 +- .../inverted-index-join/word-jaccard_02.plan | 157 +- .../inverted-index-join/word-jaccard_03.plan | 158 +- .../inverted-index-join/word-jaccard_04.plan | 160 +- .../ngram-jaccard-check-multi-let.plan | 2 +- .../word-jaccard-check-multi-let.plan | 2 +- ...obe-pidx-with-join-jaccard-check-idx_01.plan | 171 +- .../ngram-fuzzyeq-jaccard_01.plan | 164 +- .../ngram-jaccard-check_01.plan | 164 +- .../ngram-jaccard-inline.plan | 199 +- .../inverted-index-join/ngram-jaccard_01.plan | 164 +- .../olist-jaccard-inline.plan | 2 +- .../ulist-jaccard-inline.plan | 2 +- .../word-fuzzyeq-jaccard_01.plan | 164 +- .../word-jaccard-check-after-btree-access.plan | 202 +- .../word-jaccard-check_01.plan | 164 +- .../word-jaccard-inline.plan | 199 +- .../inverted-index-join/word-jaccard_01.plan | 164 +- .../ngram-jaccard-check-multi-let.plan | 2 +- .../word-jaccard-check-multi-let.plan | 2 +- .../ngram-fuzzyeq-jaccard_01.plan | 164 +- .../ngram-fuzzyeq-jaccard_02.plan | 164 +- .../ngram-fuzzyeq-jaccard_03.plan | 163 +- .../ngram-fuzzyeq-jaccard_04.plan | 164 +- .../ngram-jaccard-check_01.plan | 164 +- .../ngram-jaccard-check_02.plan | 164 +- .../ngram-jaccard-check_03.plan | 163 +- .../ngram-jaccard-check_04.plan | 164 +- .../ngram-jaccard-inline.plan | 199 +- .../inverted-index-join/ngram-jaccard_01.plan | 164 +- .../inverted-index-join/ngram-jaccard_02.plan | 164 +- .../inverted-index-join/ngram-jaccard_03.plan | 163 +- .../inverted-index-join/ngram-jaccard_04.plan | 164 +- .../word-fuzzyeq-jaccard_01.plan | 164 +- .../word-fuzzyeq-jaccard_02.plan | 164 +- .../word-fuzzyeq-jaccard_03.plan | 163 +- .../word-fuzzyeq-jaccard_04.plan | 164 +- .../word-jaccard-check-after-btree-access.plan | 202 +- .../word-jaccard-check_01.plan | 164 +- .../word-jaccard-check_02.plan | 164 +- .../word-jaccard-check_03.plan | 163 +- .../word-jaccard-check_04.plan | 164 +- .../word-jaccard-inline.plan | 199 +- .../inverted-index-join/word-jaccard_01.plan | 164 +- .../inverted-index-join/word-jaccard_02.plan | 164 +- .../inverted-index-join/word-jaccard_03.plan | 163 +- .../inverted-index-join/word-jaccard_04.plan | 164 +- .../ngram-fuzzyeq-jaccard_01.plan | 157 +- .../ngram-fuzzyeq-jaccard_02.plan | 157 +- .../ngram-fuzzyeq-jaccard_03.plan | 158 +- .../ngram-fuzzyeq-jaccard_04.plan | 157 +- .../ngram-jaccard-check_01.plan | 157 +- .../ngram-jaccard-check_02.plan | 157 +- .../ngram-jaccard-check_03.plan | 158 +- .../ngram-jaccard-check_04.plan | 157 +- .../ngram-jaccard-check_inline_03.plan | 160 +- .../inverted-index-join/ngram-jaccard_01.plan | 157 +- .../inverted-index-join/ngram-jaccard_02.plan | 157 +- .../inverted-index-join/ngram-jaccard_03.plan | 158 +- .../inverted-index-join/ngram-jaccard_04.plan | 157 +- .../ngram-jaccard_inline_03.plan | 160 +- .../word-fuzzyeq-jaccard_01.plan | 157 +- .../word-fuzzyeq-jaccard_02.plan | 157 +- .../word-fuzzyeq-jaccard_03.plan | 158 +- .../word-fuzzyeq-jaccard_04.plan | 157 +- .../word-jaccard-check-after-btree-access.plan | 178 +- .../word-jaccard-check_01.plan | 157 +- .../word-jaccard-check_02.plan | 157 +- .../word-jaccard-check_03.plan | 158 +- .../word-jaccard-check_04.plan | 157 +- .../word-jaccard-check_inline_03.plan | 160 +- .../inverted-index-join/word-jaccard_01.plan | 157 +- .../inverted-index-join/word-jaccard_02.plan | 157 +- .../inverted-index-join/word-jaccard_03.plan | 158 +- .../inverted-index-join/word-jaccard_04.plan | 157 +- .../word-jaccard_inline_03.plan | 160 +- .../jaccard-similarity-join-dual-order.plan | 191 ++ .../jaccard-similarity-join-right-ahead.plan | 128 + .../fuzzyjoin/basic-1_1/basic-1_1.1.ddl.aql | 40 + .../fuzzyjoin/basic-1_1/basic-1_1.2.update.aql | 34 + .../fuzzyjoin/basic-1_1/basic-1_1.3.query.aql | 81 + .../fuzzyjoin/basic-1_1_1/basic-1_1_1.1.ddl.aql | 31 + .../basic-1_1_1/basic-1_1_1.2.update.aql | 27 + .../basic-1_1_1/basic-1_1_1.3.query.aql | 61 + .../fuzzyjoin/basic-1_1_2/basic-1_1_2.1.ddl.aql | 23 + .../basic-1_1_2/basic-1_1_2.10.query.aql | 29 + .../basic-1_1_2/basic-1_1_2.2.update.aql | 18 + .../basic-1_1_2/basic-1_1_2.3.query.aql | 30 + .../basic-1_1_2/basic-1_1_2.4.query.aql | 30 + .../basic-1_1_2/basic-1_1_2.5.query.aql | 30 + .../basic-1_1_2/basic-1_1_2.6.query.aql | 30 + .../basic-1_1_2/basic-1_1_2.7.query.aql | 30 + .../basic-1_1_2/basic-1_1_2.8.query.aql | 30 + .../basic-1_1_2/basic-1_1_2.9.query.aql | 29 + .../fuzzyjoin/basic-1_1_3/basic-1_1_3.1.ddl.aql | 23 + .../basic-1_1_3/basic-1_1_3.2.update.aql | 18 + .../basic-1_1_3/basic-1_1_3.3.query.aql | 30 + .../basic-1_1_3/basic-1_1_3.4.query.aql | 38 + .../fuzzyjoin/basic-1_2_1/basic-1_2_1.1.ddl.aql | 31 + .../basic-1_2_1/basic-1_2_1.2.update.aql | 27 + .../basic-1_2_1/basic-1_2_1.3.query.aql | 123 + .../basic-1_2_1/basic-1_2_1.4.query.aql | 51 + .../basic-1_2_1/basic-1_2_1.5.query.aql | 63 + .../basic-1_2_1/basic-1_2_1.6.query.aql | 60 + .../basic-1_2_1/basic-1_2_1.7.query.aql | 46 + .../fuzzyjoin/basic-1_2_2/basic-1_2_2.1.ddl.aql | 31 + .../basic-1_2_2/basic-1_2_2.2.update.aql | 27 + .../basic-1_2_2/basic-1_2_2.3.query.aql | 77 + .../basic-1_2_2/basic-1_2_2.4.query.aql | 78 + .../basic-1_2_2/basic-1_2_2.5.query.aql | 79 + .../basic-1_2_2/basic-1_2_2.6.query.aql | 77 + .../fuzzyjoin/basic-1_2_3/basic-1_2_3.1.ddl.aql | 31 + .../basic-1_2_3/basic-1_2_3.2.update.aql | 27 + .../basic-1_2_3/basic-1_2_3.3.query.aql | 89 + .../fuzzyjoin/basic-1_2_4/basic-1_2_4.1.ddl.aql | 31 + .../basic-1_2_4/basic-1_2_4.2.update.aql | 27 + .../basic-1_2_4/basic-1_2_4.3.query.aql | 76 + .../fuzzyjoin/basic-1_2_5/basic-1_2_5.1.ddl.aql | 31 + .../basic-1_2_5/basic-1_2_5.2.update.aql | 27 + .../basic-1_2_5/basic-1_2_5.3.query.aql | 37 + .../fuzzyjoin/basic-1_2_6/basic-1_2_6.1.ddl.aql | 33 + .../basic-1_2_6/basic-1_2_6.2.update.aql | 23 + .../basic-1_2_6/basic-1_2_6.3.query.aql | 39 + .../fuzzyjoin/basic-1_2_7/basic-1_2_7.1.ddl.aql | 35 + .../basic-1_2_7/basic-1_2_7.2.update.aql | 27 + .../basic-1_2_7/basic-1_2_7.3.query.aql | 89 + .../fuzzyjoin/basic-1_3_1/basic-1_3_1.1.ddl.aql | 31 + .../basic-1_3_1/basic-1_3_1.2.update.aql | 27 + .../basic-1_3_1/basic-1_3_1.3.query.aql | 29 + .../basic-1_3_1/basic-1_3_1.4.query.aql | 40 + .../basic-1_3_1/basic-1_3_1.5.query.aql | 73 + .../basic-1_3_1/basic-1_3_1.6.query.aql | 93 + .../dblp-csx-2_5.2/dblp-csx-2_5.2.3.query.aql | 2 - .../dblp-csx-4.1.1/word-jaccard.1.ddl.aql | 58 + .../dblp-csx-4.1.1/word-jaccard.2.update.aql | 42 + .../dblp-csx-4.1.1/word-jaccard.3.ddl.aql | 23 + .../dblp-csx-4.1.1/word-jaccard.4.query.aql | 27 + .../ngram-jaccard-inline.1.ddl.aql | 59 + .../ngram-jaccard-inline.2.update.aql | 42 + .../ngram-jaccard-inline.3.ddl.aql | 23 + .../ngram-jaccard-inline.4.query.aql | 27 + .../dblp-csx-4.2.1/word-jaccard.1.ddl.aql | 58 + .../dblp-csx-4.2.1/word-jaccard.2.update.aql | 42 + .../dblp-csx-4.2.1/word-jaccard.3.ddl.aql | 23 + .../dblp-csx-4.2.1/word-jaccard.4.query.aql | 28 + .../ngram-jaccard-inline.1.ddl.aql | 58 + .../ngram-jaccard-inline.2.update.aql | 42 + .../ngram-jaccard-inline.3.ddl.aql | 23 + .../ngram-jaccard-inline.4.query.aql | 29 + .../dblp-csx-4.3.1/word-jaccard.1.ddl.aql | 58 + .../dblp-csx-4.3.1/word-jaccard.2.update.aql | 43 + .../dblp-csx-4.3.1/word-jaccard.3.ddl.aql | 23 + .../dblp-csx-4.3.1/word-jaccard.4.query.aql | 29 + .../ngram-jaccard-inline.1.ddl.aql | 58 + .../ngram-jaccard-inline.2.update.aql | 43 + .../ngram-jaccard-inline.3.ddl.aql | 23 + .../ngram-jaccard-inline.4.query.aql | 30 + .../dblp-csx-4.4.1/word-jaccard.1.ddl.aql | 57 + .../dblp-csx-4.4.1/word-jaccard.2.update.aql | 42 + .../dblp-csx-4.4.1/word-jaccard.3.ddl.aql | 23 + .../dblp-csx-4.4.1/word-jaccard.4.query.aql | 32 + .../dblp-csx-4.4.2/word-jaccard.1.ddl.aql | 58 + .../dblp-csx-4.4.2/word-jaccard.2.update.aql | 42 + .../dblp-csx-4.4.2/word-jaccard.3.ddl.aql | 18 + .../dblp-csx-4.4.2/word-jaccard.4.query.aql | 32 + .../dblp-csx-aqlplus_6.1.ddl.aql | 53 + .../dblp-csx-aqlplus_6.2.update.aql | 41 + .../dblp-csx-aqlplus_6.3.query.aql | 26 + .../dblp-csx-dblp-aqlplus_1.3.query.aql | 2 +- .../results/fuzzyjoin/basic-1_1/basic-1_1.1.adm | 1 + .../fuzzyjoin/basic-1_1_1/basic-1_1_1.1.adm | 13 + .../fuzzyjoin/basic-1_1_2/basic-1_1_2.10.adm | 1 + .../fuzzyjoin/basic-1_1_2/basic-1_1_2.3.adm | 1 + .../fuzzyjoin/basic-1_1_2/basic-1_1_2.4.adm | 1 + .../fuzzyjoin/basic-1_1_2/basic-1_1_2.5.adm | 1 + .../fuzzyjoin/basic-1_1_2/basic-1_1_2.6.adm | 1 + .../fuzzyjoin/basic-1_1_2/basic-1_1_2.7.adm | 1 + .../fuzzyjoin/basic-1_1_2/basic-1_1_2.8.adm | 1 + .../fuzzyjoin/basic-1_1_2/basic-1_1_2.9.adm | 1 + .../fuzzyjoin/basic-1_1_3/basic-1_1_2.3.adm | 1 + .../fuzzyjoin/basic-1_1_3/basic-1_1_2.4.adm | 1 + .../fuzzyjoin/basic-1_2_1/basic-1_2_1.3.adm | 1 + .../fuzzyjoin/basic-1_2_1/basic-1_2_1.4.adm | 4 + .../fuzzyjoin/basic-1_2_1/basic-1_2_1.5.adm | 0 .../fuzzyjoin/basic-1_2_1/basic-1_2_1.6.adm | 196 ++ .../fuzzyjoin/basic-1_2_1/basic-1_2_1.7.adm | 196 ++ .../fuzzyjoin/basic-1_2_2/basic-1_2_2.3.adm | 1 + .../fuzzyjoin/basic-1_2_2/basic-1_2_2.4.adm | 1 + .../fuzzyjoin/basic-1_2_2/basic-1_2_2.5.adm | 1 + .../fuzzyjoin/basic-1_2_2/basic-1_2_2.6.adm | 120 + .../fuzzyjoin/basic-1_2_3/basic-1_2_3.3.adm | 0 .../fuzzyjoin/basic-1_2_4/basic-1_2_4.3.adm | 1 + .../fuzzyjoin/basic-1_2_5/basic-1_2_5.3.adm | 1 + .../fuzzyjoin/basic-1_2_6/basic-1_2_6.3.adm | 1 + .../fuzzyjoin/basic-1_2_7/basic-1_2_7.3.adm | 0 .../fuzzyjoin/basic-1_3_1/basic-1_3_1.3.adm | 834 ++++++ .../fuzzyjoin/basic-1_3_1/basic-1_3_1.4.adm | 547 ++++ .../fuzzyjoin/basic-1_3_1/basic-1_3_1.5.adm | 1 + .../fuzzyjoin/basic-1_3_1/basic-1_3_1.6.adm | 1 + .../fuzzyjoin/dblp-csx-4.1.1/dblp-csx-4.1.1.adm | 7 + .../fuzzyjoin/dblp-csx-4.1.2/dblp-csx-4.1.2.adm | 7 + .../fuzzyjoin/dblp-csx-4.2.1/dblp-csx-4.2.1.adm | 2 + .../fuzzyjoin/dblp-csx-4.2.2/dblp-csx-4.2.2.adm | 5 + .../fuzzyjoin/dblp-csx-4.3.1/dblp-csx-4.3.1.adm | 2 + .../fuzzyjoin/dblp-csx-4.3.2/dblp-csx-4.3.2.adm | 5 + .../fuzzyjoin/dblp-csx-4.4.1/dblp-csx-4.4.1.adm | 5 + .../fuzzyjoin/dblp-csx-4.4.2/dblp-csx-4.4.2.adm | 5 + .../dblp-csx-aqlplus_6/dblp-csx-aqlplus_6.1.adm | 9 + .../src/test/resources/runtimets/testsuite.xml | 107 +- .../asterix/common/exceptions/ErrorCode.java | 1 + .../main/resources/asx_errormsg/en.properties | 1 + .../fuzzyjoin/similarity/SimilarityFilters.java | 21 +- .../similarity/SimilarityFiltersJaccard.java | 141 +- .../SimilarityJaccardPrefixEvaluator.java | 22 +- .../logical/visitors/IsomorphismUtilities.java | 64 + 261 files changed, 26850 insertions(+), 2387 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/compiler/provider/DefaultRuleSetFactory.java ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/compiler/provider/DefaultRuleSetFactory.java b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/compiler/provider/DefaultRuleSetFactory.java index 2db5d9a..eb41204 100644 --- a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/compiler/provider/DefaultRuleSetFactory.java +++ b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/compiler/provider/DefaultRuleSetFactory.java @@ -25,6 +25,7 @@ import org.apache.asterix.common.dataflow.ICcApplicationContext; import org.apache.asterix.optimizer.base.RuleCollections; import org.apache.hyracks.algebricks.common.exceptions.AlgebricksException; import org.apache.hyracks.algebricks.common.utils.Pair; +import org.apache.hyracks.algebricks.compiler.rewriter.rulecontrollers.SequentialFirstRuleCheckFixpointRuleController; import org.apache.hyracks.algebricks.compiler.rewriter.rulecontrollers.SequentialFixpointRuleController; import org.apache.hyracks.algebricks.compiler.rewriter.rulecontrollers.SequentialOnceRuleController; import org.apache.hyracks.algebricks.core.rewriter.base.AbstractRuleController; @@ -50,6 +51,8 @@ public class DefaultRuleSetFactory implements IRuleSetFactory { SequentialFixpointRuleController seqCtrlNoDfs = new SequentialFixpointRuleController(false); SequentialFixpointRuleController seqCtrlFullDfs = new SequentialFixpointRuleController(true); SequentialOnceRuleController seqOnceCtrl = new SequentialOnceRuleController(true); + SequentialFirstRuleCheckFixpointRuleController seqFirstRuleGateKeeperDfs = + new SequentialFirstRuleCheckFixpointRuleController(true); defaultLogicalRewrites.add(new Pair<>(seqOnceCtrl, RuleCollections.buildInitialTranslationRuleCollection())); defaultLogicalRewrites.add(new Pair<>(seqOnceCtrl, RuleCollections.buildTypeInferenceRuleCollection())); defaultLogicalRewrites.add(new Pair<>(seqOnceCtrl, RuleCollections.buildAutogenerateIDRuleCollection())); @@ -58,9 +61,8 @@ public class DefaultRuleSetFactory implements IRuleSetFactory { defaultLogicalRewrites .add(new Pair<>(seqCtrlNoDfs, RuleCollections.buildCondPushDownAndJoinInferenceRuleCollection())); defaultLogicalRewrites.add(new Pair<>(seqCtrlFullDfs, RuleCollections.buildLoadFieldsRuleCollection(appCtx))); - // fj - defaultLogicalRewrites.add(new Pair<>(seqCtrlFullDfs, RuleCollections.buildFuzzyJoinRuleCollection())); - // + defaultLogicalRewrites + .add(new Pair<>(seqFirstRuleGateKeeperDfs, RuleCollections.buildFuzzyJoinRuleCollection())); defaultLogicalRewrites .add(new Pair<>(seqCtrlFullDfs, RuleCollections.buildNormalizationRuleCollection(appCtx))); defaultLogicalRewrites http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/base/FuzzyUtils.java ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/base/FuzzyUtils.java b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/base/FuzzyUtils.java index d7e05b3..1e64c91 100644 --- a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/base/FuzzyUtils.java +++ b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/base/FuzzyUtils.java @@ -52,6 +52,7 @@ public class FuzzyUtils { return BuiltinFunctions.COUNTHASHED_WORD_TOKENS; case MULTISET: case ARRAY: + case UNION: case ANY: return null; default: @@ -119,4 +120,15 @@ public class FuzzyUtils { simFunction = simFunction.toLowerCase(); return simFunction; } + + public static String getSimFunction(FunctionIdentifier simFuncId) { + if (simFuncId.equals(BuiltinFunctions.SIMILARITY_JACCARD) + || simFuncId.equals(BuiltinFunctions.SIMILARITY_JACCARD_CHECK)) { + return JACCARD_FUNCTION_NAME; + } else if (simFuncId.equals(BuiltinFunctions.EDIT_DISTANCE) + || simFuncId.equals(BuiltinFunctions.EDIT_DISTANCE_CHECK)) { + return EDIT_DISTANCE_FUNCTION_NAME; + } + return null; + } } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/base/RuleCollections.java ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/base/RuleCollections.java b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/base/RuleCollections.java index cf01573..3c981d4 100644 --- a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/base/RuleCollections.java +++ b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/base/RuleCollections.java @@ -41,6 +41,7 @@ import org.apache.asterix.optimizer.rules.FeedScanCollectionToUnnest; import org.apache.asterix.optimizer.rules.FixReplicateOperatorOutputsRule; import org.apache.asterix.optimizer.rules.FullTextContainsParameterCheckRule; import org.apache.asterix.optimizer.rules.FuzzyEqRule; +import org.apache.asterix.optimizer.rules.FuzzyJoinRule; import org.apache.asterix.optimizer.rules.InjectTypeCastForFunctionArgumentsRule; import org.apache.asterix.optimizer.rules.InjectTypeCastForUnionRule; import org.apache.asterix.optimizer.rules.InlineUnnestFunctionRule; @@ -259,8 +260,15 @@ public final class RuleCollections { public static final List<IAlgebraicRewriteRule> buildFuzzyJoinRuleCollection() { List<IAlgebraicRewriteRule> fuzzy = new LinkedList<>(); - // fuzzy.add(new FuzzyJoinRule()); -- The non-indexed fuzzy join will be temporarily disabled. It should be enabled some time in the near future. - fuzzy.add(new InferTypesRule()); + fuzzy.add(new FuzzyJoinRule()); + fuzzy.add(new ExtractCommonExpressionsRule()); + fuzzy.add(new NestedSubplanToJoinRule()); + fuzzy.add(new PushSelectIntoJoinRule()); + fuzzy.add(new RemoveUnusedAssignAndAggregateRule()); + fuzzy.add(new InlineSubplanInputForNestedTupleSourceRule()); + fuzzy.add(new RemoveRedundantVariablesRule()); + fuzzy.add(new AsterixInlineVariablesRule()); + fuzzy.add(new RemoveUnusedAssignAndAggregateRule()); return fuzzy; } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/FuzzyJoinRule.java ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/FuzzyJoinRule.java b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/FuzzyJoinRule.java index d814ba0..dabfb82 100644 --- a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/FuzzyJoinRule.java +++ b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/FuzzyJoinRule.java @@ -19,6 +19,7 @@ package org.apache.asterix.optimizer.rules; import java.io.StringReader; +import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; import java.util.List; @@ -26,11 +27,17 @@ import java.util.Locale; import org.apache.asterix.aqlplus.parser.AQLPlusParser; import org.apache.asterix.aqlplus.parser.ParseException; +import org.apache.asterix.common.exceptions.CompilationException; +import org.apache.asterix.common.exceptions.ErrorCode; import org.apache.asterix.lang.common.base.Clause; -import org.apache.asterix.lang.common.struct.Identifier; +import org.apache.asterix.lang.common.struct.VarIdentifier; +import org.apache.asterix.lang.common.util.FunctionUtil; import org.apache.asterix.metadata.declared.MetadataProvider; +import org.apache.asterix.om.base.AFloat; +import org.apache.asterix.om.constants.AsterixConstantValue; import org.apache.asterix.om.functions.BuiltinFunctions; import org.apache.asterix.om.typecomputer.impl.TypeComputeUtils; +import org.apache.asterix.om.types.BuiltinType; import org.apache.asterix.om.types.IAType; import org.apache.asterix.optimizer.base.FuzzyUtils; import org.apache.asterix.translator.AqlPlusExpressionToPlanTranslator; @@ -48,6 +55,7 @@ import org.apache.hyracks.algebricks.core.algebra.base.LogicalVariable; import org.apache.hyracks.algebricks.core.algebra.expressions.AbstractFunctionCallExpression; import org.apache.hyracks.algebricks.core.algebra.expressions.ConstantExpression; import org.apache.hyracks.algebricks.core.algebra.expressions.IndexedNLJoinExpressionAnnotation; +import org.apache.hyracks.algebricks.core.algebra.expressions.ScalarFunctionCallExpression; import org.apache.hyracks.algebricks.core.algebra.expressions.VariableReferenceExpression; import org.apache.hyracks.algebricks.core.algebra.functions.AlgebricksBuiltinFunctions; import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier; @@ -55,6 +63,7 @@ import org.apache.hyracks.algebricks.core.algebra.operators.logical.AbstractBina import org.apache.hyracks.algebricks.core.algebra.operators.logical.AbstractLogicalOperator; import org.apache.hyracks.algebricks.core.algebra.operators.logical.LeftOuterJoinOperator; import org.apache.hyracks.algebricks.core.algebra.operators.logical.SelectOperator; +import org.apache.hyracks.algebricks.core.algebra.operators.logical.visitors.IsomorphismUtilities; import org.apache.hyracks.algebricks.core.algebra.operators.logical.visitors.LogicalOperatorDeepCopyWithNewVariablesVisitor; import org.apache.hyracks.algebricks.core.algebra.operators.logical.visitors.VariableUtilities; import org.apache.hyracks.algebricks.core.algebra.util.OperatorPropertiesUtil; @@ -62,118 +71,132 @@ import org.apache.hyracks.algebricks.core.rewriter.base.IAlgebraicRewriteRule; public class FuzzyJoinRule implements IAlgebraicRewriteRule { - private static HashSet<FunctionIdentifier> simFuncs = new HashSet<FunctionIdentifier>(); + private static HashSet<FunctionIdentifier> simFuncs = new HashSet<>(); static { simFuncs.add(BuiltinFunctions.SIMILARITY_JACCARD_CHECK); } + private final List<Collection<LogicalVariable>> previousPKs = new ArrayList<>(); + + // Please correspond the value to the anchors $$RIGHT_##_0 and ##RIGHT_3 in AQLPLUS. + private static final int SUBSET_RIGHT_INDEX = 1; // Corresponds to $$RIGHT_1_0 and ##RIGHT_1 + // private static final int LENGTH_LEFT_INDEX = 2; // Corresponds to $$RIGHT_2_0 and ##RIGHT_2 + private static final int LENGTH_RIGHT_INDEX = 3; // Corresponds to $$RIGHT_3_0 and ##RIGHT_3 + + // Step1: Initialize the host embedding aql to substitute the fuzzy equal condition such as $r.a ~= $s.b private static final String AQLPLUS = "" // // -- - Stage 3 - -- // - + "((#RIGHT), " + " (join((#LEFT), " + + "((##LEFT_0), " + " (join((##RIGHT_0), " // // -- -- - Stage 2 - -- // - + " (" + " join( " + " ( " + " #LEFT_1 " + " let $tokensUnrankedLeft := %s($$LEFT_1) " - + " let $lenLeft := len($tokensUnrankedLeft) " + " let $tokensLeft := " - + " for $token in $tokensUnrankedLeft " + " for $tokenRanked at $i in " + + " (" + "join( " + "( " + "##RIGHT_1 " + " let $tokensUnrankedRight := %s($$RIGHT_1) " + + " let $lenRight := len($tokensUnrankedRight) " + " let $tokensRight := " + + " for $token in $tokensUnrankedRight " + "for $tokenRanked at $i in " // // -- -- -- - Stage 1 - -- - // - // + " #LEFT_2 " - // + " let $id := $$LEFTPK_2 " - // + " for $token in %s($$LEFT_2) " - + " #RIGHT_2 " + " let $id := $$RIGHTPK_2 " + " for $token in %s($$RIGHT_2) " - + " /*+ hash */ " + " group by $tokenGroupped := $token with $id " - + " /*+ inmem 34 198608 */ " + " order by count($id), $tokenGroupped " - + " return $tokenGroupped " + // Since we use the right join branch R to generate our token order, this can shorten the prefix length of S + // in case of R ~= S, where some tokens are in S but not in R. + + " ##RIGHT_3 " + "let $id := $$RIGHTPK_3_0 " + "for $token in %s($$RIGHT_3) " + + " /*+ hash */ " + "group by $tokenGroupped := $token with $id " + + " order by count($id), $tokenGroupped return $tokenGroupped " // // -- -- -- - // - + " where $token = /*+ bcast */ $tokenRanked " + " order by $i " + " return $i " - + " for $prefixTokenLeft in subset-collection($tokensLeft, 0, prefix-len-%s(len($tokensLeft), %ff)) " - + " ),( " + " #RIGHT_1 " + " let $tokensUnrankedRight := %s($$RIGHT_1) " - + " let $lenRight := len($tokensUnrankedRight) " + " let $tokensRight := " - + " for $token in $tokensUnrankedRight " + " for $tokenRanked at $i in " + + " where $token = /*+ bcast */ $tokenRanked " + "order by $i " + "return $i " + + " for $prefixTokenRight in subset-collection($tokensRight, 0, prefix-len-%s(len($tokensRight), %ff)) " + + " ), " + "( " + "##LEFT_1 " + "let $tokensUnrankedLeft := %s($$LEFT_1) " + + " let $lenLeft := len($tokensUnrankedLeft) " + "let $tokensLeft := " + + " for $token in $tokensUnrankedLeft " + "for $tokenRanked at $i in " // // -- -- -- - Stage 1 - -- - // - // + " #LEFT_3 " - // + " let $id := $$LEFTPK_3 " - // + " for $token in %s($$LEFT_3) " - + " #RIGHT_3 " + " let $id := $$RIGHTPK_3 " + " for $token in %s($$RIGHT_3) " - + " /*+ hash */ " + " group by $tokenGroupped := $token with $id " - + " /*+ inmem 34 198608 */ " + " order by count($id), $tokenGroupped " - + " return $tokenGroupped " + + " ##RIGHT_2 " + "let $id := $$RIGHTPK_2_0 " + "for $token in %s($$RIGHT_2) " + + " /*+ hash */ " + "group by $tokenGroupped := $token with $id " + + " order by count($id), $tokenGroupped return $tokenGroupped " // // -- -- -- - // - + " where $token = /*+ bcast */ $tokenRanked " + " order by $i " + " return $i " - + " for $prefixTokenRight in subset-collection($tokensRight, 0, prefix-len-%s(len($tokensRight), %ff)) " - + " ), $prefixTokenLeft = $prefixTokenRight) " - + " let $sim := similarity-%s-prefix($lenLeft, $tokensLeft, $lenRight, $tokensRight, $prefixTokenLeft, %ff) " - + " where $sim >= %ff " + " /*+ hash*/ " - + " group by $idLeft := $$LEFTPK_1, $idRight := $$RIGHTPK_1 with $sim " + + " where $token = /*+ bcast */ $tokenRanked " + "order by $i " + "return $i " + // We use the input string $tokensUnrankedLeft instead of $tokensLeft to ensure it will not miss similar + // pairs when the prefix of S has been reduced in case of R ~= S, where some tokens are in S but not in R. + + " let $actualPreLen := prefix-len-%s(len($tokensUnrankedLeft), %ff) - $lenLeft + len($tokensLeft) " + + " for $prefixTokenLeft in subset-collection($tokensLeft, 0, $actualPreLen)) " + + " , $prefixTokenLeft = $prefixTokenRight) " + + "let $sim := similarity-%s-prefix($lenRight, $tokensRight, $lenLeft, $tokensLeft, $prefixTokenLeft, %ff) " + + "where $sim >= %ff " + "/*+ hash*/ " + "group by %s, %s with $sim " // // -- -- - // - + " ), $$LEFTPK = $idLeft)), $$RIGHTPK = $idRight)"; + + " ), %s)), %s)"; - private Collection<LogicalVariable> liveVars = new HashSet<LogicalVariable>(); + private static final String GROUPBY_LEFT = "$idLeft_%d := $$LEFTPK_1_%d"; + private static final String GROUPBY_RIGHT = "$idRight_%d := $$RIGHTPK_1_%d"; + private static final String JOIN_COND_LEFT = "$$LEFTPK_0_%d = $idLeft_%d"; + private static final String JOIN_COND_RIGHT = "$$RIGHTPK_0_%d = $idRight_%d"; + private static final String AQLPLUS_INNER_JOIN = "join"; + private static final String AQLPLUS_LEFTOUTER_JOIN = "loj"; @Override public boolean rewritePost(Mutable<ILogicalOperator> opRef, IOptimizationContext context) throws AlgebricksException { + AbstractLogicalOperator op = (AbstractLogicalOperator) opRef.getValue(); - // current opperator is join + // current operator should be a join. if (op.getOperatorTag() != LogicalOperatorTag.INNERJOIN && op.getOperatorTag() != LogicalOperatorTag.LEFTOUTERJOIN) { return false; } - // Find GET_ITEM function. + // Finds GET_ITEM function in the join condition. AbstractBinaryJoinOperator joinOp = (AbstractBinaryJoinOperator) op; - Mutable<ILogicalExpression> expRef = joinOp.getCondition(); - Mutable<ILogicalExpression> getItemExprRef = getSimilarityExpression(expRef); + Mutable<ILogicalExpression> exprRef = joinOp.getCondition(); + Mutable<ILogicalExpression> getItemExprRef = getSimilarityExpression(exprRef); if (getItemExprRef == null) { return false; } - // Check if the GET_ITEM function is on one of the supported similarity-check functions. + // Checks if the GET_ITEM function is on the one of the supported similarity-check functions. AbstractFunctionCallExpression getItemFuncExpr = (AbstractFunctionCallExpression) getItemExprRef.getValue(); Mutable<ILogicalExpression> argRef = getItemFuncExpr.getArguments().get(0); AbstractFunctionCallExpression simFuncExpr = (AbstractFunctionCallExpression) argRef.getValue(); if (!simFuncs.contains(simFuncExpr.getFunctionIdentifier())) { return false; } - // Skip this rule based on annotations. + // Skips this rule based on annotations. if (simFuncExpr.getAnnotations().containsKey(IndexedNLJoinExpressionAnnotation.INSTANCE)) { return false; } + // Gets both input branches of fuzzy join. List<Mutable<ILogicalOperator>> inputOps = joinOp.getInputs(); ILogicalOperator leftInputOp = inputOps.get(0).getValue(); ILogicalOperator rightInputOp = inputOps.get(1).getValue(); - List<Mutable<ILogicalExpression>> inputExps = simFuncExpr.getArguments(); + List<Mutable<ILogicalExpression>> inputExprs = simFuncExpr.getArguments(); + if (inputExprs.size() != 3) { + return false; + } - ILogicalExpression inputExp0 = inputExps.get(0).getValue(); - ILogicalExpression inputExp1 = inputExps.get(1).getValue(); + // Extracts Fuzzy similarity function. + ILogicalExpression leftOperatingExpr = inputExprs.get(0).getValue(); + ILogicalExpression rightOperatingExpr = inputExprs.get(1).getValue(); + ILogicalExpression thresholdConstantExpr = inputExprs.get(2).getValue(); - // left and right expressions are variables - if (inputExp0.getExpressionTag() != LogicalExpressionTag.VARIABLE - || inputExp1.getExpressionTag() != LogicalExpressionTag.VARIABLE) { + // left and right expressions should be variables. + if (leftOperatingExpr.getExpressionTag() != LogicalExpressionTag.VARIABLE + || rightOperatingExpr.getExpressionTag() != LogicalExpressionTag.VARIABLE + || thresholdConstantExpr.getExpressionTag() != LogicalExpressionTag.CONSTANT) { return false; } - LogicalVariable inputVar0 = ((VariableReferenceExpression) inputExp0).getVariableReference(); - LogicalVariable inputVar1 = ((VariableReferenceExpression) inputExp1).getVariableReference(); + LogicalVariable inputVar0 = ((VariableReferenceExpression) leftOperatingExpr).getVariableReference(); + LogicalVariable inputVar1 = ((VariableReferenceExpression) rightOperatingExpr).getVariableReference(); LogicalVariable leftInputVar; LogicalVariable rightInputVar; - - liveVars.clear(); + Collection<LogicalVariable> liveVars = new HashSet<>(); VariableUtilities.getLiveVariables(leftInputOp, liveVars); if (liveVars.contains(inputVar0)) { leftInputVar = inputVar0; @@ -182,72 +205,217 @@ public class FuzzyJoinRule implements IAlgebraicRewriteRule { leftInputVar = inputVar1; rightInputVar = inputVar0; } - - List<LogicalVariable> leftInputPKs = context.findPrimaryKey(leftInputVar); - List<LogicalVariable> rightInputPKs = context.findPrimaryKey(rightInputVar); - // Bail if primary keys could not be inferred. - if (leftInputPKs == null || rightInputPKs == null) { - return false; - } - // primary key has only one variable - if (leftInputPKs.size() != 1 || rightInputPKs.size() != 1) { - return false; - } + // leftInputPKs in currrentPKs keeps all PKs derived from the left branch in the current similarity fuzzyjoin. + List<LogicalVariable> leftInputPKs = findPrimaryKeysInSubplan(liveVars, context); + liveVars.clear(); + VariableUtilities.getLiveVariables(rightInputOp, liveVars); + // rightInputPKs in currentPKs keeps all PKs derived from the right branch in the current similarity fuzzyjoin. + List<LogicalVariable> rightInputPKs = findPrimaryKeysInSubplan(liveVars, context); IAType leftType = (IAType) context.getOutputTypeEnvironment(leftInputOp).getVarType(leftInputVar); - IAType rightType = (IAType) context.getOutputTypeEnvironment(rightInputOp).getVarType(rightInputVar); - // left-hand side and right-hand side of "~=" has the same type - IAType left2 = TypeComputeUtils.getActualType(leftType); - IAType right2 = TypeComputeUtils.getActualType(rightType); - if (!left2.deepEqual(right2)) { + if (!isPrefixFuzzyJoin(context, leftInputOp, rightInputOp, rightInputVar, leftInputPKs, rightInputPKs, + leftType)) { return false; } + // // -- - FIRE - -- // MetadataProvider metadataProvider = ((MetadataProvider) context.getMetadataProvider()); + // Steps 1 and 2. Generate the prefix-based fuzzy jon template. + String aqlPlus = generateAqlTemplate(metadataProvider, joinOp, simFuncExpr, leftInputPKs, leftType, + rightInputPKs, thresholdConstantExpr); + // Steps 3 and 4. Generate the prefix-based fuzzy join subplan. + ILogicalOperator outputOp = generatePrefixFuzzyJoinSubplan(context, metadataProvider, aqlPlus, leftInputOp, + leftInputPKs, leftInputVar, rightInputOp, rightInputPKs, rightInputVar); + + // Step 5. Bind the plan to the parent op referred by the following opRef. + SelectOperator extraSelect; + if (getItemExprRef != exprRef) { + // more than one join condition + getItemExprRef.setValue(ConstantExpression.TRUE); + switch (joinOp.getJoinKind()) { + case INNER: { + extraSelect = new SelectOperator(exprRef, false, null); + extraSelect.setSourceLocation(exprRef.getValue().getSourceLocation()); + extraSelect.getInputs().add(new MutableObject<>(outputOp)); + outputOp = extraSelect; + break; + } + case LEFT_OUTER: { + LeftOuterJoinOperator topJoin = (LeftOuterJoinOperator) outputOp; + setConditionForLeftOuterJoin(topJoin, exprRef); + break; + } + default: { + throw new IllegalStateException(); + } + } + } + opRef.setValue(outputOp); + OperatorPropertiesUtil.typeOpRec(opRef, context); + return true; + } + + /** + * To handle multiple fuzzy-join conditions on a same pair of datasets, this rule checks the PKs in bottom-up way. + * The previousPKs list incrementally maintains the PKs from a previous fuzzy-join operator's input branches. + * In addition, the given fuzzy-join operator has been successfully translated into a prefix-based fuzzy join + * sub-plan of the current fuzzy-join operator. There are two cases: + * 1. If the previousPKs list contains the currentPKs list (the PKs from the input branches of the current + * fuzzy-join operator), this means that the current fuzzy-join condition has no new input branch. This case + * SHOULD BE regarded as a SELECT over one of the previous fuzzy-joins. + * 2. Otherwise, we can apply this rule to the current fuzzy-join operator to a new prefix-based fuzzy-join plan. + */ + private boolean isPrefixFuzzyJoin(IOptimizationContext context, ILogicalOperator leftInputOp, + ILogicalOperator rightInputOp, LogicalVariable rightInputVar, List<LogicalVariable> leftInputPKs, + List<LogicalVariable> rightInputPKs, IAType leftType) throws AlgebricksException { + Collection<LogicalVariable> currentPKs = new HashSet<>(); + currentPKs.addAll(leftInputPKs); + currentPKs.addAll(rightInputPKs); + // If PKs derived from the both branches are SAME as that of a previous fuzzyjoin, we treat this fuzzyjoin + // condition as a select over that fuzzyjoin. + for (int i = 0; i < previousPKs.size(); i++) { + if (previousPKs.get(i).containsAll(currentPKs) && currentPKs.containsAll(previousPKs.get(i))) { + return false; + } + } + + //Suppose we want to query on the same dataset on the different fields, i.e. A.a1 ~= B.b1 AND A.a2 ~= B.b2 + //We conduct this query as a select over a fuzzyjoin based on the PK inclusion relationship. + previousPKs.add(currentPKs); + // Avoids the duplicated PK generation in findPrimaryKeysInSubplan, especially for multiway fuzzy join. + // Refer to fj-dblp-csx-hybrid.aql in the optimized tests for example. + IsomorphismUtilities.mergeHomogeneousPK(leftInputOp, leftInputPKs); + // Fails if primary keys could not be inferred. + if (leftInputPKs.isEmpty() || rightInputPKs.isEmpty()) { + return false; + } + + IAType rightType = (IAType) context.getOutputTypeEnvironment(rightInputOp).getVarType(rightInputVar); + // left-hand side and right-hand side of fuzzyjoin should be the same type + IAType actualLeftType = TypeComputeUtils.getActualType(leftType); + IAType actualRightType = TypeComputeUtils.getActualType(rightType); + if (!actualLeftType.deepEqual(actualRightType)) { + return false; + } + return true; + } + + private String generateAqlTemplate(MetadataProvider metadataProvider, AbstractBinaryJoinOperator joinOp, + AbstractFunctionCallExpression simFuncExpr, List<LogicalVariable> leftInputPKs, IAType leftType, + List<LogicalVariable> rightInputPKs, ILogicalExpression thresholdConstantExpr) throws AlgebricksException { FunctionIdentifier funcId = FuzzyUtils.getTokenizer(leftType.getTypeTag()); - String tokenizer; - if (funcId == null) { - tokenizer = ""; - } else { + String tokenizer = ""; + if (funcId != null) { tokenizer = funcId.getName(); } - float simThreshold = FuzzyUtils.getSimThreshold(metadataProvider); - String simFunction = FuzzyUtils.getSimFunction(metadataProvider); + String simFunction = FuzzyUtils.getSimFunction(simFuncExpr.getFunctionIdentifier()); + float simThreshold; + ConstantExpression constExpr = (ConstantExpression) thresholdConstantExpr; + AsterixConstantValue constVal = (AsterixConstantValue) constExpr.getValue(); + if (constVal.getObject().getType().equals(BuiltinType.AFLOAT)) { + simThreshold = ((AFloat) constVal.getObject()).getFloatValue(); + } else { + simThreshold = FuzzyUtils.getSimThreshold(metadataProvider); + } // finalize AQL+ query String prepareJoin; switch (joinOp.getJoinKind()) { - case INNER: { - prepareJoin = "join" + AQLPLUS; + case INNER: + prepareJoin = AQLPLUS_INNER_JOIN + AQLPLUS; break; + case LEFT_OUTER: + prepareJoin = AQLPLUS_LEFTOUTER_JOIN + AQLPLUS; + break; + default: + throw new CompilationException(ErrorCode.COMPILATION_INVALID_EXPRESSION); + } + String groupByLeft = ""; + String joinCondLeft = ""; + // Step2. By default, we triggered the prefix fuzzy join strategy, which needs to initialize the mapping from + // the shared token to the actual tuples of the both sides. left(right)InputPKs is used to extract those + // mappings from prefix tokens to tuples. + for (int i = 0; i < leftInputPKs.size(); i++) { + if (i > 0) { + groupByLeft += ", "; + joinCondLeft += " and "; } - case LEFT_OUTER: { - // TODO To make it work for Left Outer Joins, we should permute - // the #LEFT and #RIGHT at the top of the AQL+ query. But, when - // doing this, the - // fuzzyjoin/user-vis-int-vis-user-lot-aqlplus_1.aql (the one - // doing 3-way fuzzy joins) gives a different result. But even - // if we don't change the FuzzyJoinRule, permuting the for - // clauses in fuzzyjoin/user-vis-int-vis-user-lot-aqlplus_1.aql - // leads to different results, which suggests there is some - // other sort of bug. - return false; - // prepareJoin = "loj" + AQLPLUS; - // break; - } - default: { - throw new IllegalStateException(); - } + groupByLeft += String.format(Locale.US, GROUPBY_LEFT, i, i); + joinCondLeft += String.format(Locale.US, JOIN_COND_LEFT, i, i); } - String aqlPlus = String.format(Locale.US, prepareJoin, tokenizer, tokenizer, simFunction, simThreshold, - tokenizer, tokenizer, simFunction, simThreshold, simFunction, simThreshold, simThreshold); - LogicalVariable leftPKVar = leftInputPKs.get(0); - LogicalVariable rightPKVar = rightInputPKs.get(0); + String groupByRight = ""; + String joinCondRight = ""; + for (int i = 0; i < rightInputPKs.size(); i++) { + if (i > 0) { + groupByRight += ", "; + joinCondRight += " and "; + } + groupByRight += String.format(Locale.US, GROUPBY_RIGHT, i, i); + joinCondRight += String.format(Locale.US, JOIN_COND_RIGHT, i, i); + } + return String.format(Locale.US, prepareJoin, tokenizer, tokenizer, simFunction, simThreshold, tokenizer, + tokenizer, simFunction, simThreshold, simFunction, simThreshold, simThreshold, groupByLeft, + groupByRight, joinCondRight, joinCondLeft); + } + private ILogicalOperator generatePrefixFuzzyJoinSubplan(IOptimizationContext context, + MetadataProvider metadataProvider, String aqlPlus, ILogicalOperator leftInputOp, + List<LogicalVariable> leftInputPKs, LogicalVariable leftInputVar, ILogicalOperator rightInputOp, + List<LogicalVariable> rightInputPKs, LogicalVariable rightInputVar) throws AlgebricksException { + // Step3. Translate the tokenizer, join condition and group by (shared token) as shown + // in the above AQLPLUS template. Counter counter = new Counter(context.getVarCounter()); + // The translator will compile metadata internally. Run this compilation + // under the same transaction id as the "outer" compilation. + AqlPlusExpressionToPlanTranslator translator = new AqlPlusExpressionToPlanTranslator(metadataProvider, counter); + + LogicalOperatorDeepCopyWithNewVariablesVisitor copyVisitor = + new LogicalOperatorDeepCopyWithNewVariablesVisitor(context, context); + + // Step3.1. Substitute the variable references of the above AQLPLUS template with + // the actually attached variables. + translator.addOperatorToMetaScope(new VarIdentifier("##LEFT_0"), leftInputOp); + translator.addVariableToMetaScope(new VarIdentifier("$$LEFT_0"), leftInputVar); + for (int i = 0; i < leftInputPKs.size(); i++) { + translator.addVariableToMetaScope(new VarIdentifier("$$LEFTPK_0_" + i), leftInputPKs.get(i)); + } + + // Step3.2. right side again. + translator.addOperatorToMetaScope(new VarIdentifier("##RIGHT_0"), rightInputOp); + translator.addVariableToMetaScope(new VarIdentifier("$$RIGHT_0"), rightInputVar); + for (int i = 0; i < rightInputPKs.size(); i++) { + translator.addVariableToMetaScope(new VarIdentifier("$$RIGHTPK_0_" + i), rightInputPKs.get(i)); + } + + // Step3.3. the suffix 0-3 is used for identifying the different level of variable references. + ILogicalOperator leftInputOpCopy = copyVisitor.deepCopy(leftInputOp); + translator.addOperatorToMetaScope(new VarIdentifier("##LEFT_1"), leftInputOpCopy); + LogicalVariable leftInputVarCopy = copyVisitor.varCopy(leftInputVar); + translator.addVariableToMetaScope(new VarIdentifier("$$LEFT_1"), leftInputVarCopy); + for (int i = 0; i < leftInputPKs.size(); i++) { + leftInputVarCopy = copyVisitor.varCopy(leftInputPKs.get(i)); + translator.addVariableToMetaScope(new VarIdentifier("$$LEFTPK_1_" + i), leftInputVarCopy); + } + copyVisitor.updatePrimaryKeys(context); + copyVisitor.reset(); + + // Notice: pick side to run Stage 1, currently always picks RIGHT side. It means that the right side will + // produce the token order as well as its own token list. + for (int i = SUBSET_RIGHT_INDEX; i <= LENGTH_RIGHT_INDEX; i++) { + translator.addOperatorToMetaScope(new VarIdentifier("##RIGHT_" + i), copyVisitor.deepCopy(rightInputOp)); + LogicalVariable rightInputVarCopy = copyVisitor.varCopy(rightInputVar); + translator.addVariableToMetaScope(new VarIdentifier("$$RIGHT_" + i), rightInputVarCopy); + for (int j = 0; j < rightInputPKs.size(); j++) { + rightInputVarCopy = copyVisitor.varCopy(rightInputPKs.get(j)); + translator.addVariableToMetaScope(new VarIdentifier("$$RIGHTPK_" + i + "_" + j), rightInputVarCopy); + } + copyVisitor.updatePrimaryKeys(context); + copyVisitor.reset(); + } + counter.set(context.getVarCounter()); AQLPlusParser parser = new AQLPlusParser(new StringReader(aqlPlus)); parser.initScope(); @@ -256,115 +424,59 @@ public class FuzzyJoinRule implements IAlgebraicRewriteRule { try { clauses = parser.Clauses(); } catch (ParseException e) { - throw new AlgebricksException(e); + throw CompilationException.create(ErrorCode.COMPILATION_TRANSLATION_ERROR, e); } - // The translator will compile metadata internally. Run this compilation - // under the same transaction id as the "outer" compilation. - AqlPlusExpressionToPlanTranslator translator = new AqlPlusExpressionToPlanTranslator(metadataProvider, counter); - context.setVarCounter(counter.get()); - - LogicalOperatorDeepCopyWithNewVariablesVisitor deepCopyVisitor = - new LogicalOperatorDeepCopyWithNewVariablesVisitor(context, context); - translator.addOperatorToMetaScope(new Identifier("#LEFT"), leftInputOp); - translator.addVariableToMetaScope(new Identifier("$$LEFT"), leftInputVar); - translator.addVariableToMetaScope(new Identifier("$$LEFTPK"), leftPKVar); - - translator.addOperatorToMetaScope(new Identifier("#RIGHT"), rightInputOp); - translator.addVariableToMetaScope(new Identifier("$$RIGHT"), rightInputVar); - translator.addVariableToMetaScope(new Identifier("$$RIGHTPK"), rightPKVar); - - translator.addOperatorToMetaScope(new Identifier("#LEFT_1"), deepCopyVisitor.deepCopy(leftInputOp)); - translator.addVariableToMetaScope(new Identifier("$$LEFT_1"), deepCopyVisitor.varCopy(leftInputVar)); - translator.addVariableToMetaScope(new Identifier("$$LEFTPK_1"), deepCopyVisitor.varCopy(leftPKVar)); - deepCopyVisitor.updatePrimaryKeys(context); - deepCopyVisitor.reset(); - - // translator.addOperatorToMetaScope(new Identifier("#LEFT_2"), - // deepCopyVisitor.deepCopy(leftInputOp, null)); - // translator.addVariableToMetaScope(new Identifier("$$LEFT_2"), - // deepCopyVisitor.varCopy(leftInputVar)); - // translator.addVariableToMetaScope(new Identifier("$$LEFTPK_2"), - // deepCopyVisitor.varCopy(leftPKVar)); - // deepCopyVisitor.updatePrimaryKeys(context); - // deepCopyVisitor.reset(); - // - // translator.addOperatorToMetaScope(new Identifier("#LEFT_3"), - // deepCopyVisitor.deepCopy(leftInputOp, null)); - // translator.addVariableToMetaScope(new Identifier("$$LEFT_3"), - // deepCopyVisitor.varCopy(leftInputVar)); - // translator.addVariableToMetaScope(new Identifier("$$LEFTPK_3"), - // deepCopyVisitor.varCopy(leftPKVar)); - // deepCopyVisitor.updatePrimaryKeys(context); - // deepCopyVisitor.reset(); - - translator.addOperatorToMetaScope(new Identifier("#RIGHT_1"), deepCopyVisitor.deepCopy(rightInputOp)); - translator.addVariableToMetaScope(new Identifier("$$RIGHT_1"), deepCopyVisitor.varCopy(rightInputVar)); - translator.addVariableToMetaScope(new Identifier("$$RIGHTPK_1"), deepCopyVisitor.varCopy(rightPKVar)); - deepCopyVisitor.updatePrimaryKeys(context); - deepCopyVisitor.reset(); - - // TODO pick side to run Stage 1, currently always picks RIGHT side - translator.addOperatorToMetaScope(new Identifier("#RIGHT_2"), deepCopyVisitor.deepCopy(rightInputOp)); - translator.addVariableToMetaScope(new Identifier("$$RIGHT_2"), deepCopyVisitor.varCopy(rightInputVar)); - translator.addVariableToMetaScope(new Identifier("$$RIGHTPK_2"), deepCopyVisitor.varCopy(rightPKVar)); - deepCopyVisitor.updatePrimaryKeys(context); - deepCopyVisitor.reset(); - - translator.addOperatorToMetaScope(new Identifier("#RIGHT_3"), deepCopyVisitor.deepCopy(rightInputOp)); - translator.addVariableToMetaScope(new Identifier("$$RIGHT_3"), deepCopyVisitor.varCopy(rightInputVar)); - translator.addVariableToMetaScope(new Identifier("$$RIGHTPK_3"), deepCopyVisitor.varCopy(rightPKVar)); - deepCopyVisitor.updatePrimaryKeys(context); - deepCopyVisitor.reset(); - - ILogicalPlan plan = translator.translate(clauses); + // Step 4. The essential substitution with translator. + ILogicalPlan plan; + try { + plan = translator.translate(clauses); + } catch (CompilationException e) { + throw CompilationException.create(ErrorCode.COMPILATION_TRANSLATION_ERROR, e); + } context.setVarCounter(counter.get()); - ILogicalOperator outputOp = plan.getRoots().get(0).getValue(); + return plan.getRoots().get(0).getValue(); + } - SelectOperator extraSelect = null; - if (getItemExprRef != expRef) { - // more than one join condition - getItemExprRef.setValue(ConstantExpression.TRUE); - switch (joinOp.getJoinKind()) { - case INNER: { - extraSelect = new SelectOperator(expRef, false, null); - extraSelect.setSourceLocation(expRef.getValue().getSourceLocation()); - extraSelect.getInputs().add(new MutableObject<ILogicalOperator>(outputOp)); - outputOp = extraSelect; - break; - } - case LEFT_OUTER: { - if (outputOp.getOperatorTag() != LogicalOperatorTag.LEFTOUTERJOIN) { - throw new IllegalStateException(); - } - LeftOuterJoinOperator topJoin = (LeftOuterJoinOperator) outputOp; - topJoin.getCondition().setValue(expRef.getValue()); - break; - } - default: { - throw new IllegalStateException(); - } - } + // Since the generatePrefixFuzzyJoinSubplan generates the prefix-based join operators for the partial simJoin + // of expRef, we need to add the full condition expRef\getItemExprRef into the top-level operator of the plan. + // Notice: Any composite select on leftOuterJoin with fuzzyjoin condition inlined can be regarded as its example. + // Example: leftouterjoin-probe-pidx-with-join-edit-distance-check-idx_01.aql or with more extra conditions inlined. + private void setConditionForLeftOuterJoin(LeftOuterJoinOperator topJoin, Mutable<ILogicalExpression> expRef) { + // Combine the conditions of top join of aqlplus plan and the original join + AbstractFunctionCallExpression andFunc = + new ScalarFunctionCallExpression(FunctionUtil.getFunctionInfo(AlgebricksBuiltinFunctions.AND)); + + List<Mutable<ILogicalExpression>> conjs = new ArrayList<>(); + if (topJoin.getCondition().getValue().splitIntoConjuncts(conjs)) { + andFunc.getArguments().addAll(conjs); + } else { + andFunc.getArguments().add(new MutableObject<>(topJoin.getCondition().getValue())); } - opRef.setValue(outputOp); - OperatorPropertiesUtil.typeOpRec(opRef, context); - return true; + + List<Mutable<ILogicalExpression>> conjs2 = new ArrayList<>(); + if (expRef.getValue().splitIntoConjuncts(conjs2)) { + andFunc.getArguments().addAll(conjs2); + } else { + andFunc.getArguments().add(expRef); + } + topJoin.getCondition().setValue(andFunc); } /** * Look for GET_ITEM function call. */ - private Mutable<ILogicalExpression> getSimilarityExpression(Mutable<ILogicalExpression> expRef) { - ILogicalExpression exp = expRef.getValue(); + private Mutable<ILogicalExpression> getSimilarityExpression(Mutable<ILogicalExpression> exprRef) { + ILogicalExpression exp = exprRef.getValue(); if (exp.getExpressionTag() == LogicalExpressionTag.FUNCTION_CALL) { AbstractFunctionCallExpression funcExpr = (AbstractFunctionCallExpression) exp; if (funcExpr.getFunctionIdentifier().equals(BuiltinFunctions.GET_ITEM)) { - return expRef; + return exprRef; } if (funcExpr.getFunctionIdentifier().equals(AlgebricksBuiltinFunctions.AND)) { - for (int i = 0; i < 2; i++) { - Mutable<ILogicalExpression> expRefRet = getSimilarityExpression(funcExpr.getArguments().get(i)); + for (Mutable<ILogicalExpression> arg : funcExpr.getArguments()) { + Mutable<ILogicalExpression> expRefRet = getSimilarityExpression(arg); if (expRefRet != null) { return expRefRet; } @@ -374,9 +486,19 @@ public class FuzzyJoinRule implements IAlgebraicRewriteRule { return null; } - @Override - public boolean rewritePre(Mutable<ILogicalOperator> opRef, IOptimizationContext context) - throws AlgebricksException { - return false; + // To extract all the PKs of the liveVars referenced by on of the fuzzyjoin branch branch. + private List<LogicalVariable> findPrimaryKeysInSubplan(Collection<LogicalVariable> liveVars, + IOptimizationContext context) { + Collection<LogicalVariable> primaryKeys = new HashSet<>(); + for (LogicalVariable var : liveVars) { + List<LogicalVariable> pks = context.findPrimaryKey(var); + if (pks != null) { + primaryKeys.addAll(pks); + } + } + if (primaryKeys.isEmpty()) { + return new ArrayList<>(); + } + return new ArrayList<>(primaryKeys); } }