http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-complex/word-jaccard-check-multi-let.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-complex/word-jaccard-check-multi-let.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-complex/word-jaccard-check-multi-let.aql new file mode 100644 index 0000000..26b42ed --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-complex/word-jaccard-check-multi-let.aql @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Tests whether a keyword index is applied to optimize a selection query using the similarity-jaccard-check function on word tokens. + * Tests that the optimizer rule correctly drills through the let clauses. + * The index should be applied. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; + +create type DBLPTypetmp as open { + id: int32, + dblpid: string, + authors: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create index keyword_index on DBLP(nested.title: string?) type keyword enforced; + +write output to asterix_nc1:"rttest/inverted-index-complex_word-jaccard-check-multi-let.adm"; + +// This test is complex because we have three assigns to drill into. +for $paper in dataset('DBLP') +let $paper_tokens := word-tokens($paper.nested.title) +let $query_tokens := word-tokens("Transactions for Cooperative Environments") +let $jacc := similarity-jaccard-check($paper_tokens, $query_tokens, 0.8f) +where $jacc[0] +return {"Paper": $paper_tokens, "Query": $query_tokens }
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/leftouterjoin-probe-pidx-with-join-edit-distance-check-idx_01.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/leftouterjoin-probe-pidx-with-join-edit-distance-check-idx_01.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/leftouterjoin-probe-pidx-with-join-edit-distance-check-idx_01.aql new file mode 100644 index 0000000..d24c284 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/leftouterjoin-probe-pidx-with-join-edit-distance-check-idx_01.aql @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Test that left-outer-join may use two available indexes, one for primary index in prob subtree and another for secondary rtree index in index subtree. + * Issue : 730, 741 + * Expected Res : Success + * Date : 8th May 2014 + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; + +create type TwitterUserType as closed { + screen-name: string, + lang: string, + friends-count: int32, + statuses-count: int32, + name: string, + followers-count: int32 +} + +create type TweetMessageNestedType as open { + tweetid: int64, + user: TwitterUserType, + sender-location: point, + send-time: datetime, + referred-topics: {{ string }}, + countA: int32, + countB: int32 +} + +create type TweetMessageType as open { + nested: TweetMessageNestedType +} + +create dataset TweetMessages(TweetMessageType) +primary key nested.tweetid; + +create index msgNgramIx on TweetMessages(nested.message-text: string?) type ngram(3) enforced; + +write output to asterix_nc1:"rttest/inverted-index-join_leftouterjoin-probe-pidx-with-join-edit-distance-check_idx_01.adm"; + +for $t1 in dataset('TweetMessages') +where $t1.nested.tweetid > int64("240") +order by $t1.nested.tweetid +return { + "tweet": {"id": $t1.nested.tweetid, "topics" : $t1.nested.message-text} , + "similar-tweets": for $t2 in dataset('TweetMessages') + let $sim := edit-distance-check($t1.nested.message-text, $t2.nested.message-text, 7) + where $sim[0] and + $t2.nested.tweetid != $t1.nested.tweetid + order by $t2.nested.tweetid + return {"id": $t2.nested.tweetid, "topics" : $t2.nested.message-text} +}; http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance-check_02.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance-check_02.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance-check_02.aql new file mode 100644 index 0000000..5fc32d5 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance-check_02.aql @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Fuzzy joins two datasets, DBLP and CSX, based on the edit-distance-check function of their authors. + * DBLP has a 3-gram index on authors, and we expect the join to be transformed into an indexed nested-loop join. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; + +create type DBLPTypetmp as closed { + id: int32, + dblpid: string, + title: string, + authors: string, + misc: string +} + +create type CSXTypetmp as open { + id: int32, + csxid: string, + title: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create type CSXType as closed { + nested : CSXTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create dataset CSX(CSXType) primary key nested.id; + +create index ngram_index on CSX(nested.authors: string?) type ngram(3) enforced; + +write output to asterix_nc1:"rttest/inverted-index-join_ngram-edit-distance-check_01.adm"; + +for $b in dataset('DBLP') +for $a in dataset('CSX') +where edit-distance-check($a.nested.authors, $b.nested.authors, 3)[0] and $a.nested.id < $b.nested.id +return {"arec": $a, "brec": $b } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance-check_03.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance-check_03.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance-check_03.aql new file mode 100644 index 0000000..9758b48 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance-check_03.aql @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Self joins dataset DBLP, based on the edit-distance-check function of their authors. + * DBLP has a 3-gram index on authors, and we expect the join to be transformed into an indexed nested-loop join. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; + +create type DBLPTypetmp as open { + id: int32, + dblpid: string, + title: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create index ngram_index on DBLP(nested.authors: string?) type ngram(3) enforced; + +write output to asterix_nc1:"rttest/inverted-index-join_ngram-edit-distance-check_01.adm"; + +for $a in dataset('DBLP') +for $b in dataset('DBLP') +where edit-distance-check($a.nested.authors, $b.nested.authors, 3)[0] and $a.nested.id < $b.nested.id +return {"arec": $a, "brec": $b } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance-check_04.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance-check_04.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance-check_04.aql new file mode 100644 index 0000000..3291f52 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance-check_04.aql @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Fuzzy joins two datasets, DBLP and CSX, based on the edit-distance-check function of their authors. + * DBLP has a 3-gram index on authors, and we expect the join to be transformed into an indexed nested-loop join. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; + +create type DBLPTypetmp as open { + id: int32, + dblpid: string, + title: string, + misc: string +} + +create type CSXTypetmp as open { + id: int32, + csxid: string, + title: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create type CSXType as closed { + nested : CSXTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create dataset CSX(CSXType) primary key nested.id; + +create index ngram_index_DBLP on DBLP(nested.authors: string?) type ngram(3) enforced; + +create index ngram_index_CSX on CSX(nested.authors: string?) type ngram(3) enforced; + +write output to asterix_nc1:"rttest/inverted-index-join_ngram-edit-distance-check_01.adm"; + +for $a in dataset('DBLP') +for $b in dataset('CSX') +where edit-distance-check($a.nested.authors, $b.nested.authors, 3)[0] and $a.nested.id < $b.nested.id +return {"arec": $a, "brec": $b } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance-inline.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance-inline.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance-inline.aql new file mode 100644 index 0000000..b7ffdd4 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance-inline.aql @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Fuzzy self joins a dataset, DBLP, based on the edit-distance function of its authors. + * DBLP has a 3-gram index on authors, and we expect the join to be transformed into an indexed nested-loop join. + * We test the inlining of variables that enable the select to be pushed into the join for subsequent optimization with an index. + * We expect the top-level equi join introduced because of surrogate optimization to be removed, since it is not necessary. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; + +create type DBLPTypetmp as open { + id: int32, + dblpid: string, + title: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create index ngram_index on DBLP(nested.authors: string?) type ngram(3) enforced; + +write output to asterix_nc1:"rttest/inverted-index-join-noeqjoin_ngram-edit-distance-inline.adm"; + +for $a in dataset('DBLP') +for $b in dataset('DBLP') +let $ed := edit-distance($a.nested.authors, $b.nested.authors) +where $ed < 3 and $a.nested.id < $b.nested.id +return {"aauthors": $a.nested.authors, "bauthors": $b.nested.authors, "ed": $ed} http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance_02.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance_02.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance_02.aql new file mode 100644 index 0000000..4d1010a --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance_02.aql @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Fuzzy joins two datasets, DBLP and CSX, based on the edit-distance function of their authors. + * DBLP has a 3-gram index on authors, and we expect the join to be transformed into an indexed nested-loop join. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; + +create type DBLPTypetmp as closed { + id: int32, + dblpid: string, + title: string, + authors: string, + misc: string +} + +create type CSXTypetmp as open { + id: int32, + csxid: string, + title: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create type CSXType as closed { + nested : CSXTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create dataset CSX(CSXType) primary key nested.id; + +create index ngram_index on CSX(nested.authors: string?) type ngram(3) enforced; + +write output to asterix_nc1:"rttest/inverted-index-join_ngram-edit-distance_01.adm"; + +for $b in dataset('DBLP') +for $a in dataset('CSX') +where edit-distance($a.nested.authors, $b.nested.authors) < 3 and $a.nested.id < $b.nested.id +return {"arec": $a, "brec": $b } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance_03.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance_03.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance_03.aql new file mode 100644 index 0000000..eb55bb1 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance_03.aql @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Self joins dataset DBLP, based on the edit-distance function of their authors. + * DBLP has a 3-gram index on authors, and we expect the join to be transformed into an indexed nested-loop join. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; + +create type DBLPTypetmp as open { + id: int32, + dblpid: string, + title: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create index ngram_index on DBLP(nested.authors: string?) type ngram(3) enforced; + +write output to asterix_nc1:"rttest/inverted-index-join_ngram-edit-distance_01.adm"; + +for $a in dataset('DBLP') +for $b in dataset('DBLP') +where edit-distance($a.nested.authors, $b.nested.authors) < 3 and $a.nested.id < $b.nested.id +return {"arec": $a, "brec": $b } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance_04.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance_04.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance_04.aql new file mode 100644 index 0000000..e546740 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-edit-distance_04.aql @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Fuzzy joins two datasets, DBLP and CSX, based on the edit-distance function of their authors. + * DBLP has a 3-gram index on authors, and we expect the join to be transformed into an indexed nested-loop join. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; + +create type DBLPTypetmp as open { + id: int32, + dblpid: string, + title: string, + misc: string +} + +create type CSXTypetmp as open { + id: int32, + csxid: string, + title: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create type CSXType as closed { + nested : CSXTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create dataset CSX(CSXType) primary key nested.id; + +create index ngram_index_DBLP on DBLP(nested.authors: string?) type ngram(3) enforced; + +create index ngram_index_CSX on CSX(nested.authors: string?) type ngram(3) enforced; + +write output to asterix_nc1:"rttest/inverted-index-join_ngram-edit-distance_01.adm"; + +for $a in dataset('DBLP') +for $b in dataset('CSX') +where edit-distance($a.nested.authors, $b.nested.authors) < 3 and $a.nested.id < $b.nested.id +return {"arec": $a, "brec": $b } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-fuzzyeq-edit-distance_01.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-fuzzyeq-edit-distance_01.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-fuzzyeq-edit-distance_01.aql new file mode 100644 index 0000000..081f022 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-fuzzyeq-edit-distance_01.aql @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Fuzzy joins two datasets, DBLP and CSX, based on ~= using edit distance of their authors. + * DBLP has a 3-gram index on authors, and we expect the join to be transformed into an indexed nested-loop join. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; + +create type DBLPTypetmp as open { + id: int32, + dblpid: string, + title: string, + misc: string +} + +create type CSXTypetmp as closed { + id: int32, + csxid: string, + title: string, + authors: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create type CSXType as closed { + nested : CSXTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create dataset CSX(CSXType) primary key nested.id; + +create index ngram_index on DBLP(nested.authors: string?) type ngram(3) enforced; + +write output to asterix_nc1:"rttest/inverted-index-join_ngram-fuzzyeq-edit-distance_01.adm"; + +set simfunction 'edit-distance'; +set simthreshold '3'; + +for $b in dataset('CSX') +for $a in dataset('DBLP') +where $a.nested.authors ~= $b.nested.authors and $a.nested.id < $b.nested.id +return {"arec": $a, "brec": $b } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-fuzzyeq-edit-distance_03.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-fuzzyeq-edit-distance_03.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-fuzzyeq-edit-distance_03.aql new file mode 100644 index 0000000..cd29e65 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-fuzzyeq-edit-distance_03.aql @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Self joins dataset DBLP, based on ~= using edit distance of their authors. + * DBLP has a 3-gram index on authors, and we expect the join to be transformed into an indexed nested-loop join. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; + +create type DBLPTypetmp as open { + id: int32, + dblpid: string, + title: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create index ngram_index on DBLP(nested.authors: string?) type ngram(3) enforced; + +write output to asterix_nc1:"rttest/inverted-index-join_ngram-fuzzyeq-edit-distance_01.adm"; + +set simfunction 'edit-distance'; +set simthreshold '3'; + +for $a in dataset('DBLP') +for $b in dataset('DBLP') +where $a.nested.authors ~= $b.nested.authors and $a.nested.id < $b.nested.id +return {"arec": $a, "brec": $b } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-fuzzyeq-jaccard_02.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-fuzzyeq-jaccard_02.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-fuzzyeq-jaccard_02.aql new file mode 100644 index 0000000..2f7c560 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-fuzzyeq-jaccard_02.aql @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Fuzzy joins two datasets, DBLP and CSX, based on ~= using Jaccard of their titles' 3-gram tokens. + * DBLP has a 3-gram index on title, and we expect the join to be transformed into an indexed nested-loop join. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; +set import-private-functions 'true'; + +create type DBLPTypetmp as closed { + id: int32, + dblpid: string, + title: string, + authors: string, + misc: string +} + +create type CSXTypetmp as open { + id: int32, + csxid: string, + authors: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create type CSXType as closed { + nested : CSXTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create dataset CSX(CSXType) primary key nested.id; + +create index ngram_index on CSX(nested.title: string?) type ngram(3) enforced; + +write output to asterix_nc1:"rttest/inverted-index-join_ngram-fuzzyeq-jaccard_01.adm"; + +set simfunction 'jaccard'; +set simthreshold '0.5f'; + +for $b in dataset('DBLP') +for $a in dataset('CSX') +where gram-tokens($a.nested.title, 3, false) ~= gram-tokens($b.nested.title, 3, false) and $a.nested.id < $b.nested.id +return {"arec": $a, "brec": $b } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-fuzzyeq-jaccard_03.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-fuzzyeq-jaccard_03.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-fuzzyeq-jaccard_03.aql new file mode 100644 index 0000000..5e12569 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-fuzzyeq-jaccard_03.aql @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Self joins dataset DBLP, based on ~= using Jaccard of their titles' 3-gram tokens. + * DBLP has a 3-gram index on title, and we expect the join to be transformed into an indexed nested-loop join. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; +set import-private-functions 'true'; + +create type DBLPTypetmp as open { + id: int32, + dblpid: string, + authors: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create index ngram_index on DBLP(nested.title: string?) type ngram(3) enforced; + +write output to asterix_nc1:"rttest/inverted-index-join_ngram-fuzzyeq-jaccard_01.adm"; + +set simfunction 'jaccard'; +set simthreshold '0.5f'; + +for $a in dataset('DBLP') +for $b in dataset('DBLP') +where gram-tokens($a.nested.title, 3, false) ~= gram-tokens($b.nested.title, 3, false) and $a.nested.id < $b.nested.id +return {"arec": $a, "brec": $b } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard-check_02.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard-check_02.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard-check_02.aql new file mode 100644 index 0000000..5576768 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard-check_02.aql @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Fuzzy joins two datasets, DBLP and CSX, based on the similarity-jaccard-check function of their titles' 3-gram tokens. + * DBLP has a 3-gram index on title, and we expect the join to be transformed into an indexed nested-loop join. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; +set import-private-functions 'true'; + +create type DBLPTypetmp as closed { + id: int32, + dblpid: string, + title: string, + authors: string, + misc: string +} + +create type CSXTypetmp as open { + id: int32, + csxid: string, + authors: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create type CSXType as closed { + nested : CSXTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create dataset CSX(CSXType) primary key nested.id; + +create index ngram_index on CSX(nested.title: string?) type ngram(3) enforced; + +write output to asterix_nc1:"rttest/inverted-index-join_ngram-jaccard-check_01.adm"; + +for $b in dataset('DBLP') +for $a in dataset('CSX') +where similarity-jaccard-check(gram-tokens($a.nested.title, 3, false), gram-tokens($b.nested.title, 3, false), 0.5f)[0] + and $a.nested.id < $b.nested.id +return {"arec": $a, "brec": $b } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard-check_03.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard-check_03.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard-check_03.aql new file mode 100644 index 0000000..32829fb --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard-check_03.aql @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Self joins dataset DBLP, based on the similarity-jaccard-check function of their titles' 3-gram tokens. + * DBLP has a 3-gram index on title, and we expect the join to be transformed into an indexed nested-loop join. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; +set import-private-functions 'true'; + +create type DBLPTypetmp as open { + id: int32, + dblpid: string, + authors: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create index ngram_index on DBLP(nested.title: string?) type ngram(3) enforced; + +write output to asterix_nc1:"rttest/inverted-index-join_ngram-jaccard-check_01.adm"; + +for $a in dataset('DBLP') +for $b in dataset('DBLP') +where similarity-jaccard-check(gram-tokens($a.nested.title, 3, false), gram-tokens($b.nested.title, 3, false), 0.5f)[0] + and $a.nested.id < $b.nested.id +return {"arec": $a, "brec": $b } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard-check_04.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard-check_04.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard-check_04.aql new file mode 100644 index 0000000..dc4c210 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard-check_04.aql @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Fuzzy joins two datasets, DBLP and CSX, based on the similarity-jaccard-check function of their titles' 3-gram tokens. + * DBLP has a 3-gram index on title, and we expect the join to be transformed into an indexed nested-loop join. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; +set import-private-functions 'true'; + +create type DBLPTypetmp as open { + id: int32, + dblpid: string, + authors: string, + misc: string +} + +create type CSXTypetmp as open { + id: int32, + csxid: string, + authors: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create type CSXType as closed { + nested : CSXTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create dataset CSX(CSXType) primary key nested.id; + +create index ngram_index_DBLP on DBLP(nested.title: string?) type ngram(3) enforced; + +create index ngram_index_CSX on CSX(nested.title: string?) type ngram(3) enforced; + +write output to asterix_nc1:"rttest/inverted-index-join_ngram-jaccard-check_01.adm"; + +for $a in dataset('DBLP') +for $b in dataset('CSX') +where similarity-jaccard-check(gram-tokens($a.nested.title, 3, false), gram-tokens($b.nested.title, 3, false), 0.5f)[0] + and $a.nested.id < $b.nested.id +return {"arec": $a, "brec": $b } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard-inline.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard-inline.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard-inline.aql new file mode 100644 index 0000000..bd599b4 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard-inline.aql @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Fuzzy self joins a dataset, DBLP, based on the similarity-jaccard function of its titles' 3-gram tokens. + * DBLP has a 3-gram index on title, and we expect the join to be transformed into an indexed nested-loop join. + * We test the inlining of variables that enable the select to be pushed into the join for subsequent optimization with an index. + * We expect the top-level equi join introduced because of surrogate optimization to be removed, since it is not necessary. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; +set import-private-functions 'true'; + +create type DBLPTypetmp as open { + id: int32, + dblpid: string, + authors: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create index ngram_index on DBLP(nested.title: string?) type ngram(3) enforced; + +write output to asterix_nc1:"rttest/inverted-index-join-noeqjoin_ngram-jaccard-inline.adm"; + +for $a in dataset('DBLP') +for $b in dataset('DBLP') +let $jacc := similarity-jaccard(gram-tokens($a.nested.title, 3, false), gram-tokens($b.nested.title, 3, false)) +where $jacc >= 0.5f and $a.nested.id < $b.nested.id +return {"atitle": $a.nested.title, "btitle": $b.nested.title, "jacc": $jacc} http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard_02.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard_02.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard_02.aql new file mode 100644 index 0000000..e9f491b --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard_02.aql @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Fuzzy joins two datasets, DBLP and CSX, based on the similarity-jaccard function of their titles' 3-gram tokens. + * DBLP has a 3-gram index on title, and we expect the join to be transformed into an indexed nested-loop join. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; +set import-private-functions 'true'; + +create type DBLPTypetmp as closed { + id: int32, + dblpid: string, + title: string, + authors: string, + misc: string +} + +create type CSXTypetmp as open { + id: int32, + csxid: string, + authors: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create type CSXType as closed { + nested : CSXTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create dataset CSX(CSXType) primary key nested.id; + +create index ngram_index on CSX(nested.title: string?) type ngram(3) enforced; + +write output to asterix_nc1:"rttest/inverted-index-join_ngram-jaccard_01.adm"; + +for $b in dataset('DBLP') +for $a in dataset('CSX') +where similarity-jaccard(gram-tokens($a.nested.title, 3, false), gram-tokens($b.nested.title, 3, false)) >= 0.5f + and $a.nested.id < $b.nested.id +return {"arec": $a, "brec": $b } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard_03.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard_03.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard_03.aql new file mode 100644 index 0000000..e064c03 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard_03.aql @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Self joins dataset DBLP, based on the similarity-jaccard function of their titles' 3-gram tokens. + * DBLP has a 3-gram index on title, and we expect the join to be transformed into an indexed nested-loop join. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; +set import-private-functions 'true'; + +create type DBLPTypetmp as open { + id: int32, + dblpid: string, + authors: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create index ngram_index on DBLP(nested.title: string?) type ngram(3) enforced; + +write output to asterix_nc1:"rttest/inverted-index-join_ngram-jaccard_01.adm"; + +for $a in dataset('DBLP') +for $b in dataset('DBLP') +where similarity-jaccard(gram-tokens($a.nested.title, 3, false), gram-tokens($b.nested.title, 3, false)) >= 0.5f + and $a.nested.id < $b.nested.id +return {"arec": $a, "brec": $b } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard_04.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard_04.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard_04.aql new file mode 100644 index 0000000..b08eabf --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/ngram-jaccard_04.aql @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Fuzzy joins two datasets, DBLP and CSX, based on the similarity-jaccard function of their titles' 3-gram tokens. + * DBLP has a 3-gram index on title, and we expect the join to be transformed into an indexed nested-loop join. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; +set import-private-functions 'true'; + +create type DBLPTypetmp as open { + id: int32, + dblpid: string, + authors: string, + misc: string +} + +create type CSXTypetmp as open { + id: int32, + csxid: string, + authors: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create type CSXType as closed { + nested : CSXTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create dataset CSX(CSXType) primary key nested.id; + +create index ngram_index_DBLP on DBLP(nested.title: string?) type ngram(3) enforced; + +create index ngram_index_CSX on CSX(nested.title: string?) type ngram(3) enforced; + +write output to asterix_nc1:"rttest/inverted-index-join_ngram-jaccard_01.adm"; + +for $a in dataset('DBLP') +for $b in dataset('CSX') +where similarity-jaccard(gram-tokens($a.nested.title, 3, false), gram-tokens($b.nested.title, 3, false)) >= 0.5f + and $a.nested.id < $b.nested.id +return {"arec": $a, "brec": $b } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-fuzzyeq-jaccard_02.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-fuzzyeq-jaccard_02.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-fuzzyeq-jaccard_02.aql new file mode 100644 index 0000000..e19b4af --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-fuzzyeq-jaccard_02.aql @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Fuzzy joins two datasets, DBLP and CSX, based on ~= using Jaccard of their titles' word tokens. + * DBLP has a keyword index on title, and we expect the join to be transformed into an indexed nested-loop join. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; + +create type DBLPTypetmp as closed { + id: int32, + dblpid: string, + title: string, + authors: string, + misc: string +} + +create type CSXTypetmp as open { + id: int32, + csxid: string, + authors: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create type CSXType as closed { + nested : CSXTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create dataset CSX(CSXType) primary key nested.id; + +create index keyword_index on CSX(nested.title: string?) type keyword enforced; + +write output to asterix_nc1:"rttest/inverted-index-join_word-fuzzyeq-jaccard_01.adm"; + +set simfunction 'jaccard'; +set simthreshold '0.5f'; + +for $b in dataset('DBLP') +for $a in dataset('CSX') +where word-tokens($a.nested.title) ~= word-tokens($b.nested.title) and $a.nested.id < $b.nested.id +return {"arec": $a, "brec": $b } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-fuzzyeq-jaccard_03.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-fuzzyeq-jaccard_03.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-fuzzyeq-jaccard_03.aql new file mode 100644 index 0000000..5fd64c9 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-fuzzyeq-jaccard_03.aql @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Self joins dataset DBLP, based on ~= using Jaccard of their titles' word tokens. + * DBLP has a keyword index on title, and we expect the join to be transformed into an indexed nested-loop join. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; + +create type DBLPTypetmp as open { + id: int32, + dblpid: string, + authors: string, + misc: string +} +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create index keyword_index on DBLP(nested.title: string?) type keyword enforced; + +write output to asterix_nc1:"rttest/inverted-index-join_word-fuzzyeq-jaccard_01.adm"; + +set simfunction 'jaccard'; +set simthreshold '0.5f'; + +for $a in dataset('DBLP') +for $b in dataset('DBLP') +where word-tokens($a.nested.title) ~= word-tokens($b.nested.title) and $a.nested.id < $b.nested.id +return {"arec": $a, "brec": $b } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard-check-after-btree-access.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard-check-after-btree-access.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard-check-after-btree-access.aql new file mode 100644 index 0000000..9ef8bb2 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard-check-after-btree-access.aql @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Fuzzy self joins a dataset, TweetMessages, based on the similarity-jaccard-check function of its text-messages' word tokens. + * TweetMessages has a keyword index on text-message and btree index on the primary key tweetid, and we expect the join to be + * transformed into btree and inverted indexed nested-loop joins. We test whether the join condition can be transformed into + * multiple indexed nested loop joins of various type of indexes. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; + +create type TwitterUserType as closed { + screen-name: string, + lang: string, + friends-count: int32, + statuses-count: int32, + name: string, + followers-count: int32 +} + +create type TweetMessageNestedType as open { + tweetid: int64, + user: TwitterUserType, + sender-location: point, + send-time: datetime, + referred-topics: {{ string }}, + countA: int32, + countB: int32 +} + +create type TweetMessageType as closed { + nested: TweetMessageNestedType +} + +create dataset TweetMessages(TweetMessageType) +primary key nested.tweetid; + +create index twmSndLocIx on TweetMessages(nested.sender-location) type rtree; +create index msgCountAIx on TweetMessages(nested.countA) type btree; +create index msgCountBIx on TweetMessages(nested.countB) type btree; +create index msgTextIx on TweetMessages(nested.message-text: string?) type keyword enforced; + +write output to asterix_nc1:"rttest/inverted-index-join_word-jaccard-check-after-btree-access.adm"; + +for $t1 in dataset('TweetMessages') +for $t2 in dataset('TweetMessages') +let $sim := similarity-jaccard-check(word-tokens($t1.nested.message-text), word-tokens($t2.nested.message-text), 0.6f) +where $sim[0] and $t1.nested.tweetid < int64("20") and $t2.nested.tweetid != $t1.nested.tweetid +return { + "t1": $t1.nested.tweetid, + "t2": $t2.nested.tweetid, + "sim": $sim[1] +} http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard-check_02.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard-check_02.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard-check_02.aql new file mode 100644 index 0000000..583dda6 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard-check_02.aql @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Fuzzy joins two datasets, DBLP and CSX, based on the similarity-jaccard-check function of their titles' word tokens. + * DBLP has a keyword index on title, and we expect the join to be transformed into an indexed nested-loop join. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; + +create type DBLPTypetmp as closed { + id: int32, + dblpid: string, + title: string, + authors: string, + misc: string +} + +create type CSXTypetmp as open { + id: int32, + csxid: string, + authors: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create type CSXType as closed { + nested : CSXTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create dataset CSX(CSXType) primary key nested.id; + +create index keyword_index on CSX(nested.title: string?) type keyword enforced; + +write output to asterix_nc1:"rttest/inverted-index-join_word-jaccard-check_01.adm"; + +for $b in dataset('DBLP') +for $a in dataset('CSX') +where similarity-jaccard-check(word-tokens($a.nested.title), word-tokens($b.nested.title), 0.5f)[0] + and $a.nested.id < $b.nested.id +return {"arec": $a, "brec": $b } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard-check_03.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard-check_03.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard-check_03.aql new file mode 100644 index 0000000..2cff649 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard-check_03.aql @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Selg joins dataset DBLP, based on the similarity-jaccard-check function of their titles' word tokens. + * DBLP has a keyword index on title, and we expect the join to be transformed into an indexed nested-loop join. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; + +create type DBLPTypetmp as open { + id: int32, + dblpid: string, + authors: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create index keyword_index_DBLP on DBLP(nested.title: string?) type keyword enforced; + +write output to asterix_nc1:"rttest/inverted-index-join_word-jaccard-check_01.adm"; + +for $a in dataset('DBLP') +for $b in dataset('DBLP') +where similarity-jaccard-check(word-tokens($a.nested.title), word-tokens($b.nested.title), 0.5f)[0] + and $a.nested.id < $b.nested.id +return {"arec": $a, "brec": $b } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard-check_04.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard-check_04.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard-check_04.aql new file mode 100644 index 0000000..fe7b2f9 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard-check_04.aql @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Fuzzy joins two datasets, DBLP and CSX, based on the similarity-jaccard-check function of their titles' word tokens. + * DBLP has a keyword index on title, and we expect the join to be transformed into an indexed nested-loop join. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; + +create type DBLPTypetmp as open { + id: int32, + dblpid: string, + authors: string, + misc: string +} + +create type CSXTypetmp as open { + id: int32, + dblpid: string, + authors: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create type CSXType as closed { + nested : CSXTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create dataset CSX(CSXType) primary key nested.id; + +create index keyword_index on DBLP(nested.title: string?) type keyword enforced; + +create index keyword_index on CSX(nested.title: string?) type keyword enforced; + +write output to asterix_nc1:"rttest/inverted-index-join_word-jaccard-check_01.adm"; + +for $a in dataset('DBLP') +for $b in dataset('CSX') +where similarity-jaccard-check(word-tokens($a.nested.title), word-tokens($b.nested.title), 0.5f)[0] + and $a.nested.id < $b.nested.id +return {"arec": $a, "brec": $b } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard-inline.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard-inline.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard-inline.aql new file mode 100644 index 0000000..2ec1846 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard-inline.aql @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Fuzzy self joins a dataset, DBLP, based on the similarity-jaccard function of its titles' word tokens. + * DBLP has a keyword index on title, and we expect the join to be transformed into an indexed nested-loop join. + * We test the inlining of variables that enable the select to be pushed into the join for subsequent optimization with an index. + * We expect the top-level equi join introduced because of surrogate optimization to be removed, since it is not necessary. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; + +create type DBLPTypetmp as open { + id: int32, + dblpid: string, + authors: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create index keyword_index on DBLP(nested.title: string?) type keyword enforced; + +write output to asterix_nc1:"rttest/inverted-index-join-noeqjoin_word-jaccard-inline.adm"; + +for $a in dataset('DBLP') +for $b in dataset('DBLP') +let $jacc := similarity-jaccard(word-tokens($a.nested.title), word-tokens($b.nested.title)) +where $jacc >= 0.5f and $a.nested.id < $b.nested.id +return {"atitle": $a.nested.title, "btitle": $b.nested.title, "jacc": $jacc} http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard_02.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard_02.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard_02.aql new file mode 100644 index 0000000..2d52ae9 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard_02.aql @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Fuzzy joins two datasets, DBLP and CSX, based on the similarity-jaccard function of their titles' word tokens. + * DBLP has a keyword index on title, and we expect the join to be transformed into an indexed nested-loop join. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; + +create type DBLPTypetmp as closed { + id: int32, + dblpid: string, + title: string, + authors: string, + misc: string +} + +create type CSXTypetmp as open { + id: int32, + csxid: string, + authors: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create type CSXType as closed { + nested : CSXTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create dataset CSX(CSXType) primary key nested.id; + +create index keyword_index on CSX(nested.title: string?) type keyword enforced; + +write output to asterix_nc1:"rttest/inverted-index-join_word-jaccard_01.adm"; + +for $b in dataset('DBLP') +for $a in dataset('CSX') +where similarity-jaccard(word-tokens($a.nested.title), word-tokens($b.nested.title)) >= 0.5f + and $a.nested.id < $b.nested.id +return {"arec": $a, "brec": $b } + http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard_03.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard_03.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard_03.aql new file mode 100644 index 0000000..456ad8c --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard_03.aql @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Self joins dataset DBLP, based on the similarity-jaccard function of their titles' word tokens. + * DBLP has a keyword index on title, and we expect the join to be transformed into an indexed nested-loop join. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; + +create type DBLPTypetmp as open { + id: int32, + dblpid: string, + authors: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create index keyword_index on DBLP(nested.title: string?) type keyword enforced; + +write output to asterix_nc1:"rttest/inverted-index-join_word-jaccard_01.adm"; + +for $a in dataset('DBLP') +for $b in dataset('DBLP') +where similarity-jaccard(word-tokens($a.nested.title), word-tokens($b.nested.title)) >= 0.5f + and $a.nested.id < $b.nested.id +return {"arec": $a, "brec": $b } + http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard_04.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard_04.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard_04.aql new file mode 100644 index 0000000..8812d81 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/inverted-index-join/word-jaccard_04.aql @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Fuzzy joins two datasets, DBLP and CSX, based on the similarity-jaccard function of their titles' word tokens. + * DBLP has a keyword index on title, and we expect the join to be transformed into an indexed nested-loop join. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; + +create type DBLPTypetmp as open { + id: int32, + dblpid: string, + authors: string, + misc: string +} + +create type CSXTypetmp as open { + id: int32, + csxid: string, + authors: string, + misc: string +} + +create type DBLPType as closed { + nested : DBLPTypetmp +} + +create type CSXType as closed { + nested : CSXTypetmp +} + +create dataset DBLP(DBLPType) primary key nested.id; + +create dataset CSX(CSXType) primary key nested.id; + +create index keyword_index on DBLP(nested.title: string?) type keyword enforced; + +create index keyword_index on CSX(nested.title: string?) type keyword enforced; + +write output to asterix_nc1:"rttest/inverted-index-join_word-jaccard_01.adm"; + +for $a in dataset('DBLP') +for $b in dataset('CSX') +where similarity-jaccard(word-tokens($a.nested.title), word-tokens($b.nested.title)) >= 0.5f + and $a.nested.id < $b.nested.id +return {"arec": $a, "brec": $b } + http://git-wip-us.apache.org/repos/asf/asterixdb/blob/c4dbb614/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/rtree-index-join/leftouterjoin-probe-pidx-with-join-rtree-sidx_02.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/rtree-index-join/leftouterjoin-probe-pidx-with-join-rtree-sidx_02.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/rtree-index-join/leftouterjoin-probe-pidx-with-join-rtree-sidx_02.aql new file mode 100644 index 0000000..8499c9a --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/nested-open-index/rtree-index-join/leftouterjoin-probe-pidx-with-join-rtree-sidx_02.aql @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Test that left-outer-join may use two available indexes, one for primary index in prob subtree and another for secondary rtree index in index subtree. + * Issue : 730, 741 + * Expected Res : Success + * Date : 8th May 2014 + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; + +create type TwitterUserType as closed { + screen-name: string, + lang: string, + friends-count: int32, + statuses-count: int32, + name: string, + followers-count: int32 +} + +create type TweetMessageNestedType as open { + tweetid: int64, + user: TwitterUserType, + send-time: datetime, + referred-topics: {{ string }}, + message-text: string, + countA: int32, + countB: int32 +} + +create type TweetMessageType as open { + nested: TweetMessageNestedType +} + +create dataset TweetMessages(TweetMessageType) +primary key nested.tweetid; + +create index twmSndLocIx on TweetMessages(nested.sender-location: point?) type rtree enforced; +create index msgCountAIx on TweetMessages(nested.countA) type btree; +create index msgCountBIx on TweetMessages(nested.countB) type btree; +create index msgTextIx on TweetMessages(nested.message-text) type keyword; + +write output to asterix_nc1:"rttest/rtree-index-join_leftouterjoin-probe-pidx-with-join-rtree-sidx_02.adm"; + +for $t1 in dataset('TweetMessages') +let $n := create-circle($t1.nested.sender-location, 0.5) +where $t1.nested.tweetid < int64("10") +order by $t1.nested.tweetid +return { +"tweetid1": $t1.nested.tweetid, +"loc1":$t1.nested.sender-location, +"nearby-message": for $t2 in dataset('TweetMessages') + where spatial-intersect($t2.nested.sender-location, $n) and $t1.nested.tweetid != $t2.nested.tweetid + order by $t2.nested.tweetid + return {"tweetid2":$t2.nested.tweetid, "loc2":$t2.nested.sender-location} +};