Repository: asterixdb Updated Branches: refs/heads/master 90fb051a0 -> ab01c87e5
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/ab01c87e/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.1.ddl.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.1.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.1.ddl.aql new file mode 100644 index 0000000..b6732c4 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.1.ddl.aql @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Description : Full-text search non-index test + * : This test is intended to verify that the full-text search works as expected. + * : But, the form of the query is join. So, each keyword from the outer dataset will be processed + * : separately. Thus, query #3 and query #4 should generate the same result. + * : query #3 - two string values in [an ordered list] query with "any" option. + * : an ordered list is first initialized by let clause and is being used. 
+ * : in this case, "any" option that enforces a disjunctive search will be applied. + * : query #4 - the same as query #3, but with a different option - "all" + * : in this case, we explicitly specify "all" option that enforces a conjunctive search. + * : query #5 - the same as query #4, but without any option that is equivalent to "all". + * Expected Result : Success + * +*/ + +drop dataverse twitter if exists; +create dataverse twitter if not exists; +use dataverse twitter +create type typeUser if not exists as open { + id: int64, + name: string, + screen_name : string, + lang : string, + location: string, + create_at: date, + description: string, + followers_count: int32, + friends_count: int32, + statues_count: int64 +} +create type typePlace if not exists as open{ + country : string, + country_code : string, + full_name : string, + id : string, + name : string, + place_type : string, + bounding_box : rectangle +} +create type typeGeoTag if not exists as open { + stateID: int32, + stateName: string, + countyID: int32, + countyName: string, + cityID: int32?, + cityName: string? +} +create type typeTweet if not exists as open{ + create_at : datetime, + id: int64, + "text": string, + in_reply_to_status : int64, + in_reply_to_user : int64, + favorite_count : int64, + coordinate: point?, + retweet_count : int64, + lang : string, + is_retweet: boolean, + hashtags : {{ string }} ?, + user_mentions : {{ int64 }} ? 
, + user : typeUser, + place : typePlace?, + geo_tag: typeGeoTag +} +create dataset ds_tweet(typeTweet) if not exists primary key id +using compaction policy prefix (("max-mergable-component-size"="134217728"),("max-tolerance-component-count"="10")) with filter on create_at; http://git-wip-us.apache.org/repos/asf/asterixdb/blob/ab01c87e/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.2.update.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.2.update.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.2.update.aql new file mode 100644 index 0000000..6947e27 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.2.update.aql @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +use dataverse twitter; + +load dataset ds_tweet +using localfs +(("path"="asterix_nc1://data/fulltext/cloudberry_sample_tweet.adm"),("format"="adm")); http://git-wip-us.apache.org/repos/asf/asterixdb/blob/ab01c87e/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.3.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.3.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.3.query.aql new file mode 100644 index 0000000..c795d40 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.3.query.aql @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +use dataverse twitter; + +for $t in dataset twitter.ds_tweet +where ftcontains($t.'text', ['good']) +order by $t.id +return {"id":$t.id} + http://git-wip-us.apache.org/repos/asf/asterixdb/blob/ab01c87e/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.4.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.4.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.4.query.aql new file mode 100644 index 0000000..8892da0 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.4.query.aql @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +use dataverse twitter; + +for $t in dataset twitter.ds_tweet +where ftcontains($t.'text', ['good','difficult']) +order by $t.id +return {"id":$t.id} + http://git-wip-us.apache.org/repos/asf/asterixdb/blob/ab01c87e/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.5.query.aql ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.5.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.5.query.aql new file mode 100644 index 0000000..b3c61b3 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fulltext/fulltext-08/fulltext-08.5.query.aql @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +use dataverse twitter; + +for $t in dataset twitter.ds_tweet +where ftcontains($t.'text', ['good','difficult'], {'mode':'any'}) +order by $t.id +return {"id":$t.id} + http://git-wip-us.apache.org/repos/asf/asterixdb/blob/ab01c87e/asterixdb/asterix-app/src/test/resources/runtimets/results/fulltext/fulltext-08/fulltext-08.3.adm ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/fulltext/fulltext-08/fulltext-08.3.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/fulltext/fulltext-08/fulltext-08.3.adm new file mode 100644 index 0000000..85c3c4f --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/fulltext/fulltext-08/fulltext-08.3.adm @@ -0,0 +1,3 @@ +{ "id": 668945643054870528 } +{ "id": 668945646725017600 } +{ "id": 668945653892911104 } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/ab01c87e/asterixdb/asterix-app/src/test/resources/runtimets/results/fulltext/fulltext-08/fulltext-08.4.adm ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/fulltext/fulltext-08/fulltext-08.4.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/fulltext/fulltext-08/fulltext-08.4.adm new file mode 100644 index 0000000..17babd8 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/fulltext/fulltext-08/fulltext-08.4.adm @@ -0,0 +1 @@ +{ "id": 668945643054870528 } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/ab01c87e/asterixdb/asterix-app/src/test/resources/runtimets/results/fulltext/fulltext-08/fulltext-08.5.adm ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/fulltext/fulltext-08/fulltext-08.5.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/fulltext/fulltext-08/fulltext-08.5.adm new file mode 100644 
index 0000000..2d91ff6 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/fulltext/fulltext-08/fulltext-08.5.adm @@ -0,0 +1,4 @@ +{ "id": 668945643054870528 } +{ "id": 668945646725017600 } +{ "id": 668945651263115264 } +{ "id": 668945653892911104 } http://git-wip-us.apache.org/repos/asf/asterixdb/blob/ab01c87e/asterixdb/asterix-app/src/test/resources/runtimets/testsuite.xml ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite.xml index 7a486b7..956ea53 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite.xml +++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite.xml @@ -421,6 +421,11 @@ </compilation-unit> </test-case> <test-case FilePath="fulltext"> + <compilation-unit name="fulltext-08"> + <output-dir compare="Text">fulltext-08</output-dir> + </compilation-unit> + </test-case> + <test-case FilePath="fulltext"> <compilation-unit name="fulltext-index-01"> <output-dir compare="Text">fulltext-index-01</output-dir> </compilation-unit> http://git-wip-us.apache.org/repos/asf/asterixdb/blob/ab01c87e/asterixdb/asterix-doc/src/site/markdown/aql/fulltext.md ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-doc/src/site/markdown/aql/fulltext.md b/asterixdb/asterix-doc/src/site/markdown/aql/fulltext.md index 4fe17ac..bc0b398 100644 --- a/asterixdb/asterix-doc/src/site/markdown/aql/fulltext.md +++ b/asterixdb/asterix-doc/src/site/markdown/aql/fulltext.md @@ -56,6 +56,14 @@ Thus, "Voice" or "voice" will be evaluated as the same word. The DDL and DML of TinySocial can be found in [ADM: Modeling Semistructed Data in AsterixDB](primer.html#ADM:_Modeling_Semistructed_Data_in_AsterixDB). +The same query can be also expressed in the SQL++. 
+ + use TinySocial; + + select element {"id":msg.id} + from TweetMessages as msg + where TinySocial.ftcontains(msg.`message-text`, "voice", {"mode":"any"}) + The `Expression1` is an expression that should be evaluable as a string at runtime as in the above example where `$msg.message-text` is a string field. The `Expression2` can be a string, an (un)ordered list of string value(s), or an expression. In the last case, the given expression should be evaluable @@ -103,3 +111,13 @@ or “sound is not clear. You may need to install a new system.” ... where ftcontains($msg.message-text, ["sound", "system"], {"mode":"all"}) ... where ftcontains($msg.message-text, ["sound", "system"]) + + +## <a id="FulltextIndex">Creating and utilizing a Full-text index</a> <font size="4"><a href="#toc">[Back to TOC]</a></font> ## + +When there is a full-text index on the field that is being searched, rather than scanning all records, +AsterixDB can utilize that index to expedite the execution of an FTS query. To create a full-text index, +you need to specify the index type as `fulltext` in your DDL statement. For instance, the following DDL +statement creates a full-text index on the TweetMessages.message-text attribute. 
+ + create index messageFTSIdx on TweetMessages(message-text) type fulltext; http://git-wip-us.apache.org/repos/asf/asterixdb/blob/ab01c87e/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/FullTextContainsEvaluator.java ---------------------------------------------------------------------- diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/FullTextContainsEvaluator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/FullTextContainsEvaluator.java index b94821f..0a48f6f 100644 --- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/FullTextContainsEvaluator.java +++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/FullTextContainsEvaluator.java @@ -244,7 +244,16 @@ public class FullTextContainsEvaluator implements IScalarEvaluator { int queryTokenCount = 0; int uniqueQueryTokenCount = 0; + int numBytesToStoreLength; + // Reset the tokenizer for the given keywords in the given query + if (typeTag2 == ATypeTag.STRING) { + // How many bytes are required to store the length of the given token? + numBytesToStoreLength = UTF8StringUtil + .getNumBytesToStoreLength(UTF8StringUtil.getUTFLength(queryArray, queryArrayStartOffset)); + queryArrayStartOffset = queryArrayStartOffset + numBytesToStoreLength; + queryArrayLength = queryArrayLength - numBytesToStoreLength; + } tokenizerForRightArray.reset(queryArray, queryArrayStartOffset, queryArrayLength); // Create tokens from the given query predicate @@ -256,7 +265,6 @@ public class FullTextContainsEvaluator implements IScalarEvaluator { // We don't store the actual value of this token since we can access it via offset and length. 
int tokenOffset = tokenizerForRightArray.getToken().getStartOffset(); int tokenLength = tokenizerForRightArray.getToken().getTokenLength(); - int numBytesToStoreLength; // If a token comes from a string tokenizer, each token doesn't have the length data // in the beginning. Instead, if a token comes from an (un)ordered list, each token has @@ -352,7 +360,14 @@ public class FullTextContainsEvaluator implements IScalarEvaluator { // The left side: field (document) // Resets the tokenizer for the given keywords in a document. - tokenizerForLeftArray.reset(arg1.getByteArray(), arg1.getStartOffset(), arg1.getLength()); + + // How many bytes are required to store the length of the given string? + int numBytesToStoreLength = UTF8StringUtil + .getNumBytesToStoreLength(UTF8StringUtil.getUTFLength(arg1.getByteArray(), arg1.getStartOffset())); + int startOffset = arg1.getStartOffset() + numBytesToStoreLength; + int length = arg1.getLength() - numBytesToStoreLength; + + tokenizerForLeftArray.reset(arg1.getByteArray(), startOffset, length); // Creates tokens from a field in the left side (document) while (tokenizerForLeftArray.hasNext()) {