This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 33ab7751104 [feature](inverted index) add slop functionality to
match_phrase #33225 (#33809)
33ab7751104 is described below
commit 33ab7751104f2657995788cf9936eceea3437546
Author: zzzxl <[email protected]>
AuthorDate: Thu Apr 18 14:30:37 2024 +0800
[feature](inverted index) add slop functionality to match_phrase #33225
(#33809)
---
.../inverted_index/query/phrase_query.cpp | 48 ++++++++
.../segment_v2/inverted_index/query/phrase_query.h | 29 +++++
.../rowset/segment_v2/inverted_index_reader.cpp | 10 ++
.../test_index_match_phrase_slop.out | 74 +++++++++++++
.../test_index_match_phrase_slop.groovy | 122 +++++++++++++++++++++
5 files changed, 283 insertions(+)
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp
new file mode 100644
index 00000000000..9e57ee6e29d
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "phrase_query.h"
+
+#include <charconv>
+
+namespace doris::segment_v2 {
+
+// Extracts an optional trailing slop suffix from a match_phrase query string.
+//
+// The suffix is recognised only when the text after the LAST space in `query`
+// is '~' followed by one or more decimal digits, e.g. "glad crossed ~4".
+// In that case the digits are parsed into `slop` and `query` is truncated at
+// that last space (leaving "glad crossed"). In every other case, such as
+// "crossed~4", "glad crossed ~4 " or "glad crossed ~4.", both arguments are
+// left untouched and the whole string is treated as phrase text.
+//
+// Returns Status::OK() unless the digits cannot be converted to int32_t (for
+// example on overflow); then INVERTED_INDEX_INVALID_PARAMETERS is returned and
+// neither `query` nor `slop` is modified.
+Status PhraseQuery::parser_slop(std::string& query, int32_t& slop) {
+    const size_t last_space_pos = query.find_last_of(' ');
+    if (last_space_pos == std::string::npos) {
+        return Status::OK();
+    }
+
+    // A suffix needs at least '~' plus one more character after the space.
+    const size_t tilde_pos = last_space_pos + 1;
+    if (tilde_pos + 1 >= query.size() || query[tilde_pos] != '~') {
+        return Status::OK();
+    }
+
+    const std::string_view slop_str = std::string_view(query).substr(tilde_pos + 1);
+    const bool all_digits = std::all_of(slop_str.begin(), slop_str.end(),
+                                        [](unsigned char c) { return std::isdigit(c) != 0; });
+    if (!all_digits) {
+        // Something like "~4." is ordinary phrase text, not a slop suffix.
+        return Status::OK();
+    }
+
+    // std::from_chars takes `const char*`. The iterator type of string_view is
+    // implementation-defined, so pass raw pointers obtained from data().
+    const char* first = slop_str.data();
+    const char* last = first + slop_str.size();
+    int32_t parsed = 0;
+    const auto result = std::from_chars(first, last, parsed);
+    if (result.ec != std::errc()) {
+        return Status::Error<doris::ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>(
+                "PhraseQuery parser failed: {}", query);
+    }
+
+    slop = parsed;
+    // Drop the " ~N" suffix in place; no temporary string is needed.
+    query.resize(last_space_pos);
+    return Status::OK();
+}
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h
new file mode 100644
index 00000000000..1b6c559c849
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h
@@ -0,0 +1,29 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "common/status.h"
+
+namespace doris::segment_v2 {
+
+class PhraseQuery { // Helper for match_phrase query strings; declares no data members.
+public: // parser_slop: moves a trailing " ~<digits>" token (after the last space) out of `query` into `slop`.
+ static Status parser_slop(std::string& query, int32_t& slop); // non-OK Status if std::from_chars rejects the digits
+};
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index 5dc9ccbceea..ab79c159f44 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -58,6 +58,7 @@
#include
"olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h"
#include "olap/rowset/segment_v2/inverted_index/query/conjunction_query.h"
#include "olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h"
+#include "olap/rowset/segment_v2/inverted_index/query/phrase_query.h"
#include "olap/rowset/segment_v2/inverted_index/query/regexp_query.h"
#include "olap/rowset/segment_v2/inverted_index_cache.h"
#include "olap/rowset/segment_v2/inverted_index_compound_directory.h"
@@ -264,10 +265,15 @@ Status FullTextIndexReader::query(OlapReaderStatistics*
stats, RuntimeState* run
auto index_file_path = index_dir / index_file_name;
try {
+ int32_t slop = 0;
std::vector<std::string> analyse_result;
if (query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY) {
analyse_result.emplace_back(search_str);
} else {
+ if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
+ RETURN_IF_ERROR(PhraseQuery::parser_slop(search_str, slop));
+ }
+
InvertedIndexCtxSPtr inverted_index_ctx =
std::make_shared<InvertedIndexCtx>();
inverted_index_ctx->parser_type =
get_inverted_index_parser_type_from_string(
get_parser_string_from_properties(_index_meta.properties()));
@@ -319,6 +325,9 @@ Status FullTextIndexReader::query(OlapReaderStatistics*
stats, RuntimeState* run
str_tokens += token;
str_tokens += " ";
}
+ if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
+ str_tokens += " " + std::to_string(slop);
+ }
auto* cache = InvertedIndexQueryCache::instance();
InvertedIndexQueryCache::CacheKey cache_key;
@@ -349,6 +358,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics*
stats, RuntimeState* run
phrase_query->add(term);
_CLDECDELETE(term);
}
+ phrase_query->setSlop(slop);
query.reset(phrase_query);
res = normal_index_search(stats, query_type,
index_searcher,
null_bitmap_already_read, query,
term_match_bitmap);
diff --git
a/regression-test/data/inverted_index_p0/test_index_match_phrase_slop.out
b/regression-test/data/inverted_index_p0/test_index_match_phrase_slop.out
new file mode 100644
index 00000000000..c44a0c0c5dd
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_index_match_phrase_slop.out
@@ -0,0 +1,74 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !sql --
+0
+
+-- !sql --
+21
+
+-- !sql --
+21
+
+-- !sql --
+1
+
+-- !sql --
+125
+
+-- !sql --
+125
+
+-- !sql --
+0
+
+-- !sql --
+137
+
+-- !sql --
+137
+
+-- !sql --
+0
+
+-- !sql --
+80
+
+-- !sql --
+80
+
+-- !sql --
+12
+
+-- !sql --
+823
+
+-- !sql --
+823
+
+-- !sql --
+1 127.0.0.1 I'm glad I kept my fingers crossed ~4 1 1
+
+-- !sql --
+1 127.0.0.1 I'm glad I kept my fingers crossed ~4 1 1
+
+-- !sql --
+1 127.0.0.1 I'm glad I kept my fingers crossed ~4 1 1
+
+-- !sql --
+1 127.0.0.1 I'm glad I kept my fingers crossed ~4 1 1
+
+-- !sql --
+
+-- !sql --
+
+-- !sql --
+
+-- !sql --
+
+-- !sql --
+
+-- !sql --
+
+-- !sql --
+
+-- !sql --
+1 127.0.0.1 I'm glad I kept my fingers crossed ~4 1 1
diff --git
a/regression-test/suites/inverted_index_p0/test_index_match_phrase_slop.groovy
b/regression-test/suites/inverted_index_p0/test_index_match_phrase_slop.groovy
new file mode 100644
index 00000000000..05d57b5e961
--- /dev/null
+++
b/regression-test/suites/inverted_index_p0/test_index_match_phrase_slop.groovy
@@ -0,0 +1,122 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_index_match_phrase_slop", "p0"){ // p0 regression suite for the match_phrase " ~N" slop suffix
+ def indexTbName1 = "test_index_match_phrase_slop"
+ // Drop any leftover table, then create it again below.
+ sql "DROP TABLE IF EXISTS ${indexTbName1}"
+ // `request` gets an inverted index with the english parser and support_phrase enabled.
+ sql """
+ CREATE TABLE ${indexTbName1} (
+ `@timestamp` int(11) NULL COMMENT "",
+ `clientip` varchar(20) NULL COMMENT "",
+ `request` text NULL COMMENT "",
+ `status` int(11) NULL COMMENT "",
+ `size` int(11) NULL COMMENT "",
+ INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" =
"english", "support_phrase" = "true") COMMENT ''
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`@timestamp`)
+ COMMENT "OLAP"
+ DISTRIBUTED BY RANDOM BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1",
+ "disable_auto_compaction" = "true"
+ );
+ """
+ // Stream-load helper: sends file_name to table_name and asserts on the JSON load result.
+ def load_httplogs_data = {table_name, label, read_flag, format_flag,
file_name, ignore_failure=false,
+ expected_succ_rows = -1, load_to_single_tablet =
'true' ->
+ // NOTE(review): load_to_single_tablet is accepted but never read inside this closure.
+ // load the json data
+ streamLoad {
+ table "${table_name}"
+
+ // set http request header params
+ set 'label', label + "_" + UUID.randomUUID().toString()
+ set 'read_json_by_line', read_flag
+ set 'format', format_flag
+ file file_name // import json file
+ time 10000 // limit inflight 10s
+ if (expected_succ_rows >= 0) {
+ set 'max_filter_ratio', '1'
+ }
+ // With ignore_failure set and no expected row count, the callback below returns without checking anything.
+ // if declared a check callback, the default check condition will
ignore.
+ // So you must check all condition
+ check { result, exception, startTime, endTime ->
+ if (ignore_failure && expected_succ_rows < 0) { return }
+ if (exception != null) {
+ throw exception
+ }
+ log.info("Stream load result: ${result}".toString())
+ def json = parseJson(result)
+ assertEquals("success", json.Status.toLowerCase())
+ if (expected_succ_rows >= 0) {
+ assertEquals(json.NumberLoadedRows, expected_succ_rows)
+ } else {
+ assertEquals(json.NumberTotalRows,
json.NumberLoadedRows + json.NumberUnselectedRows)
+ assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes
> 0)
+ }
+ }
+ }
+ }
+ // Load documents-1000.json, then add one hand-written row whose text itself ends with "~4".
+ try {
+ load_httplogs_data.call(indexTbName1, 'test_index_match_phrase_slop',
'true', 'json', 'documents-1000.json')
+
+ sql """ INSERT INTO ${indexTbName1} VALUES (1, "127.0.0.1", "I'm glad
I kept my fingers crossed ~4", 1, 1); """
+
+ sql "sync"
+ // Each phrase runs once without slop, then twice with the same " ~N" suffix; expected counts are in the .out file.
+ qt_sql """ select count() from ${indexTbName1} where request
match_phrase 'get jpg'; """
+ qt_sql """ select count() from ${indexTbName1} where request
match_phrase 'get jpg ~2'; """
+ qt_sql """ select count() from ${indexTbName1} where request
match_phrase 'get jpg ~2'; """
+ // NOTE(review): the repeated query presumably exercises the inverted index query cache; confirm.
+ qt_sql """ select count() from ${indexTbName1} where request
match_phrase 'images bg'; """
+ qt_sql """ select count() from ${indexTbName1} where request
match_phrase 'images bg ~1'; """
+ qt_sql """ select count() from ${indexTbName1} where request
match_phrase 'images bg ~1'; """
+
+ qt_sql """ select count() from ${indexTbName1} where request
match_phrase 'images jpg'; """
+ qt_sql """ select count() from ${indexTbName1} where request
match_phrase 'images jpg ~2'; """
+ qt_sql """ select count() from ${indexTbName1} where request
match_phrase 'images jpg ~2'; """
+
+ qt_sql """ select count() from ${indexTbName1} where request
match_phrase 'french gif'; """
+ qt_sql """ select count() from ${indexTbName1} where request
match_phrase 'french gif ~4'; """
+ qt_sql """ select count() from ${indexTbName1} where request
match_phrase 'french gif ~4'; """
+
+ qt_sql """ select count() from ${indexTbName1} where request
match_phrase 'get http'; """
+ qt_sql """ select count() from ${indexTbName1} where request
match_phrase 'get http ~6'; """
+ qt_sql """ select count() from ${indexTbName1} where request
match_phrase 'get http ~6'; """
+ // Suffix-parsing edge cases: tilde glued to a word, trailing space or '.', repeated suffixes, escaped tilde.
+ qt_sql """ select * from ${indexTbName1} where request match_phrase
'crossed~4'; """
+ qt_sql """ select * from ${indexTbName1} where request match_phrase
'crossed ~4'; """
+ qt_sql """ select * from ${indexTbName1} where request match_phrase
'glad crossed ~4'; """
+ qt_sql """ select * from ${indexTbName1} where request match_phrase
'glad crossed ~4'; """
+ qt_sql """ select * from ${indexTbName1} where request match_phrase
'glad crossed ~4 '; """
+ qt_sql """ select * from ${indexTbName1} where request match_phrase
'glad crossed ~4.'; """
+ qt_sql """ select * from ${indexTbName1} where request match_phrase
'glad crossed~4'; """
+ qt_sql """ select * from ${indexTbName1} where request match_phrase
'glad~4crossed~4'; """
+ qt_sql """ select * from ${indexTbName1} where request match_phrase
'glad ~4 crossed~4'; """
+ qt_sql """ select * from ${indexTbName1} where request match_phrase
'glad ~4 crossed ~4'; """
+ qt_sql """ select * from ${indexTbName1} where request match_phrase
'glad crossed \\\\~4'; """
+ qt_sql """ select * from ${indexTbName1} where request match_phrase
'glad crossed \\~4'; """
+ // NOTE(review): the cleanup below is commented out and names testTable, which is not defined in this suite.
+ } finally {
+ //try_sql("DROP TABLE IF EXISTS ${testTable}")
+ }
+}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]