This is an automated email from the ASF dual-hosted git repository.
airborne pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new af63014eda4 [fix](inverted index) Fix for Inaccurate
match_phrase_prefix Cache in Query Processing (#46310)
af63014eda4 is described below
commit af63014eda4e06ea59479f4cf8650657f2f416cd
Author: zzzxl <[email protected]>
AuthorDate: Mon Jan 6 16:08:46 2025 +0800
[fix](inverted index) Fix for Inaccurate match_phrase_prefix Cache in Query
Processing (#46310)
Problem Summary:
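1. Results of match_phrase_prefix and match_regexp queries depend on
inverted_index_max_expansions, so the query cache key must include that
value; otherwise a result cached under one setting is returned for another.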
---
.../rowset/segment_v2/inverted_index_reader.cpp | 46 ++++++++++------------
.../test_index_match_phrase_prefix_1.out | 6 +++
.../inverted_index_p0/test_index_match_regexp.out | 6 +++
.../test_index_match_phrase_prefix_1.groovy | 9 +++++
.../test_index_match_regexp.groovy | 7 ++++
5 files changed, 49 insertions(+), 25 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index 4fe45283cd2..fced65724e5 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -259,7 +259,7 @@ Status InvertedIndexReader::match_index_search(
InvertedIndexQueryType query_type, const InvertedIndexQueryInfo&
query_info,
const FulltextIndexSearcherPtr& index_searcher,
const std::shared_ptr<roaring::Roaring>& term_match_bitmap) {
- TQueryOptions queryOptions = runtime_state->query_options();
+ const auto& queryOptions = runtime_state->query_options();
try {
SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
auto query = QueryFactory::create(query_type, index_searcher,
queryOptions, io_ctx);
@@ -294,24 +294,23 @@ Status FullTextIndexReader::query(const io::IOContext*
io_ctx, OlapReaderStatist
VLOG_DEBUG << column_name << " begin to search the fulltext index from
clucene, query_str ["
<< search_str << "]";
+ const auto& queryOptions = runtime_state->query_options();
try {
InvertedIndexQueryInfo query_info;
InvertedIndexQueryCache::CacheKey cache_key;
auto index_file_key =
_inverted_index_file_reader->get_index_file_cache_key(&_index_meta);
+ // terms
if (query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY) {
- cache_key = {index_file_key, column_name, query_type, search_str};
query_info.terms.emplace_back(search_str);
+ } else if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
+ PhraseQuery::parser_info(search_str, column_name, query_type,
_index_meta.properties(),
+ query_info,
queryOptions.enable_phrase_query_sequential_opt);
} else {
- if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
- PhraseQuery::parser_info(
- search_str, column_name, query_type,
_index_meta.properties(), query_info,
-
runtime_state->query_options().enable_phrase_query_sequential_opt);
- } else {
- query_info.terms =
inverted_index::InvertedIndexAnalyzer::get_analyse_result(
- search_str, column_name, query_type,
_index_meta.properties());
- }
+ query_info.terms =
inverted_index::InvertedIndexAnalyzer::get_analyse_result(
+ search_str, column_name, query_type,
_index_meta.properties());
}
+
if (query_info.terms.empty()) {
auto msg = fmt::format(
"token parser result is empty for query, "
@@ -325,22 +324,20 @@ Status FullTextIndexReader::query(const io::IOContext*
io_ctx, OlapReaderStatist
}
}
- std::unique_ptr<lucene::search::Query> query;
+ // field_name
query_info.field_name = StringUtil::string_to_wstring(column_name);
- if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY ||
- query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY ||
- query_type == InvertedIndexQueryType::MATCH_PHRASE_EDGE_QUERY ||
- query_type == InvertedIndexQueryType::MATCH_ALL_QUERY ||
- query_type == InvertedIndexQueryType::EQUAL_QUERY ||
- query_type == InvertedIndexQueryType::MATCH_ANY_QUERY) {
- std::string str_tokens = join(query_info.terms, " ");
- if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
- str_tokens += " " + std::to_string(query_info.slop);
- str_tokens += " " + std::to_string(query_info.ordered);
- }
- cache_key = {index_file_key, column_name, query_type, str_tokens};
+ // cache_key
+ std::string str_tokens = join(query_info.terms, " ");
+ if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
+ str_tokens += " " + std::to_string(query_info.slop);
+ str_tokens += " " + std::to_string(query_info.ordered);
+ } else if (query_type ==
InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY ||
+ query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY) {
+ str_tokens += " " +
std::to_string(queryOptions.inverted_index_max_expansions);
}
+ cache_key = {index_file_key, column_name, query_type,
std::move(str_tokens)};
+
auto* cache = InvertedIndexQueryCache::instance();
InvertedIndexQueryCacheHandle cache_handler;
@@ -350,13 +347,12 @@ Status FullTextIndexReader::query(const io::IOContext*
io_ctx, OlapReaderStatist
if (cache_status.ok()) {
return Status::OK();
}
- FulltextIndexSearcherPtr* searcher_ptr = nullptr;
InvertedIndexCacheHandle inverted_index_cache_handle;
RETURN_IF_ERROR(
handle_searcher_cache(runtime_state,
&inverted_index_cache_handle, io_ctx, stats));
auto searcher_variant =
inverted_index_cache_handle.get_index_searcher();
- searcher_ptr =
std::get_if<FulltextIndexSearcherPtr>(&searcher_variant);
+ auto* searcher_ptr =
std::get_if<FulltextIndexSearcherPtr>(&searcher_variant);
if (searcher_ptr != nullptr) {
term_match_bitmap = std::make_shared<roaring::Roaring>();
RETURN_IF_ERROR(match_index_search(io_ctx, stats, runtime_state,
query_type, query_info,
diff --git
a/regression-test/data/inverted_index_p0/test_index_match_phrase_prefix_1.out
b/regression-test/data/inverted_index_p0/test_index_match_phrase_prefix_1.out
index e7e934f394e..7ac0d5f7ec2 100644
---
a/regression-test/data/inverted_index_p0/test_index_match_phrase_prefix_1.out
+++
b/regression-test/data/inverted_index_p0/test_index_match_phrase_prefix_1.out
@@ -11,3 +11,9 @@
-- !sql --
6
+-- !sql --
+0
+
+-- !sql --
+1
+
diff --git a/regression-test/data/inverted_index_p0/test_index_match_regexp.out
b/regression-test/data/inverted_index_p0/test_index_match_regexp.out
index fb5d23ad266..2c06da4147c 100644
--- a/regression-test/data/inverted_index_p0/test_index_match_regexp.out
+++ b/regression-test/data/inverted_index_p0/test_index_match_regexp.out
@@ -20,3 +20,9 @@
-- !sql --
0
+-- !sql --
+4
+
+-- !sql --
+377
+
diff --git
a/regression-test/suites/inverted_index_p0/test_index_match_phrase_prefix_1.groovy
b/regression-test/suites/inverted_index_p0/test_index_match_phrase_prefix_1.groovy
index f42462f12a6..5ee38c9e403 100644
---
a/regression-test/suites/inverted_index_p0/test_index_match_phrase_prefix_1.groovy
+++
b/regression-test/suites/inverted_index_p0/test_index_match_phrase_prefix_1.groovy
@@ -47,6 +47,8 @@ suite("test_index_match_phrase_prefix_1", "nonConcurrent"){
sql """ INSERT INTO ${indexTbName1} VALUES (6, "O1704361998540E2Cemx9S
123456789", "O1704361998540E2Cemx9S 123456789", "O1704361998540E2Cemx9S
123456789"); """
sql """ INSERT INTO ${indexTbName1} VALUES (7,
"O1704361998540E2Cemx9S*123456789", "O1704361998540E2Cemx9S*123456789",
"O1704361998540E2Cemx9S*123456789"); """
+ sql """ INSERT INTO ${indexTbName1} VALUES (1, "", "s1", ""), (2, "",
"s2", ""), (3, "", "s3", ""), (4, "", "s4", ""), (5, "", "tv s5", ""); """
+
try {
sql "sync"
sql """ set enable_common_expr_pushdown = true; """
@@ -58,7 +60,14 @@ suite("test_index_match_phrase_prefix_1", "nonConcurrent"){
qt_sql """ select count() from ${indexTbName1} where c
match_phrase_prefix 'O1704361998540E2Cemx9S=123456789'; """
qt_sql """ select count() from ${indexTbName1} where d
match_phrase_prefix 'O1704361998540E2Cemx9S=123456789'; """
+ sql """ set inverted_index_max_expansions = 3; """
+ qt_sql """ select count() from ${indexTbName1} where c
match_phrase_prefix 'tv s'; """
+
+ sql """ set inverted_index_max_expansions = 5; """
+ qt_sql """ select count() from ${indexTbName1} where c
match_phrase_prefix 'tv s'; """
+
} finally {
+ sql """ set inverted_index_max_expansions = 50; """
GetDebugPoint().disableDebugPointForAllBEs("VMatchPredicate.execute")
}
}
\ No newline at end of file
diff --git
a/regression-test/suites/inverted_index_p0/test_index_match_regexp.groovy
b/regression-test/suites/inverted_index_p0/test_index_match_regexp.groovy
index 49f0f563989..1f508306dbb 100644
--- a/regression-test/suites/inverted_index_p0/test_index_match_regexp.groovy
+++ b/regression-test/suites/inverted_index_p0/test_index_match_regexp.groovy
@@ -90,7 +90,14 @@ suite("test_index_match_regexp", "nonConcurrent"){
qt_sql """ select count() from test_index_match_regexp where request
match_regexp '.*tickets.*'; """
qt_sql """ select count() from test_index_match_regexp where request
match_regexp 'nonexistence'; """
+ sql """ set inverted_index_max_expansions = 1; """
+ qt_sql """ select count() from test_index_match_regexp where request
match_regexp 'b'; """
+
+ sql """ set inverted_index_max_expansions = 50; """
+ qt_sql """ select count() from test_index_match_regexp where request
match_regexp 'b'; """
+
} finally {
+ sql """ set inverted_index_max_expansions = 50; """
GetDebugPoint().disableDebugPointForAllBEs("VMatchPredicate.execute")
}
}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]