This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 6fdce0ddd8f [feature](inverted index) String type inverted index match
function completion (#38170)
6fdce0ddd8f is described below
commit 6fdce0ddd8f362f939a177815ed3702f334253d3
Author: zzzxl <[email protected]>
AuthorDate: Wed Jul 31 12:18:47 2024 +0800
[feature](inverted index) String type inverted index match function
completion (#38170)
1. Inverted index of string type supports match_phrase_prefix and
match_regexp.
---
be/src/olap/match_predicate.cpp | 21 ++--
be/src/olap/match_predicate.h | 2 +-
.../rowset/segment_v2/inverted_index_reader.cpp | 103 +++++++--------
.../olap/rowset/segment_v2/inverted_index_reader.h | 13 +-
.../inverted_index_p0/test_index_complex_match.out | 19 +++
.../test_index_complex_match.groovy | 138 +++++++++++++++++++++
.../inverted_index_p0/test_no_index_match.groovy | 4 +-
7 files changed, 224 insertions(+), 76 deletions(-)
diff --git a/be/src/olap/match_predicate.cpp b/be/src/olap/match_predicate.cpp
index 0332e3f2e31..683e38775f3 100644
--- a/be/src/olap/match_predicate.cpp
+++ b/be/src/olap/match_predicate.cpp
@@ -51,9 +51,9 @@ Status MatchPredicate::evaluate(const
vectorized::IndexFieldNameAndTypePair& nam
if (iterator == nullptr) {
return Status::OK();
}
- if (_skip_evaluate(iterator)) {
- return Status::Error<ErrorCode::INVERTED_INDEX_EVALUATE_SKIPPED>(
- "match predicate evaluate skipped.");
+ if (_check_evaluate(iterator)) {
+ return Status::Error<ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>(
+ "phrase queries require setting support_phrase = true");
}
auto type = name_with_type.second;
const std::string& name = name_with_type.first;
@@ -122,13 +122,14 @@ InvertedIndexQueryType
MatchPredicate::_to_inverted_index_query_type(MatchType m
return ret;
}
-bool MatchPredicate::_skip_evaluate(InvertedIndexIterator* iterator) const {
- if ((_match_type == MatchType::MATCH_PHRASE || _match_type ==
MatchType::MATCH_PHRASE_PREFIX ||
- _match_type == MatchType::MATCH_PHRASE_EDGE) &&
- iterator->get_inverted_index_reader_type() ==
InvertedIndexReaderType::FULLTEXT &&
-
get_parser_phrase_support_string_from_properties(iterator->get_index_properties())
==
- INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO) {
- return true;
+bool MatchPredicate::_check_evaluate(InvertedIndexIterator* iterator) const {
+ if (_match_type == MatchType::MATCH_PHRASE || _match_type ==
MatchType::MATCH_PHRASE_PREFIX ||
+ _match_type == MatchType::MATCH_PHRASE_EDGE) {
+ if (iterator->get_inverted_index_reader_type() ==
InvertedIndexReaderType::FULLTEXT &&
+
get_parser_phrase_support_string_from_properties(iterator->get_index_properties())
==
+ INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO) {
+ return true;
+ }
}
return false;
}
diff --git a/be/src/olap/match_predicate.h b/be/src/olap/match_predicate.h
index 17d8e76ac88..ad202b7b242 100644
--- a/be/src/olap/match_predicate.h
+++ b/be/src/olap/match_predicate.h
@@ -79,7 +79,7 @@ private:
std::string info = "MatchPredicate";
return info;
}
- bool _skip_evaluate(InvertedIndexIterator* iterator) const;
+ bool _check_evaluate(InvertedIndexIterator* iterator) const;
private:
std::string _value;
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index d89d089de3b..2ac283e6e34 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -284,6 +284,27 @@ Status
InvertedIndexReader::create_index_searcher(lucene::store::Directory* dir,
return Status::OK();
};
+Status InvertedIndexReader::match_index_search(
+ OlapReaderStatistics* stats, RuntimeState* runtime_state,
InvertedIndexQueryType query_type,
+ const InvertedIndexQueryInfo& query_info, const
FulltextIndexSearcherPtr& index_searcher,
+ const std::shared_ptr<roaring::Roaring>& term_match_bitmap) {
+ TQueryOptions queryOptions = runtime_state->query_options();
+ try {
+ SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
+ auto query = QueryFactory::create(query_type, index_searcher,
queryOptions);
+ if (!query) {
+ return Status::Error<ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>(
+ "query type " + query_type_to_string(query_type) + ",
query is nullptr");
+ }
+ query->add(query_info);
+ query->search(*term_match_bitmap);
+ } catch (const CLuceneError& e) {
+ return
Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>("CLuceneError occured:
{}",
+
e.what());
+ }
+ return Status::OK();
+}
+
Status FullTextIndexReader::new_iterator(OlapReaderStatistics* stats,
RuntimeState* runtime_state,
std::unique_ptr<InvertedIndexIterator>* iterator) {
*iterator = InvertedIndexIterator::create_unique(stats, runtime_state,
shared_from_this());
@@ -384,27 +405,6 @@ Status FullTextIndexReader::query(OlapReaderStatistics*
stats, RuntimeState* run
}
}
-Status FullTextIndexReader::match_index_search(
- OlapReaderStatistics* stats, RuntimeState* runtime_state,
InvertedIndexQueryType query_type,
- const InvertedIndexQueryInfo& query_info, const
FulltextIndexSearcherPtr& index_searcher,
- const std::shared_ptr<roaring::Roaring>& term_match_bitmap) {
- TQueryOptions queryOptions = runtime_state->query_options();
- try {
- SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
- auto query = QueryFactory::create(query_type, index_searcher,
queryOptions);
- if (!query) {
- return Status::Error<ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>(
- "query type " + query_type_to_string(query_type) + ",
query is nullptr");
- }
- query->add(query_info);
- query->search(*term_match_bitmap);
- } catch (const CLuceneError& e) {
- return
Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>("CLuceneError occured:
{}",
-
e.what());
- }
- return Status::OK();
-}
-
InvertedIndexReaderType FullTextIndexReader::type() {
return InvertedIndexReaderType::FULLTEXT;
}
@@ -461,28 +461,25 @@ Status
StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats,
std::string search_str(search_query->data, act_len);
VLOG_DEBUG << "begin to query the inverted index from clucene"
<< ", column_name: " << column_name << ", search_str: " <<
search_str;
- std::wstring column_name_ws = StringUtil::string_to_wstring(column_name);
- std::wstring search_str_ws = StringUtil::string_to_wstring(search_str);
- // unique_ptr with custom deleter
- std::unique_ptr<lucene::index::Term, void (*)(lucene::index::Term*)> term {
- _CLNEW lucene::index::Term(column_name_ws.c_str(),
search_str_ws.c_str()),
- [](lucene::index::Term* term) { _CLDECDELETE(term); }};
- std::unique_ptr<lucene::search::Query> query;
auto index_file_key =
_inverted_index_file_reader->get_index_file_cache_key(&_index_meta);
-
// try to get query bitmap result from cache and return immediately on
cache hit
InvertedIndexQueryCache::CacheKey cache_key {index_file_key, column_name,
query_type,
search_str};
auto* cache = InvertedIndexQueryCache::instance();
InvertedIndexQueryCacheHandle cache_handler;
-
auto cache_status = handle_query_cache(cache, cache_key, &cache_handler,
stats, bit_map);
if (cache_status.ok()) {
return Status::OK();
}
- roaring::Roaring result;
+ std::wstring column_name_ws = StringUtil::string_to_wstring(column_name);
+
+ InvertedIndexQueryInfo query_info;
+ query_info.field_name = column_name_ws;
+ query_info.terms.emplace_back(search_str);
+
+ auto result = std::make_shared<roaring::Roaring>();
FulltextIndexSearcherPtr* searcher_ptr = nullptr;
InvertedIndexCacheHandle inverted_index_cache_handle;
RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle,
stats));
@@ -494,33 +491,29 @@ Status
StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats,
case InvertedIndexQueryType::MATCH_ANY_QUERY:
case InvertedIndexQueryType::MATCH_ALL_QUERY:
case InvertedIndexQueryType::EQUAL_QUERY: {
- query =
std::make_unique<lucene::search::TermQuery>(term.get());
- SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
- (*searcher_ptr)->_search(query.get(), [&result](DocRange*
doc_range) {
- if (doc_range->type_ == DocRangeType::kMany) {
- result.addMany(doc_range->doc_many_size_,
doc_range->doc_many->data());
- } else {
- result.addRange(doc_range->doc_range.first,
doc_range->doc_range.second);
- }
- });
+ RETURN_IF_ERROR(match_index_search(stats, runtime_state,
+
InvertedIndexQueryType::MATCH_ANY_QUERY,
+ query_info, *searcher_ptr,
result));
break;
}
- case InvertedIndexQueryType::MATCH_PHRASE_QUERY: {
- query =
std::make_unique<lucene::search::TermQuery>(term.get());
- SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
- (*searcher_ptr)
- ->_search(query.get(),
- [&result](const int32_t docid, const float_t
/*score*/) {
- // docid equal to rowid in segment
- result.add(docid);
- });
+ case InvertedIndexQueryType::MATCH_PHRASE_QUERY:
+ case InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY:
+ case InvertedIndexQueryType::MATCH_REGEXP_QUERY: {
+ RETURN_IF_ERROR(match_index_search(stats, runtime_state,
query_type, query_info,
+ *searcher_ptr, result));
break;
}
-
case InvertedIndexQueryType::LESS_THAN_QUERY:
case InvertedIndexQueryType::LESS_EQUAL_QUERY:
case InvertedIndexQueryType::GREATER_THAN_QUERY:
case InvertedIndexQueryType::GREATER_EQUAL_QUERY: {
+ std::wstring search_str_ws =
StringUtil::string_to_wstring(search_str);
+ // unique_ptr with custom deleter
+ std::unique_ptr<lucene::index::Term, void
(*)(lucene::index::Term*)> term {
+ _CLNEW lucene::index::Term(column_name_ws.c_str(),
search_str_ws.c_str()),
+ [](lucene::index::Term* term) { _CLDECDELETE(term); }};
+ std::unique_ptr<lucene::search::Query> query;
+
bool include_upper = query_type ==
InvertedIndexQueryType::LESS_EQUAL_QUERY;
bool include_lower = query_type ==
InvertedIndexQueryType::GREATER_EQUAL_QUERY;
@@ -537,7 +530,7 @@ Status
StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats,
(*searcher_ptr)
->_search(query.get(),
[&result](const int32_t docid, const float_t
/*score*/) {
- result.add(docid);
+ result->add(docid);
});
break;
}
@@ -560,12 +553,10 @@ Status
StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats,
}
// add to cache
- std::shared_ptr<roaring::Roaring> term_match_bitmap =
- std::make_shared<roaring::Roaring>(result);
- term_match_bitmap->runOptimize();
- cache->insert(cache_key, term_match_bitmap, &cache_handler);
+ result->runOptimize();
+ cache->insert(cache_key, result, &cache_handler);
- bit_map = term_match_bitmap;
+ bit_map = result;
}
return Status::OK();
}
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h
b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
index 92df87361c8..a598ccc9ee7 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
@@ -141,6 +141,12 @@ public:
InvertedIndexReaderType reader_type);
protected:
+ Status match_index_search(OlapReaderStatistics* stats, RuntimeState*
runtime_state,
+ InvertedIndexQueryType query_type,
+ const InvertedIndexQueryInfo& query_info,
+ const FulltextIndexSearcherPtr& index_searcher,
+ const std::shared_ptr<roaring::Roaring>&
term_match_bitmap);
+
friend class InvertedIndexIterator;
std::shared_ptr<InvertedIndexFileReader> _inverted_index_file_reader;
TabletIndex _index_meta;
@@ -177,13 +183,6 @@ public:
const std::map<string, string>&
properties);
static void
setup_analyzer_use_stopwords(std::unique_ptr<lucene::analysis::Analyzer>&
analyzer,
const std::map<string, string>&
properties);
-
-private:
- Status match_index_search(OlapReaderStatistics* stats, RuntimeState*
runtime_state,
- InvertedIndexQueryType query_type,
- const InvertedIndexQueryInfo& query_info,
- const FulltextIndexSearcherPtr& index_searcher,
- const std::shared_ptr<roaring::Roaring>&
term_match_bitmap);
};
class StringTypeInvertedIndexReader : public InvertedIndexReader {
diff --git
a/regression-test/data/inverted_index_p0/test_index_complex_match.out
b/regression-test/data/inverted_index_p0/test_index_complex_match.out
new file mode 100644
index 00000000000..5c3636700dd
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_index_complex_match.out
@@ -0,0 +1,19 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !sql --
+22
+
+-- !sql --
+22
+
+-- !sql --
+270
+
+-- !sql --
+210
+
+-- !sql --
+180
+
+-- !sql --
+875
+
diff --git
a/regression-test/suites/inverted_index_p0/test_index_complex_match.groovy
b/regression-test/suites/inverted_index_p0/test_index_complex_match.groovy
new file mode 100644
index 00000000000..191e147685d
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/test_index_complex_match.groovy
@@ -0,0 +1,138 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_index_complex_match", "p0"){
+ def indexTbName1 = "test_index_complex_match_1"
+ def indexTbName2 = "test_index_complex_match_2"
+
+ sql "DROP TABLE IF EXISTS ${indexTbName1}"
+ sql "DROP TABLE IF EXISTS ${indexTbName2}"
+
+ sql """
+ CREATE TABLE ${indexTbName1} (
+ `@timestamp` int(11) NULL COMMENT "",
+ `clientip` varchar(20) NULL COMMENT "",
+ `request` text NULL COMMENT "",
+ `status` int(11) NULL COMMENT "",
+ `size` int(11) NULL COMMENT "",
+ INDEX idx_1 (`clientip`) USING INVERTED COMMENT '',
+ INDEX idx_2 (`request`) USING INVERTED PROPERTIES("parser" = "english")
COMMENT ''
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`@timestamp`)
+ COMMENT "OLAP"
+ DISTRIBUTED BY RANDOM BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+
+ sql """
+ CREATE TABLE ${indexTbName2} (
+ `@timestamp` int(11) NULL COMMENT "",
+ `clientip` varchar(20) NULL COMMENT "",
+ `request` text NULL COMMENT "",
+ `status` int(11) NULL COMMENT "",
+ `size` int(11) NULL COMMENT "",
+ INDEX idx_1 (`clientip`) USING INVERTED COMMENT '',
+ INDEX idx_2 (`request`) USING INVERTED PROPERTIES("parser" = "english",
"support_phrase" = "false") COMMENT ''
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`@timestamp`)
+ COMMENT "OLAP"
+ DISTRIBUTED BY RANDOM BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+
+ def load_httplogs_data = {table_name, label, read_flag, format_flag,
file_name, ignore_failure=false,
+ expected_succ_rows = -1, load_to_single_tablet =
'true' ->
+
+ // load the json data
+ streamLoad {
+ table "${table_name}"
+
+ // set http request header params
+ set 'label', label + "_" + UUID.randomUUID().toString()
+ set 'read_json_by_line', read_flag
+ set 'format', format_flag
+ file file_name // import json file
+ time 10000 // limit inflight 10s
+ if (expected_succ_rows >= 0) {
+ set 'max_filter_ratio', '1'
+ }
+
+ // if declared a check callback, the default check condition will
ignore.
+ // So you must check all condition
+ check { result, exception, startTime, endTime ->
+ if (ignore_failure && expected_succ_rows < 0) { return }
+ if (exception != null) {
+ throw exception
+ }
+ log.info("Stream load result: ${result}".toString())
+ def json = parseJson(result)
+ assertEquals("success", json.Status.toLowerCase())
+ if (expected_succ_rows >= 0) {
+ assertEquals(json.NumberLoadedRows, expected_succ_rows)
+ } else {
+ assertEquals(json.NumberTotalRows,
json.NumberLoadedRows + json.NumberUnselectedRows)
+ assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes
> 0)
+ }
+ }
+ }
+ }
+
+ try {
+ load_httplogs_data.call(indexTbName1, indexTbName1, 'true', 'json',
'documents-1000.json')
+ load_httplogs_data.call(indexTbName2, indexTbName2, 'true', 'json',
'documents-1000.json')
+
+ sql "sync"
+
+ qt_sql """ select count() from ${indexTbName1} where clientip
match_phrase '247.37.0.0'; """
+ qt_sql """ select count() from ${indexTbName1} where clientip
match_phrase_prefix '247'; """
+ qt_sql """ select count() from ${indexTbName1} where clientip
match_regexp '2'; """
+
+ qt_sql """ select count() from ${indexTbName1} where request
match_phrase 'hm'; """
+ qt_sql """ select count() from ${indexTbName1} where request
match_phrase_prefix 'na'; """
+ qt_sql """ select count() from ${indexTbName1} where request
match_regexp 'ag'; """
+
+
+ try {
+ sql """ select count() from ${indexTbName2} where request
match_phrase 'hm'; """
+ } catch (Exception e) {
+ log.info(e.getMessage());
+ assertTrue(e.getMessage().contains("phrase queries require setting
support_phrase = true"))
+ }
+
+ try {
+ sql """ select count() from ${indexTbName2} where request
match_phrase_prefix 'na'; """
+ } catch (Exception e) {
+ log.info(e.getMessage());
+ assertTrue(e.getMessage().contains("phrase queries require setting
support_phrase = true"))
+ }
+
+ try {
+ sql """ select count() from ${indexTbName2} where request
match_regexp 'ag'; """
+ } catch (Exception e) {
+ log.info(e.getMessage());
+ assertTrue(e.getMessage().contains("phrase queries require setting
support_phrase = true"))
+ }
+
+ } finally {
+ //try_sql("DROP TABLE IF EXISTS ${testTable}")
+ }
+}
\ No newline at end of file
diff --git
a/regression-test/suites/inverted_index_p0/test_no_index_match.groovy
b/regression-test/suites/inverted_index_p0/test_no_index_match.groovy
index cfa94b514a7..60227b01168 100644
--- a/regression-test/suites/inverted_index_p0/test_no_index_match.groovy
+++ b/regression-test/suites/inverted_index_p0/test_no_index_match.groovy
@@ -98,14 +98,14 @@ suite("test_no_index_match", "p0") {
}
try {
- """ select /*+ SET_VAR(enable_match_without_inverted_index = 0) */
count() from ${testTable_unique} where (request match_phrase 'hm bg'); """
+ sql """ select /*+ SET_VAR(enable_match_without_inverted_index = 0)
*/ count() from ${testTable_unique} where (request match_phrase 'hm bg'); """
} catch (Exception e) {
log.info(e.getMessage());
assertTrue(e.getMessage().contains("match_phrase not support
execute_match"))
}
try {
- """ select /*+ SET_VAR(enable_match_without_inverted_index = 0) */
count() from ${testTable_unique} where (request match_phrase_prefix 'hm b');
"""
+ sql """ select /*+ SET_VAR(enable_match_without_inverted_index = 0)
*/ count() from ${testTable_unique} where (request match_phrase_prefix 'hm b');
"""
} catch (Exception e) {
log.info(e.getMessage());
assertTrue(e.getMessage().contains("match_phrase_prefix not support
execute_match"))
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]