This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 153c7982f3 [Optimize](invert index) Optimize multiple terms
conjunction query (#23871)
153c7982f3 is described below
commit 153c7982f3b8dd1ffd715e52ced775a917544594
Author: zzzxl <[email protected]>
AuthorDate: Sat Sep 9 01:52:58 2023 +0800
[Optimize](invert index) Optimize multiple terms conjunction query (#23871)
---
be/src/clucene | 2 +-
be/src/olap/rowset/segment_v2/column_reader.cpp | 6 +-
be/src/olap/rowset/segment_v2/column_reader.h | 4 +-
.../inverted_index/query/conjunction_query.cpp | 169 ++++++++++++++
.../inverted_index/query/conjunction_query.h | 63 +++++
.../olap/rowset/segment_v2/inverted_index_cache.h | 6 +
.../segment_v2/inverted_index_compaction.cpp | 3 +-
.../rowset/segment_v2/inverted_index_reader.cpp | 256 +++++++++++----------
.../olap/rowset/segment_v2/inverted_index_reader.h | 58 +++--
.../rowset/segment_v2/inverted_index_writer.cpp | 13 +-
be/src/olap/rowset/segment_v2/segment.cpp | 4 +-
be/src/olap/rowset/segment_v2/segment.h | 3 +-
be/src/olap/rowset/segment_v2/segment_iterator.cpp | 2 +-
.../java/org/apache/doris/qe/SessionVariable.java | 13 ++
gensrc/thrift/PaloInternalService.thrift | 2 +
15 files changed, 450 insertions(+), 154 deletions(-)
diff --git a/be/src/clucene b/be/src/clucene
index 9e60ec666b..fd45366505 160000
--- a/be/src/clucene
+++ b/be/src/clucene
@@ -1 +1 @@
-Subproject commit 9e60ec666b3ccf7dd8b7c3e331ac03ccf87d5845
+Subproject commit fd453665055c65b94892d13a93ac47180afd72bb
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp
b/be/src/olap/rowset/segment_v2/column_reader.cpp
index 48ac67a51f..17a3136231 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -32,6 +32,7 @@
#include "olap/column_predicate.h"
#include "olap/decimal12.h"
#include "olap/inverted_index_parser.h"
+#include "olap/iterators.h"
#include "olap/olap_common.h"
#include "olap/rowset/segment_v2/binary_dict_page.h" // for
BinaryDictPageDecoder
#include "olap/rowset/segment_v2/binary_plain_page.h"
@@ -242,11 +243,12 @@ Status
ColumnReader::new_bitmap_index_iterator(BitmapIndexIterator** iterator) {
}
Status ColumnReader::new_inverted_index_iterator(const TabletIndex* index_meta,
- OlapReaderStatistics* stats,
+ const StorageReadOptions&
read_options,
std::unique_ptr<InvertedIndexIterator>* iterator) {
RETURN_IF_ERROR(_ensure_inverted_index_loaded(index_meta));
if (_inverted_index) {
- RETURN_IF_ERROR(_inverted_index->new_iterator(stats, iterator));
+ RETURN_IF_ERROR(_inverted_index->new_iterator(read_options.stats,
+
read_options.runtime_state, iterator));
}
return Status::OK();
}
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h
b/be/src/olap/rowset/segment_v2/column_reader.h
index 52b6092f92..41a1caf2b8 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -53,6 +53,7 @@ class WrapperField;
class AndBlockColumnPredicate;
class ColumnPredicate;
class TabletIndex;
+class StorageReadOptions;
namespace io {
class FileReader;
@@ -119,7 +120,8 @@ public:
// Client should delete returned iterator
Status new_bitmap_index_iterator(BitmapIndexIterator** iterator);
- Status new_inverted_index_iterator(const TabletIndex* index_meta,
OlapReaderStatistics* stats,
+ Status new_inverted_index_iterator(const TabletIndex* index_meta,
+ const StorageReadOptions& read_options,
std::unique_ptr<InvertedIndexIterator>*
iterator);
// Seek to the first entry in the column.
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp
new file mode 100644
index 0000000000..90909045cc
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp
@@ -0,0 +1,169 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "conjunction_query.h"
+
+#include <cstdint>
+
+namespace doris {
+
+ConjunctionQuery::ConjunctionQuery(IndexReader* reader)
+ : _reader(reader), _index_version(reader->getIndexVersion()) {}
+
+ConjunctionQuery::~ConjunctionQuery() {
+ for (auto& term : _terms) {
+ if (term) {
+ _CLDELETE(term);
+ }
+ }
+ for (auto& term_doc : _term_docs) {
+ if (term_doc) {
+ _CLDELETE(term_doc);
+ }
+ }
+}
+
+void ConjunctionQuery::add(const std::wstring& field_name,
+ const std::vector<std::wstring>& wterms) {
+ if (wterms.size() < 1) {
+ _CLTHROWA(CL_ERR_IllegalArgument, "ConjunctionQuery::add: terms.size()
< 1");
+ }
+
+ std::vector<TermIterator> iterators;
+ for (auto& wterm : wterms) {
+ Term* t = _CLNEW Term(field_name.c_str(), wterm.c_str());
+ _terms.push_back(t);
+ TermDocs* term_doc = _reader->termDocs(t);
+ _term_docs.push_back(term_doc);
+ iterators.emplace_back(term_doc);
+ }
+
+ std::sort(iterators.begin(), iterators.end(), [](const TermIterator& a,
const TermIterator& b) {
+ return a.docFreq() < b.docFreq();
+ });
+
+ if (iterators.size() == 1) {
+ _lead1 = iterators[0];
+ } else {
+ _lead1 = iterators[0];
+ _lead2 = iterators[1];
+ for (int32_t i = 2; i < _terms.size(); i++) {
+ _others.push_back(iterators[i]);
+ }
+ }
+
+ if (_index_version == IndexVersion::kV1 && iterators.size() >= 2) {
+ int32_t little = iterators[0].docFreq();
+ int32_t big = iterators[iterators.size() - 1].docFreq();
+ if (little == 0 || (big / little) > _conjunction_ratio) {
+ _use_skip = true;
+ }
+ }
+}
+
+void ConjunctionQuery::search(roaring::Roaring& roaring) {
+ if (_lead1.isEmpty()) {
+ return;
+ }
+
+ if (!_use_skip) {
+ search_by_bitmap(roaring);
+ return;
+ }
+
+ search_by_skiplist(roaring);
+}
+
+void ConjunctionQuery::search_by_bitmap(roaring::Roaring& roaring) {
+ // can get a term of all docid
+ auto func = [&roaring](const TermIterator& term_docs, bool first) {
+ roaring::Roaring result;
+ DocRange doc_range;
+ while (term_docs.readRange(&doc_range)) {
+ if (doc_range.type_ == DocRangeType::kMany) {
+ result.addMany(doc_range.doc_many_size_,
doc_range.doc_many->data());
+ } else {
+ result.addRange(doc_range.doc_range.first,
doc_range.doc_range.second);
+ }
+ }
+ if (first) {
+ roaring.swap(result);
+ } else {
+ roaring &= result;
+ }
+ };
+
+ // fill the bitmap for the first time
+ func(_lead1, true);
+
+ // the second inverted list may be empty
+ if (!_lead2.isEmpty()) {
+ func(_lead2, false);
+ }
+
+ // The inverted index iterators contained in the _others array must not be
empty
+ for (auto& other : _others) {
+ func(other, false);
+ }
+}
+
+void ConjunctionQuery::search_by_skiplist(roaring::Roaring& roaring) {
+ int32_t doc = 0;
+ int32_t first_doc = _lead1.nextDoc();
+ while ((doc = do_next(first_doc)) != INT32_MAX) {
+ roaring.add(doc);
+ }
+}
+
+int32_t ConjunctionQuery::do_next(int32_t doc) {
+ while (true) {
+ assert(doc == _lead1.docID());
+
+ // the skip list is used to find the two smallest inverted lists
+ int32_t next2 = _lead2.advance(doc);
+ if (next2 != doc) {
+ doc = _lead1.advance(next2);
+ if (next2 != doc) {
+ continue;
+ }
+ }
+
+ // if both lead1 and lead2 exist, use skip list to lookup other
inverted indexes
+ bool advance_head = false;
+ for (auto& other : _others) {
+ if (other.isEmpty()) {
+ continue;
+ }
+
+ if (other.docID() < doc) {
+ int32_t next = other.advance(doc);
+ if (next > doc) {
+ doc = _lead1.advance(next);
+ advance_head = true;
+ break;
+ }
+ }
+ }
+ if (advance_head) {
+ continue;
+ }
+
+ return doc;
+ }
+}
+
+} // namespace doris
\ No newline at end of file
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h
b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h
new file mode 100644
index 0000000000..bffb12ffb2
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h
@@ -0,0 +1,63 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <CLucene.h>
+#include <CLucene/index/IndexReader.h>
+#include <CLucene/index/IndexVersion.h>
+#include <CLucene/index/Term.h>
+#include <CLucene/search/query/TermIterator.h>
+
+#include "roaring/roaring.hh"
+
+CL_NS_USE(index)
+
+namespace doris {
+
+class ConjunctionQuery {
+public:
+ ConjunctionQuery(IndexReader* reader);
+ ~ConjunctionQuery();
+
+ void set_conjunction_ratio(int32_t conjunction_ratio) {
+ _conjunction_ratio = conjunction_ratio;
+ }
+
+ void add(const std::wstring& field_name, const std::vector<std::wstring>&
wterms);
+ void search(roaring::Roaring& roaring);
+
+private:
+ void search_by_bitmap(roaring::Roaring& roaring);
+ void search_by_skiplist(roaring::Roaring& roaring);
+
+ int32_t do_next(int32_t doc);
+
+ IndexReader* _reader = nullptr;
+ IndexVersion _index_version = IndexVersion::kV0;
+ int32_t _conjunction_ratio = 1000;
+ bool _use_skip = false;
+
+ TermIterator _lead1;
+ TermIterator _lead2;
+ std::vector<TermIterator> _others;
+
+ std::vector<Term*> _terms;
+ std::vector<TermDocs*> _term_docs;
+};
+
+} // namespace doris
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_cache.h
b/be/src/olap/rowset/segment_v2/inverted_index_cache.h
index c67e17ddda..edbfdd6b72 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_cache.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_cache.h
@@ -17,7 +17,13 @@
#pragma once
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Woverloaded-virtual"
+
#include <CLucene.h> // IWYU pragma: keep
+
+#pragma GCC diagnostic pop
+
#include <CLucene/config/repl_wchar.h>
#include <CLucene/util/Misc.h>
#include <butil/macros.h>
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp
index cbc3b4399d..7f653a9359 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp
@@ -32,7 +32,8 @@ Status compact_column(int32_t index_id, int src_segment_num,
int dest_segment_nu
std::vector<uint32_t> dest_segment_num_rows) {
lucene::store::Directory* dir =
DorisCompoundDirectory::getDirectory(fs,
index_writer_path.c_str(), false);
- auto index_writer = _CLNEW lucene::index::IndexWriter(dir, nullptr, true
/* create */,
+ lucene::analysis::SimpleAnalyzer<char> analyzer;
+ auto index_writer = _CLNEW lucene::index::IndexWriter(dir, &analyzer, true
/* create */,
true /*
closeDirOnShutdown */);
// get compound directory src_index_dirs
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index a521da394c..3d5801ecb7 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -53,9 +53,12 @@
#include "io/fs/file_system.h"
#include "olap/key_coder.h"
#include "olap/olap_common.h"
+#include "olap/rowset/segment_v2/inverted_index/query/conjunction_query.h"
+#include "olap/rowset/segment_v2/inverted_index_cache.h"
#include "olap/rowset/segment_v2/inverted_index_compound_directory.h"
#include "olap/rowset/segment_v2/inverted_index_desc.h"
#include "olap/types.h"
+#include "runtime/runtime_state.h"
#include "util/faststring.h"
#include "util/runtime_profile.h"
#include "util/time.h"
@@ -101,13 +104,11 @@ std::vector<std::wstring>
InvertedIndexReader::get_analyse_result(
std::shared_ptr<lucene::analysis::Analyzer> analyzer;
std::unique_ptr<lucene::util::Reader> reader;
auto analyser_type = inverted_index_ctx->parser_type;
- if (analyser_type == InvertedIndexParserType::PARSER_STANDARD) {
- analyzer =
std::make_shared<lucene::analysis::standard::StandardAnalyzer>();
- reader.reset(
- (new lucene::util::StringReader(std::wstring(value.begin(),
value.end()).c_str())));
- } else if (analyser_type == InvertedIndexParserType::PARSER_UNICODE) {
+ if (analyser_type == InvertedIndexParserType::PARSER_STANDARD ||
+ analyser_type == InvertedIndexParserType::PARSER_UNICODE) {
analyzer =
std::make_shared<lucene::analysis::standard95::StandardAnalyzer>();
- reader.reset(new lucene::util::SStringReader<char>(value.data(),
value.size(), false));
+ } else if (analyser_type == InvertedIndexParserType::PARSER_ENGLISH) {
+ analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>();
} else if (analyser_type == InvertedIndexParserType::PARSER_CHINESE) {
auto chinese_analyzer =
std::make_shared<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false);
@@ -119,17 +120,11 @@ std::vector<std::wstring>
InvertedIndexReader::get_analyse_result(
chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All);
}
analyzer = chinese_analyzer;
- reader.reset(_CLNEW lucene::util::SStringReader<char>(value.c_str(),
strlen(value.c_str()),
- false));
- //reader.reset(new lucene::util::SimpleInputStreamReader(
- // new lucene::util::AStringReader(value.c_str()),
- // lucene::util::SimpleInputStreamReader::UTF8));
} else {
// default
- analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<TCHAR>>();
- reader.reset(
- (new lucene::util::StringReader(std::wstring(value.begin(),
value.end()).c_str())));
+ analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>();
}
+ reader.reset(new lucene::util::SStringReader<char>(value.data(),
value.size(), false));
std::wstring field_ws = std::wstring(field_name.begin(), field_name.end());
std::unique_ptr<lucene::analysis::TokenStream> token_stream(
@@ -138,17 +133,10 @@ std::vector<std::wstring>
InvertedIndexReader::get_analyse_result(
lucene::analysis::Token token;
while (token_stream->next(&token)) {
- if (analyser_type == InvertedIndexParserType::PARSER_UNICODE) {
- if (token.termLength<char>() != 0) {
- std::string_view term(token.termBuffer<char>(),
token.termLength<char>());
- std::wstring ws_term = StringUtil::string_to_wstring(term);
- analyse_result.emplace_back(ws_term);
- }
- } else {
- if (token.termLength<TCHAR>() != 0) {
- analyse_result.emplace_back(
- std::wstring(token.termBuffer<TCHAR>(),
token.termLength<TCHAR>()));
- }
+ if (token.termLength<char>() != 0) {
+ std::string_view term(token.termBuffer<char>(),
token.termLength<char>());
+ std::wstring ws_term = StringUtil::string_to_wstring(term);
+ analyse_result.emplace_back(ws_term);
}
}
@@ -219,15 +207,15 @@ Status
InvertedIndexReader::read_null_bitmap(InvertedIndexQueryCacheHandle* cach
return Status::OK();
}
-Status FullTextIndexReader::new_iterator(OlapReaderStatistics* stats,
+Status FullTextIndexReader::new_iterator(OlapReaderStatistics* stats,
RuntimeState* runtime_state,
std::unique_ptr<InvertedIndexIterator>* iterator) {
- *iterator = InvertedIndexIterator::create_unique(stats,
shared_from_this());
+ *iterator = InvertedIndexIterator::create_unique(stats, runtime_state,
shared_from_this());
return Status::OK();
}
-Status FullTextIndexReader::query(OlapReaderStatistics* stats, const
std::string& column_name,
- const void* query_value,
InvertedIndexQueryType query_type,
- roaring::Roaring* bit_map) {
+Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState*
runtime_state,
+ const std::string& column_name, const void*
query_value,
+ InvertedIndexQueryType query_type,
roaring::Roaring* bit_map) {
SCOPED_RAW_TIMER(&stats->inverted_index_query_timer);
std::string search_str = reinterpret_cast<const
StringRef*>(query_value)->to_string();
@@ -263,83 +251,35 @@ Status FullTextIndexReader::query(OlapReaderStatistics*
stats, const std::string
}
}
- std::unique_ptr<lucene::search::Query> query;
- std::wstring field_ws = std::wstring(column_name.begin(),
column_name.end());
-
- auto index_search = [&](bool& null_bitmap_already_read,
- std::shared_ptr<roaring::Roaring>&
term_match_bitmap,
- InvertedIndexQueryCache* cache,
- InvertedIndexQueryCache::CacheKey& cache_key,
- InvertedIndexQueryCacheHandle& cache_handle) {
- // check index file existence
- if (!indexExists(index_file_path)) {
- return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>(
- "inverted index path: {} not exist.",
index_file_path.string());
- }
-
- InvertedIndexCacheHandle inverted_index_cache_handle;
- InvertedIndexSearcherCache::instance()->get_index_searcher(
- _fs, index_dir.c_str(), index_file_name,
&inverted_index_cache_handle, stats);
- auto index_searcher =
inverted_index_cache_handle.get_index_searcher();
-
- // try to reuse index_searcher's directory to read null_bitmap to
cache
- // to avoid open directory additionally for null_bitmap
- if (!null_bitmap_already_read) {
- InvertedIndexQueryCacheHandle null_bitmap_cache_handle;
- read_null_bitmap(&null_bitmap_cache_handle,
- index_searcher->getReader()->directory());
- null_bitmap_already_read = true;
- }
+ // check index file existence
+ if (!indexExists(index_file_path)) {
+ return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>(
+ "inverted index path: {} not exist.",
index_file_path.string());
+ }
- try {
- if (query_type == InvertedIndexQueryType::MATCH_ANY_QUERY ||
- query_type == InvertedIndexQueryType::MATCH_ALL_QUERY ||
- query_type == InvertedIndexQueryType::EQUAL_QUERY) {
-
SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
- index_searcher->_search(query.get(),
[&term_match_bitmap](DocRange* docRange) {
- if (docRange->type_ == DocRangeType::kMany) {
-
term_match_bitmap->addMany(docRange->doc_many_size_,
-
docRange->doc_many.data());
- } else {
-
term_match_bitmap->addRange(docRange->doc_range.first,
-
docRange->doc_range.second);
- }
- });
- } else {
-
SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
- index_searcher->_search(
- query.get(),
- [&term_match_bitmap](const int32_t docid, const
float_t /*score*/) {
- // docid equal to rowid in segment
- term_match_bitmap->add(docid);
- });
- }
- } catch (const CLuceneError& e) {
- return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
- "CLuceneError occured: {}", e.what());
- }
+ InvertedIndexCacheHandle inverted_index_cache_handle;
+ InvertedIndexSearcherCache::instance()->get_index_searcher(
+ _fs, index_dir.c_str(), index_file_name,
&inverted_index_cache_handle, stats);
+ auto index_searcher = inverted_index_cache_handle.get_index_searcher();
- {
- // add to cache
- term_match_bitmap->runOptimize();
- cache->insert(cache_key, term_match_bitmap, &cache_handle);
- }
- return Status::OK();
- };
+ std::unique_ptr<lucene::search::Query> query;
+ std::wstring field_ws = std::wstring(column_name.begin(),
column_name.end());
roaring::Roaring query_match_bitmap;
bool null_bitmap_already_read = false;
- if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
+ if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY ||
+ query_type == InvertedIndexQueryType::MATCH_ALL_QUERY) {
std::wstring wstr_tokens;
for (auto& token : analyse_result) {
wstr_tokens += token;
+ wstr_tokens += L" ";
}
auto cache = InvertedIndexQueryCache::instance();
InvertedIndexQueryCache::CacheKey cache_key;
cache_key.index_path = index_file_path;
cache_key.column_name = column_name;
- cache_key.query_type = InvertedIndexQueryType::MATCH_PHRASE_QUERY;
+ cache_key.query_type = query_type;
auto str_tokens = lucene_wcstoutf8string(wstr_tokens.c_str(),
wstr_tokens.length());
cache_key.value.swap(str_tokens);
InvertedIndexQueryCacheHandle cache_handle;
@@ -352,19 +292,28 @@ Status FullTextIndexReader::query(OlapReaderStatistics*
stats, const std::string
term_match_bitmap = std::make_shared<roaring::Roaring>();
- auto* phrase_query = new lucene::search::PhraseQuery();
- for (auto& token : analyse_result) {
- auto* term = _CLNEW lucene::index::Term(field_ws.c_str(),
token.c_str());
- phrase_query->add(term);
- _CLDECDELETE(term);
+ Status res = Status::OK();
+ if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
+ auto* phrase_query = new lucene::search::PhraseQuery();
+ for (auto& token : analyse_result) {
+ auto* term = _CLNEW
lucene::index::Term(field_ws.c_str(), token.c_str());
+ phrase_query->add(term);
+ _CLDECDELETE(term);
+ }
+ query.reset(phrase_query);
+ res = normal_index_search(stats, query_type,
index_searcher,
+ null_bitmap_already_read, query,
term_match_bitmap);
+ } else {
+ res = match_all_index_search(stats, runtime_state,
field_ws, analyse_result,
+ index_searcher,
term_match_bitmap);
}
- query.reset(phrase_query);
-
- Status res = index_search(null_bitmap_already_read,
term_match_bitmap, cache,
- cache_key, cache_handle);
if (!res.ok()) {
return res;
}
+
+ // add to cache
+ term_match_bitmap->runOptimize();
+ cache->insert(cache_key, term_match_bitmap, &cache_handle);
}
query_match_bitmap = *term_match_bitmap;
} else {
@@ -394,11 +343,16 @@ Status FullTextIndexReader::query(OlapReaderStatistics*
stats, const std::string
[](lucene::index::Term* term) {
_CLDECDELETE(term); }};
query.reset(new lucene::search::TermQuery(term.get()));
- Status res = index_search(null_bitmap_already_read,
term_match_bitmap, cache,
- cache_key, cache_handle);
+ Status res =
+ normal_index_search(stats, query_type,
index_searcher,
+ null_bitmap_already_read,
query, term_match_bitmap);
if (!res.ok()) {
return res;
}
+
+ // add to cache
+ term_match_bitmap->runOptimize();
+ cache->insert(cache_key, term_match_bitmap, &cache_handle);
}
// add to query_match_bitmap
@@ -415,8 +369,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics*
stats, const std::string
query_match_bitmap |= *term_match_bitmap;
break;
}
- case InvertedIndexQueryType::EQUAL_QUERY:
- case InvertedIndexQueryType::MATCH_ALL_QUERY: {
+ case InvertedIndexQueryType::EQUAL_QUERY: {
SCOPED_RAW_TIMER(&stats->inverted_index_query_bitmap_op_timer);
query_match_bitmap &= *term_match_bitmap;
break;
@@ -437,17 +390,83 @@ Status FullTextIndexReader::query(OlapReaderStatistics*
stats, const std::string
}
}
+Status FullTextIndexReader::normal_index_search(
+ OlapReaderStatistics* stats, InvertedIndexQueryType query_type,
+ const IndexSearcherPtr& index_searcher, bool& null_bitmap_already_read,
+ const std::unique_ptr<lucene::search::Query>& query,
+ const std::shared_ptr<roaring::Roaring>& term_match_bitmap) {
+ check_null_bitmap(index_searcher, null_bitmap_already_read);
+
+ try {
+ SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
+ if (query_type == InvertedIndexQueryType::MATCH_ANY_QUERY ||
+ query_type == InvertedIndexQueryType::EQUAL_QUERY) {
+ index_searcher->_search(query.get(),
[&term_match_bitmap](DocRange* doc_range) {
+ if (doc_range->type_ == DocRangeType::kMany) {
+ term_match_bitmap->addMany(doc_range->doc_many_size_,
+ doc_range->doc_many->data());
+ } else {
+ term_match_bitmap->addRange(doc_range->doc_range.first,
+ doc_range->doc_range.second);
+ }
+ });
+ } else {
+ index_searcher->_search(query.get(), [&term_match_bitmap](const
int32_t docid,
+ const
float_t /*score*/) {
+ // docid equal to rowid in segment
+ term_match_bitmap->add(docid);
+ });
+ }
+ } catch (const CLuceneError& e) {
+ return
Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>("CLuceneError occured:
{}",
+
e.what());
+ }
+
+ return Status::OK();
+}
+
+Status FullTextIndexReader::match_all_index_search(
+ OlapReaderStatistics* stats, RuntimeState* runtime_state, const
std::wstring& field_ws,
+ const std::vector<std::wstring>& analyse_result, const
IndexSearcherPtr& index_searcher,
+ const std::shared_ptr<roaring::Roaring>& term_match_bitmap) {
+ TQueryOptions queryOptions = runtime_state->query_options();
+ try {
+ SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
+ ConjunctionQuery query(index_searcher->getReader());
+
query.set_conjunction_ratio(queryOptions.inverted_index_conjunction_opt_threshold);
+ query.add(field_ws, analyse_result);
+ query.search(*term_match_bitmap);
+ } catch (const CLuceneError& e) {
+ return
Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>("CLuceneError occured:
{}",
+
e.what());
+ }
+ return Status::OK();
+}
+
+void FullTextIndexReader::check_null_bitmap(const IndexSearcherPtr&
index_searcher,
+ bool& null_bitmap_already_read) {
+ // try to reuse index_searcher's directory to read null_bitmap to cache
+ // to avoid open directory additionally for null_bitmap
+ if (!null_bitmap_already_read) {
+ InvertedIndexQueryCacheHandle null_bitmap_cache_handle;
+ read_null_bitmap(&null_bitmap_cache_handle,
index_searcher->getReader()->directory());
+ null_bitmap_already_read = true;
+ }
+}
+
InvertedIndexReaderType FullTextIndexReader::type() {
return InvertedIndexReaderType::FULLTEXT;
}
Status StringTypeInvertedIndexReader::new_iterator(
- OlapReaderStatistics* stats, std::unique_ptr<InvertedIndexIterator>*
iterator) {
- *iterator = InvertedIndexIterator::create_unique(stats,
shared_from_this());
+ OlapReaderStatistics* stats, RuntimeState* runtime_state,
+ std::unique_ptr<InvertedIndexIterator>* iterator) {
+ *iterator = InvertedIndexIterator::create_unique(stats, runtime_state,
shared_from_this());
return Status::OK();
}
Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats,
+ RuntimeState* runtime_state,
const std::string& column_name,
const void* query_value,
InvertedIndexQueryType query_type,
roaring::Roaring* bit_map) {
@@ -538,11 +557,11 @@ Status
StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats,
query_type == InvertedIndexQueryType::MATCH_ALL_QUERY ||
query_type == InvertedIndexQueryType::EQUAL_QUERY) {
SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
- index_searcher->_search(query.get(), [&result](DocRange* docRange)
{
- if (docRange->type_ == DocRangeType::kMany) {
- result.addMany(docRange->doc_many_size_,
docRange->doc_many.data());
+ index_searcher->_search(query.get(), [&result](DocRange*
doc_range) {
+ if (doc_range->type_ == DocRangeType::kMany) {
+ result.addMany(doc_range->doc_many_size_,
doc_range->doc_many->data());
} else {
- result.addRange(docRange->doc_range.first,
docRange->doc_range.second);
+ result.addRange(doc_range->doc_range.first,
doc_range->doc_range.second);
}
});
} else {
@@ -600,9 +619,9 @@ BkdIndexReader::BkdIndexReader(io::FileSystemSPtr fs, const
std::string& path,
config::inverted_index_read_buffer_size);
}
-Status BkdIndexReader::new_iterator(OlapReaderStatistics* stats,
+Status BkdIndexReader::new_iterator(OlapReaderStatistics* stats, RuntimeState*
runtime_state,
std::unique_ptr<InvertedIndexIterator>*
iterator) {
- *iterator = InvertedIndexIterator::create_unique(stats,
shared_from_this());
+ *iterator = InvertedIndexIterator::create_unique(stats, runtime_state,
shared_from_this());
return Status::OK();
}
@@ -692,9 +711,9 @@ Status
BkdIndexReader::handle_cache(InvertedIndexQueryCache* cache,
}
}
-Status BkdIndexReader::query(OlapReaderStatistics* stats, const std::string&
column_name,
- const void* query_value, InvertedIndexQueryType
query_type,
- roaring::Roaring* bit_map) {
+Status BkdIndexReader::query(OlapReaderStatistics* stats, RuntimeState*
runtime_state,
+ const std::string& column_name, const void*
query_value,
+ InvertedIndexQueryType query_type,
roaring::Roaring* bit_map) {
SCOPED_RAW_TIMER(&stats->inverted_index_query_timer);
auto visitor = std::make_unique<InvertedIndexVisitor>(bit_map, query_type);
@@ -968,7 +987,8 @@ Status
InvertedIndexIterator::read_from_inverted_index(const std::string& column
}
}
- RETURN_IF_ERROR(_reader->query(_stats, column_name, query_value,
query_type, bit_map));
+ RETURN_IF_ERROR(
+ _reader->query(_stats, _runtime_state, column_name, query_value,
query_type, bit_map));
return Status::OK();
}
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h
b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
index 5f7b318825..30269f3b19 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
@@ -52,6 +52,7 @@ namespace doris {
class KeyCoder;
class TypeInfo;
struct OlapReaderStatistics;
+class RuntimeState;
namespace segment_v2 {
@@ -65,6 +66,8 @@ enum class InvertedIndexReaderType {
BKD = 2,
};
+using IndexSearcherPtr = std::shared_ptr<lucene::search::IndexSearcher>;
+
class InvertedIndexReader : public
std::enable_shared_from_this<InvertedIndexReader> {
public:
explicit InvertedIndexReader(io::FileSystemSPtr fs, const std::string&
path,
@@ -73,11 +76,11 @@ public:
virtual ~InvertedIndexReader() = default;
// create a new column iterator. Client should delete returned iterator
- virtual Status new_iterator(OlapReaderStatistics* stats,
+ virtual Status new_iterator(OlapReaderStatistics* stats, RuntimeState*
runtime_state,
std::unique_ptr<InvertedIndexIterator>*
iterator) = 0;
- virtual Status query(OlapReaderStatistics* stats, const std::string&
column_name,
- const void* query_value, InvertedIndexQueryType
query_type,
- roaring::Roaring* bit_map) = 0;
+ virtual Status query(OlapReaderStatistics* stats, RuntimeState*
runtime_state,
+ const std::string& column_name, const void*
query_value,
+ InvertedIndexQueryType query_type, roaring::Roaring*
bit_map) = 0;
virtual Status try_query(OlapReaderStatistics* stats, const std::string&
column_name,
const void* query_value, InvertedIndexQueryType
query_type,
uint32_t* count) = 0;
@@ -118,11 +121,11 @@ public:
: InvertedIndexReader(fs, path, index_meta) {}
~FullTextIndexReader() override = default;
- Status new_iterator(OlapReaderStatistics* stats,
+ Status new_iterator(OlapReaderStatistics* stats, RuntimeState*
runtime_state,
std::unique_ptr<InvertedIndexIterator>* iterator)
override;
- Status query(OlapReaderStatistics* stats, const std::string& column_name,
- const void* query_value, InvertedIndexQueryType query_type,
- roaring::Roaring* bit_map) override;
+ Status query(OlapReaderStatistics* stats, RuntimeState* runtime_state,
+ const std::string& column_name, const void* query_value,
+ InvertedIndexQueryType query_type, roaring::Roaring* bit_map)
override;
Status try_query(OlapReaderStatistics* stats, const std::string&
column_name,
const void* query_value, InvertedIndexQueryType
query_type,
uint32_t* count) override {
@@ -131,6 +134,21 @@ public:
}
InvertedIndexReaderType type() override;
+
+private:
+ Status normal_index_search(OlapReaderStatistics* stats,
InvertedIndexQueryType query_type,
+ const IndexSearcherPtr& index_searcher,
+ bool& null_bitmap_already_read,
+ const std::unique_ptr<lucene::search::Query>&
query,
+ const std::shared_ptr<roaring::Roaring>&
term_match_bitmap);
+
+ Status match_all_index_search(OlapReaderStatistics* stats, RuntimeState*
runtime_state,
+ const std::wstring& field_ws,
+ const std::vector<std::wstring>&
analyse_result,
+ const IndexSearcherPtr& index_searcher,
+ const std::shared_ptr<roaring::Roaring>&
term_match_bitmap);
+
+ void check_null_bitmap(const IndexSearcherPtr& index_searcher, bool&
null_bitmap_already_read);
};
class StringTypeInvertedIndexReader : public InvertedIndexReader {
@@ -142,11 +160,11 @@ public:
: InvertedIndexReader(fs, path, index_meta) {}
~StringTypeInvertedIndexReader() override = default;
- Status new_iterator(OlapReaderStatistics* stats,
+ Status new_iterator(OlapReaderStatistics* stats, RuntimeState*
runtime_state,
std::unique_ptr<InvertedIndexIterator>* iterator)
override;
- Status query(OlapReaderStatistics* stats, const std::string& column_name,
- const void* query_value, InvertedIndexQueryType query_type,
- roaring::Roaring* bit_map) override;
+ Status query(OlapReaderStatistics* stats, RuntimeState* runtime_state,
+ const std::string& column_name, const void* query_value,
+ InvertedIndexQueryType query_type, roaring::Roaring* bit_map)
override;
Status try_query(OlapReaderStatistics* stats, const std::string&
column_name,
const void* query_value, InvertedIndexQueryType
query_type,
uint32_t* count) override {
@@ -214,12 +232,12 @@ public:
}
}
- Status new_iterator(OlapReaderStatistics* stats,
+ Status new_iterator(OlapReaderStatistics* stats, RuntimeState*
runtime_state,
std::unique_ptr<InvertedIndexIterator>* iterator)
override;
- Status query(OlapReaderStatistics* stats, const std::string& column_name,
- const void* query_value, InvertedIndexQueryType query_type,
- roaring::Roaring* bit_map) override;
+ Status query(OlapReaderStatistics* stats, RuntimeState* runtime_state,
+ const std::string& column_name, const void* query_value,
+ InvertedIndexQueryType query_type, roaring::Roaring* bit_map)
override;
Status try_query(OlapReaderStatistics* stats, const std::string&
column_name,
const void* query_value, InvertedIndexQueryType
query_type,
uint32_t* count) override;
@@ -246,8 +264,9 @@ class InvertedIndexIterator {
ENABLE_FACTORY_CREATOR(InvertedIndexIterator);
public:
- InvertedIndexIterator(OlapReaderStatistics* stats,
std::shared_ptr<InvertedIndexReader> reader)
- : _stats(stats), _reader(reader) {}
+ InvertedIndexIterator(OlapReaderStatistics* stats, RuntimeState*
runtime_state,
+ std::shared_ptr<InvertedIndexReader> reader)
+ : _stats(stats), _runtime_state(runtime_state), _reader(reader) {}
Status read_from_inverted_index(const std::string& column_name, const
void* query_value,
InvertedIndexQueryType query_type,
uint32_t segment_num_rows,
@@ -264,7 +283,8 @@ public:
[[nodiscard]] const std::map<string, string>& get_index_properties() const;
private:
- OlapReaderStatistics* _stats;
+ OlapReaderStatistics* _stats = nullptr;
+ RuntimeState* _runtime_state = nullptr;
std::shared_ptr<InvertedIndexReader> _reader;
};
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index 2194d349a9..8cd65bbab6 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -155,9 +155,8 @@ public:
_doc = std::make_unique<lucene::document::Document>();
_dir.reset(DorisCompoundDirectory::getDirectory(_fs,
index_path.c_str(), true));
- if (_parser_type == InvertedIndexParserType::PARSER_STANDARD) {
- _analyzer =
std::make_unique<lucene::analysis::standard::StandardAnalyzer>();
- } else if (_parser_type == InvertedIndexParserType::PARSER_UNICODE) {
+ if (_parser_type == InvertedIndexParserType::PARSER_STANDARD ||
+ _parser_type == InvertedIndexParserType::PARSER_UNICODE) {
_analyzer =
std::make_unique<lucene::analysis::standard95::StandardAnalyzer>();
} else if (_parser_type == InvertedIndexParserType::PARSER_ENGLISH) {
_analyzer =
std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
@@ -234,12 +233,10 @@ public:
void new_fulltext_field(const char* field_value_data, size_t
field_value_size) {
if (_parser_type == InvertedIndexParserType::PARSER_ENGLISH ||
- _parser_type == InvertedIndexParserType::PARSER_CHINESE) {
+ _parser_type == InvertedIndexParserType::PARSER_CHINESE ||
+ _parser_type == InvertedIndexParserType::PARSER_UNICODE ||
+ _parser_type == InvertedIndexParserType::PARSER_STANDARD) {
new_char_token_stream(field_value_data, field_value_size, _field);
- } else if (_parser_type == InvertedIndexParserType::PARSER_UNICODE) {
- new_char_token_stream(field_value_data, field_value_size, _field);
- } else if (_parser_type == InvertedIndexParserType::PARSER_STANDARD) {
- new_field_value(field_value_data, field_value_size, _field);
} else {
new_field_char_value(field_value_data, field_value_size, _field);
}
diff --git a/be/src/olap/rowset/segment_v2/segment.cpp
b/be/src/olap/rowset/segment_v2/segment.cpp
index 21ceea7a93..7eb660b3c2 100644
--- a/be/src/olap/rowset/segment_v2/segment.cpp
+++ b/be/src/olap/rowset/segment_v2/segment.cpp
@@ -328,12 +328,12 @@ Status Segment::new_bitmap_index_iterator(const
TabletColumn& tablet_column,
Status Segment::new_inverted_index_iterator(const TabletColumn& tablet_column,
const TabletIndex* index_meta,
- OlapReaderStatistics* stats,
+ const StorageReadOptions&
read_options,
std::unique_ptr<InvertedIndexIterator>* iter) {
auto col_unique_id = tablet_column.unique_id();
if (_column_readers.count(col_unique_id) > 0 && index_meta) {
RETURN_IF_ERROR(_column_readers.at(col_unique_id)
- ->new_inverted_index_iterator(index_meta,
stats, iter));
+ ->new_inverted_index_iterator(index_meta,
read_options, iter));
return Status::OK();
}
return Status::OK();
diff --git a/be/src/olap/rowset/segment_v2/segment.h
b/be/src/olap/rowset/segment_v2/segment.h
index 65f3245e6b..382ae69a7b 100644
--- a/be/src/olap/rowset/segment_v2/segment.h
+++ b/be/src/olap/rowset/segment_v2/segment.h
@@ -90,7 +90,8 @@ public:
std::unique_ptr<BitmapIndexIterator>*
iter);
Status new_inverted_index_iterator(const TabletColumn& tablet_column,
- const TabletIndex* index_meta,
OlapReaderStatistics* stats,
+ const TabletIndex* index_meta,
+ const StorageReadOptions& read_options,
std::unique_ptr<InvertedIndexIterator>*
iter);
const ShortKeyIndexDecoder* get_short_key_index() const {
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index bde5a533ae..203117a22a 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -1076,7 +1076,7 @@ Status SegmentIterator::_init_inverted_index_iterators() {
if (_inverted_index_iterators[cid] == nullptr) {
RETURN_IF_ERROR(_segment->new_inverted_index_iterator(
_opts.tablet_schema->column(cid),
_opts.tablet_schema->get_inverted_index(cid),
- _opts.stats, &_inverted_index_iterators[cid]));
+ _opts, &_inverted_index_iterators[cid]));
}
}
return Status::OK();
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
index 2d9303beb9..6dd2ac6811 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -395,6 +395,8 @@ public class SessionVariable implements Serializable,
Writable {
public static final String ENABLE_MEMTABLE_ON_SINK_NODE =
"enable_memtable_on_sink_node";
+ public static final String INVERTED_INDEX_CONJUNCTION_OPT_THRESHOLD =
"inverted_index_conjunction_opt_threshold";
+
public static final List<String> DEBUG_VARIABLES = ImmutableList.of(
SKIP_DELETE_PREDICATE,
SKIP_DELETE_BITMAP,
@@ -1148,6 +1150,15 @@ public class SessionVariable implements Serializable,
Writable {
@VariableMgr.VarAttr(name = ENABLE_INSERT_GROUP_COMMIT)
public boolean enableInsertGroupCommit = false;
+ @VariableMgr.VarAttr(name = INVERTED_INDEX_CONJUNCTION_OPT_THRESHOLD,
+ description =
{"在match_all中求取多个倒排索引的交集时,如果最大的倒排索引中的总数是最小倒排索引中的总数的整数倍,"
+ + "则使用跳表来优化交集操作。",
+ "When intersecting multiple inverted indexes in match_all,"
+ + " if the maximum total count of the largest inverted
index"
+ + " is a multiple of the minimum total count of the
smallest inverted index,"
+ + " use a skiplist to optimize the intersection."})
+ public int invertedIndexConjunctionOptThreshold = 1000;
+
// If this fe is in fuzzy mode, then will use initFuzzyModeVariables to
generate some variables,
// not the default value set in the code.
public void initFuzzyModeVariables() {
@@ -2243,6 +2254,8 @@ public class SessionVariable implements Serializable,
Writable {
tResult.setTruncateCharOrVarcharColumns(truncateCharOrVarcharColumns);
tResult.setEnableMemtableOnSinkNode(enableMemtableOnSinkNode);
+
tResult.setInvertedIndexConjunctionOptThreshold(invertedIndexConjunctionOptThreshold);
+
return tResult;
}
diff --git a/gensrc/thrift/PaloInternalService.thrift
b/gensrc/thrift/PaloInternalService.thrift
index 9c75e46f44..02a314f8fa 100644
--- a/gensrc/thrift/PaloInternalService.thrift
+++ b/gensrc/thrift/PaloInternalService.thrift
@@ -240,6 +240,8 @@ struct TQueryOptions {
// A tag used to distinguish fe start epoch.
82: optional i64 fe_process_uuid = 0;
+
+ 83: optional i32 inverted_index_conjunction_opt_threshold = 1000;
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]