This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 171f374f56 [improvement](invert index) Change the loading method of
keyword type (#21893)
171f374f56 is described below
commit 171f374f5629752266e0be15525292a972f3256e
Author: zzzxl <[email protected]>
AuthorDate: Wed Jul 19 15:26:49 2023 +0800
[improvement](invert index) Change the loading method of keyword type
(#21893)
1. Fix the inability to index Chinese text
2. Optimize inverted index loading
---
be/src/clucene | 2 +-
be/src/olap/rowset/segment_v2/inverted_index_reader.cpp | 3 ++-
be/src/olap/rowset/segment_v2/inverted_index_writer.cpp | 10 ++++++++--
3 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/be/src/clucene b/be/src/clucene
index 103e88a8a3..5dd6fca31d 160000
--- a/be/src/clucene
+++ b/be/src/clucene
@@ -1 +1 @@
-Subproject commit 103e88a8a3b24da9ae2a0d9908a3ceb3f7808a61
+Subproject commit 5dd6fca31d1a0226a29abfea7c03c9694401ec32
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index 0a935bec6a..d382d74aab 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -22,6 +22,7 @@
#include <CLucene/analysis/LanguageBasedAnalyzer.h>
#include <CLucene/analysis/standard/StandardAnalyzer.h>
#include <CLucene/clucene-config.h>
+#include <CLucene/config/repl_wchar.h>
#include <CLucene/debug/error.h>
#include <CLucene/debug/mem.h>
#include <CLucene/index/IndexReader.h>
@@ -425,7 +426,7 @@ Status
StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats,
VLOG_DEBUG << "begin to query the inverted index from clucene"
<< ", column_name: " << column_name << ", search_str: " <<
search_str;
std::wstring column_name_ws = std::wstring(column_name.begin(),
column_name.end());
- std::wstring search_str_ws = std::wstring(search_str.begin(),
search_str.end());
+ std::wstring search_str_ws = lucene_utf8stows(search_str);
// unique_ptr with custom deleter
std::unique_ptr<lucene::index::Term, void (*)(lucene::index::Term*)> term {
_CLNEW lucene::index::Term(column_name_ws.c_str(),
search_str_ws.c_str()),
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index fcf125b2fa..e6a5be6315 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -172,7 +172,7 @@ public:
_analyzer.reset(chinese_analyzer);
} else {
// ANALYSER_NOT_SET, ANALYSER_NONE use default SimpleAnalyzer
- _analyzer =
std::make_unique<lucene::analysis::SimpleAnalyzer<TCHAR>>();
+ _analyzer =
std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
}
_index_writer =
std::make_unique<lucene::index::IndexWriter>(_dir.get(), _analyzer.get(),
create,
true);
@@ -228,8 +228,10 @@ public:
new lucene::util::AStringReader(field_value_data,
field_value_size),
lucene::util::SimpleInputStreamReader::UTF8);
_field->setValue(stringReader);
- } else {
+ } else if (_parser_type == InvertedIndexParserType::PARSER_STANDARD) {
new_field_value(field_value_data, field_value_size, _field);
+ } else {
+ new_field_char_value(field_value_data, field_value_size, _field);
}
}
@@ -246,6 +248,10 @@ public:
//_CLDELETE_ARRAY(field_value)
}
+ void new_field_char_value(const char* s, size_t len,
lucene::document::Field* field) {
+ field->setValue((char*)s, len);
+ }
+
Status add_values(const std::string fn, const void* values, size_t count)
override {
if constexpr (field_is_slice_type(field_type)) {
if (_field == nullptr || _index_writer == nullptr) {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]