This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
new ba5731e8 [Feature] add chinese analyzer mode for Jieba (#76)
ba5731e8 is described below
commit ba5731e8e49c1bd98cc5a70304832c4ba4afe87d
Author: airborne12 <[email protected]>
AuthorDate: Fri May 26 19:07:58 2023 +0800
[Feature] add chinese analyzer mode for Jieba (#76)
---
.../CLucene/analysis/LanguageBasedAnalyzer.cpp | 97 ++++++++------
.../CLucene/analysis/LanguageBasedAnalyzer.h | 27 +++-
.../CLucene/analysis/jieba/ChineseTokenizer.cpp | 21 ++-
.../CLucene/analysis/jieba/ChineseTokenizer.h | 13 +-
src/test/contribs-lib/analysis/testChinese.cpp | 148 ++++++++++++++++++++-
5 files changed, 256 insertions(+), 50 deletions(-)
diff --git a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
index 2db2e941..8d7d8674 100644
--- a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
+++ b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
@@ -22,15 +22,23 @@ CL_NS_USE2(analysis, snowball)
CL_NS_DEF(analysis)
-LanguageBasedAnalyzer::LanguageBasedAnalyzer(const TCHAR *language, bool stem)
{
+LanguageBasedAnalyzer::LanguageBasedAnalyzer(const TCHAR *language, bool stem,
AnalyzerMode mode) {
+ stopSet = _CLNEW CLTCSetList;
+
if (language == NULL)
_tcsncpy(lang, LUCENE_BLANK_STRING, 100);
else
_tcsncpy(lang, language, 100);
this->stem = stem;
+ this->mode = mode;
}
LanguageBasedAnalyzer::~LanguageBasedAnalyzer() = default;
+
+void LanguageBasedAnalyzer::setStopWords(const TCHAR** stopwords) {
+ StopFilter::fillStopTable(stopSet, stopwords);
+}
+
void LanguageBasedAnalyzer::setLanguage(const TCHAR *language) {
_tcsncpy(lang, language, 100);
}
@@ -39,6 +47,10 @@ void LanguageBasedAnalyzer::setStem(bool s) {
this->stem = s;
}
+void LanguageBasedAnalyzer::setMode(AnalyzerMode m) {
+ this->mode = m;
+}
+
void LanguageBasedAnalyzer::initDict(const std::string &dictPath) {
if (_tcscmp(lang, _T("chinese")) == 0) {
CL_NS2(analysis, jieba)::ChineseTokenizer::init(dictPath);
@@ -46,65 +58,68 @@ void LanguageBasedAnalyzer::initDict(const std::string
&dictPath) {
}
TokenStream *LanguageBasedAnalyzer::reusableTokenStream(const TCHAR *
/*fieldName*/, CL_NS(util)::Reader *reader) {
- TokenStream *tokenizer = getPreviousTokenStream();
- if (tokenizer == nullptr) {
+ SavedStreams* streams =
reinterpret_cast<SavedStreams*>(getPreviousTokenStream());
+
+ if (streams == nullptr) {
+ streams = _CLNEW SavedStreams();
if (_tcscmp(lang, _T("cjk")) == 0) {
- tokenizer = _CLNEW CL_NS2(analysis, cjk)::CJKTokenizer(reader);
+ streams->tokenStream = _CLNEW CL_NS2(analysis,
cjk)::CJKTokenizer(reader);
+ streams->filteredTokenStream =
+ _CLNEW StopFilter(streams->tokenStream, true, stopSet);
} else if (_tcscmp(lang, _T("chinese")) == 0) {
- tokenizer = _CLNEW CL_NS2(analysis,
jieba)::ChineseTokenizer(reader);
+ streams->tokenStream = _CLNEW CL_NS2(analysis,
jieba)::ChineseTokenizer(reader, mode);
+ streams->filteredTokenStream =
+ _CLNEW StopFilter(streams->tokenStream, true, stopSet);
} else {
- BufferedReader *bufferedReader = reader->__asBufferedReader();
- if (bufferedReader == NULL)
- tokenizer = _CLNEW StandardTokenizer(_CLNEW
FilteredBufferedReader(reader, false), true);
- else
- tokenizer = _CLNEW StandardTokenizer(bufferedReader);
-
- tokenizer = _CLNEW StandardFilter(tokenizer, true);
-
- if (stem)
- tokenizer = _CLNEW SnowballFilter(tokenizer, lang,
true);//todo: should check whether snowball supports the language
-
- if (stem)
//hmm... this could be configured seperately from stem
- tokenizer = _CLNEW ISOLatin1AccentFilter(tokenizer,
true);//todo: this should really only be applied to latin languages...
-
- //lower case after the latin1 filter
- tokenizer = _CLNEW LowerCaseFilter(tokenizer, true);
+ CL_NS(util)::BufferedReader* bufferedReader =
reader->__asBufferedReader();
+
+ if (bufferedReader == nullptr) {
+ streams->tokenStream = _CLNEW StandardTokenizer(
+ _CLNEW CL_NS(util)::FilteredBufferedReader(reader,
false), true);
+ } else {
+ streams->tokenStream = _CLNEW
StandardTokenizer(bufferedReader);
+ }
+
+ streams->filteredTokenStream = _CLNEW
StandardFilter(streams->tokenStream, true);
+ if (stem) {
+ streams->filteredTokenStream = _CLNEW SnowballFilter(
streams->filteredTokenStream, lang, true);//todo: should check whether snowball
supports the language
+ }
+ streams->filteredTokenStream =
+ _CLNEW LowerCaseFilter(streams->filteredTokenStream, true);
+ streams->filteredTokenStream =
+ _CLNEW StopFilter(streams->filteredTokenStream, true,
stopSet);
}
- setPreviousTokenStream(tokenizer);
+ setPreviousTokenStream(streams);
} else {
- auto t = dynamic_cast<Tokenizer *>(tokenizer);
- if (t != nullptr) {
- t->reset(reader);
- }
+ streams->tokenStream->reset(reader);
}
- return tokenizer;
+
+ return streams->filteredTokenStream;
}
TokenStream *LanguageBasedAnalyzer::tokenStream(const TCHAR *fieldName, Reader
*reader) {
- TokenStream *ret = NULL;
+ TokenStream *ret = nullptr;
if (_tcscmp(lang, _T("cjk")) == 0) {
ret = _CLNEW CL_NS2(analysis, cjk)::CJKTokenizer(reader);
} else if (_tcscmp(lang, _T("chinese")) == 0) {
- ret = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(reader);
+ ret = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(reader, mode);
} else {
- BufferedReader *bufferedReader = reader->__asBufferedReader();
- if (bufferedReader == NULL)
- ret = _CLNEW StandardTokenizer(_CLNEW
FilteredBufferedReader(reader, false), true);
- else
+ CL_NS(util)::BufferedReader* bufferedReader =
reader->__asBufferedReader();
+
+ if (bufferedReader == nullptr) {
+ ret = _CLNEW StandardTokenizer(
+ _CLNEW CL_NS(util)::FilteredBufferedReader(reader, false),
true);
+ } else {
ret = _CLNEW StandardTokenizer(bufferedReader);
+ }
ret = _CLNEW StandardFilter(ret, true);
-
- if (stem)
+ if (stem) {
ret = _CLNEW SnowballFilter(ret, lang, true);//todo: should check
whether snowball supports the language
-
- if (stem) //hmm... this could
be configured seperately from stem
- ret = _CLNEW ISOLatin1AccentFilter(ret, true);//todo: this should
really only be applied to latin languages...
-
- //lower case after the latin1 filter
+ }
ret = _CLNEW LowerCaseFilter(ret, true);
}
- //todo: could add a stop filter based on the language - need to fix the
stoplist loader first
+ ret = _CLNEW StopFilter(ret, true, stopSet);
return ret;
}
diff --git a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.h
b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.h
index 22fc3dd9..7c07a882 100644
--- a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.h
+++ b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.h
@@ -11,15 +11,40 @@
CL_NS_DEF(analysis)
+enum class AnalyzerMode {
+ Default,
+ All,
+ Search
+};
+
class CLUCENE_CONTRIBS_EXPORT LanguageBasedAnalyzer : public
CL_NS(analysis)::Analyzer {
+ class SavedStreams : public TokenStream {
+ public:
+ Tokenizer* tokenStream;
+ TokenStream* filteredTokenStream;
+
+ SavedStreams():tokenStream(NULL), filteredTokenStream(NULL)
+ {
+ }
+
+ void close(){}
+ Token* next(Token* token) {return NULL;}
+ };
+ /**
+ * Contains the stopwords used with the StopFilter.
+ */
+ CL_NS(analysis)::CLTCSetList* stopSet;
TCHAR lang[100]{};
bool stem;
+ AnalyzerMode mode{};
public:
- explicit LanguageBasedAnalyzer(const TCHAR *language = nullptr, bool stem
= true);
+ explicit LanguageBasedAnalyzer(const TCHAR *language = nullptr, bool stem
= true, AnalyzerMode mode = AnalyzerMode::All);
~LanguageBasedAnalyzer() override;
+ void setStopWords(const TCHAR** stopwords);
void setLanguage(const TCHAR *language);
void setStem(bool s);
+ void setMode(AnalyzerMode m);
void initDict(const std::string &dictPath);
TokenStream *tokenStream(const TCHAR *fieldName, CL_NS(util)::Reader
*reader) override;
TokenStream *reusableTokenStream(const TCHAR * /*fieldName*/,
CL_NS(util)::Reader *reader) override;
diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
index 0d44e376..ca371958 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
@@ -7,7 +7,7 @@ CL_NS_DEF2(analysis, jieba)
CL_NS_USE(analysis)
CL_NS_USE(util)
-ChineseTokenizer::ChineseTokenizer(lucene::util::Reader *reader) :
Tokenizer(reader) {
+ChineseTokenizer::ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode
m) : Tokenizer(reader), mode(m) {
buffer[0] = 0;
}
@@ -22,7 +22,7 @@ CL_NS(analysis)::Token
*ChineseTokenizer::next(lucene::analysis::Token *token) {
int totalLen = 0;
do {
auto bufferLen = input->read((const void**)&ioBuffer, 1,
LUCENE_IO_BUFFER_SIZE);
- if (bufferLen == -1) {
+ if (bufferLen <= 0) {
dataLen = 0;
bufferIndex = 0;
break;
@@ -35,17 +35,26 @@ CL_NS(analysis)::Token
*ChineseTokenizer::next(lucene::analysis::Token *token) {
char tmp_buffer[4 * totalLen];
lucene_wcsntoutf8(tmp_buffer, initBuffer, totalLen, 4 * totalLen);
- JiebaSingleton::getInstance().Cut(tmp_buffer, tokens_text, true);
+ switch (mode) {
+ case AnalyzerMode::Search:
+ JiebaSingleton::getInstance().CutForSearch(tmp_buffer,
tokens_text, true);
+ break;
+ case AnalyzerMode::All:
+ JiebaSingleton::getInstance().CutAll(tmp_buffer, tokens_text);
+ break;
+ case AnalyzerMode::Default:
+ JiebaSingleton::getInstance().Cut(tmp_buffer, tokens_text, true);
+ break;
+ }
dataLen = tokens_text.size();
}
if (bufferIndex < dataLen) {
- auto token_text = tokens_text[bufferIndex];
- bufferIndex++;
+ auto token_text = tokens_text[bufferIndex++];
lucene_utf8towcs(buffer, token_text.c_str(), LUCENE_MAX_WORD_LEN);
auto length = _tcslen(buffer);
token->set(buffer, 0, length);
return token;
}
- return NULL;
+ return nullptr;
}
CL_NS_END2
\ No newline at end of file
diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
index e642be86..276b138b 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
+++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
@@ -7,6 +7,8 @@
#include "Jieba.hpp"
#include "CLucene/analysis/AnalysisHeader.h"
+#include "CLucene/analysis/LanguageBasedAnalyzer.h"
+
CL_NS_DEF2(analysis,jieba)
@@ -27,6 +29,7 @@ private:
class ChineseTokenizer : public lucene::analysis::Tokenizer {
private:
+ AnalyzerMode mode{};
/** word offset, used to imply which character(in ) is parsed */
int32_t offset{};
@@ -52,7 +55,7 @@ private:
public:
// Constructor
- explicit ChineseTokenizer(lucene::util::Reader *reader);
+ explicit ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode mode);
static void init(const std::string& dictPath="");
// Destructor
@@ -60,6 +63,14 @@ public:
// Override the next method to tokenize Chinese text using Jieba
lucene::analysis::Token* next(lucene::analysis::Token* token) override;
+
+ void reset(lucene::util::Reader *reader) override {
+ this->input = reader;
+ this->offset = 0;
+ this->bufferIndex = 0;
+ this->dataLen = 0;
+ this->tokens_text.clear();
+ }
};
CL_NS_END2
diff --git a/src/test/contribs-lib/analysis/testChinese.cpp
b/src/test/contribs-lib/analysis/testChinese.cpp
index 3f54d537..c4210e4d 100644
--- a/src/test/contribs-lib/analysis/testChinese.cpp
+++ b/src/test/contribs-lib/analysis/testChinese.cpp
@@ -149,6 +149,142 @@ std::string get_dict_path() {
return "";
}
+void testSimpleJiebaSearchModeTokenizer2(CuTest* tc) {
+ LanguageBasedAnalyzer a;
+ CL_NS(util)::StringReader reader(_T("冰咒龙"));
+ reader.mark(50);
+ TokenStream* ts;
+ Token t;
+
+ //test with chinese
+ a.setLanguage(_T("chinese"));
+ a.setStem(false);
+ a.setMode(lucene::analysis::AnalyzerMode::Search);
+ a.initDict(get_dict_path());
+ ts = a.tokenStream(_T("contents"), &reader);
+
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("冰咒")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("龙")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) == NULL);
+ _CLDELETE(ts);
+}
+
+void testSimpleJiebaAllModeTokenizer2(CuTest* tc) {
+ LanguageBasedAnalyzer a;
+ CL_NS(util)::StringReader reader(_T("冰咒龙"));
+ reader.mark(50);
+ TokenStream* ts;
+ Token t;
+
+ //test with chinese
+ a.setLanguage(_T("chinese"));
+ a.setStem(false);
+ a.setMode(lucene::analysis::AnalyzerMode::All);
+ a.initDict(get_dict_path());
+ ts = a.tokenStream(_T("contents"), &reader);
+
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("冰")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("咒")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("龙")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) == NULL);
+ _CLDELETE(ts);
+}
+
+void testSimpleJiebaAllModeTokenizer(CuTest* tc) {
+ LanguageBasedAnalyzer a;
+ CL_NS(util)::StringReader reader(_T("我来到北京清华大学"));
+ reader.mark(50);
+ TokenStream* ts;
+ Token t;
+
+ //test with chinese
+ a.setLanguage(_T("chinese"));
+ a.setStem(false);
+ a.setMode(lucene::analysis::AnalyzerMode::All);
+ a.initDict(get_dict_path());
+ ts = a.tokenStream(_T("contents"), &reader);
+
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("我")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("来到")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("北京")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("清华")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("清华大学")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("华大")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("大学")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) == NULL);
+ _CLDELETE(ts);
+}
+
+void testSimpleJiebaDefaultModeTokenizer(CuTest* tc) {
+ LanguageBasedAnalyzer a;
+ CL_NS(util)::StringReader reader(_T("我来到北京清华大学"));
+ reader.mark(50);
+ TokenStream* ts;
+ Token t;
+
+ //test with chinese
+ a.setLanguage(_T("chinese"));
+ a.setStem(false);
+ a.setMode(lucene::analysis::AnalyzerMode::Default);
+ a.initDict(get_dict_path());
+ ts = a.tokenStream(_T("contents"), &reader);
+
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("我")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("来到")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("北京")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("清华大学")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) == NULL);
+ _CLDELETE(ts);
+}
+
+void testSimpleJiebaSearchModeTokenizer(CuTest* tc) {
+ LanguageBasedAnalyzer a;
+ CL_NS(util)::StringReader reader(_T("我来到北京清华大学"));
+ reader.mark(50);
+ TokenStream* ts;
+ Token t;
+
+ //test with chinese
+ a.setLanguage(_T("chinese"));
+ a.setStem(false);
+ a.setMode(lucene::analysis::AnalyzerMode::Search);
+ a.initDict(get_dict_path());
+ ts = a.tokenStream(_T("contents"), &reader);
+
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("我")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("来到")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("北京")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("清华")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("华大")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("大学")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("清华大学")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) == NULL);
+ _CLDELETE(ts);
+}
+
void testSimpleJiebaTokenizer(CuTest* tc) {
LanguageBasedAnalyzer a;
CL_NS(util)::StringReader reader(_T("我爱你中国"));
@@ -159,6 +295,7 @@ void testSimpleJiebaTokenizer(CuTest* tc) {
//test with chinese
a.setLanguage(_T("chinese"));
a.setStem(false);
+ a.setMode(lucene::analysis::AnalyzerMode::Default);
a.initDict(get_dict_path());
ts = a.tokenStream(_T("contents"), &reader);
@@ -180,6 +317,7 @@ void testSimpleJiebaTokenizer2(CuTest* tc) {
//test with chinese
a.setLanguage(_T("chinese"));
a.setStem(false);
+ a.setMode(lucene::analysis::AnalyzerMode::Default);
ts = a.tokenStream(_T("contents"), &reader);
CLUCENE_ASSERT(ts->next(&t) != NULL);
@@ -208,6 +346,7 @@ void testSimpleJiebaTokenizer3(CuTest* tc) {
//test with chinese
a.setLanguage(_T("chinese"));
a.setStem(false);
+ a.setMode(lucene::analysis::AnalyzerMode::Default);
ts = a.tokenStream(_T("contents"), &reader);
CLUCENE_ASSERT(ts->next(&t) != NULL);
@@ -316,7 +455,7 @@ void testJiebaMatch(CuTest* tc) {
auto analyzer = _CLNEW lucene::analysis::LanguageBasedAnalyzer();
analyzer->setLanguage(L"chinese");
-
+ analyzer->setMode(lucene::analysis::AnalyzerMode::Default);
IndexWriter w(&dir, analyzer, true);
auto field_name = lucene::util::Misc::_charToWide("chinese");
@@ -395,6 +534,7 @@ void testJiebaMatch2(CuTest* tc) {
auto analyzer = _CLNEW lucene::analysis::LanguageBasedAnalyzer();
analyzer->setLanguage(L"chinese");
+ analyzer->setMode(lucene::analysis::AnalyzerMode::Default);
IndexWriter w(&dir, analyzer, true);
auto field_name = lucene::util::Misc::_charToWide("chinese");
@@ -474,6 +614,7 @@ void testJiebaMatchHuge(CuTest* tc) {
auto analyzer = _CLNEW lucene::analysis::LanguageBasedAnalyzer();
analyzer->setLanguage(L"chinese");
+ analyzer->setMode(lucene::analysis::AnalyzerMode::Default);
analyzer->initDict(get_dict_path());
IndexWriter w(&dir, analyzer, true);
@@ -1127,6 +1268,11 @@ CuSuite *testchinese(void) {
SUITE_ADD_TEST(suite, testJiebaMatch);
SUITE_ADD_TEST(suite, testJiebaMatch2);
SUITE_ADD_TEST(suite, testJiebaMatchHuge);
+ SUITE_ADD_TEST(suite, testSimpleJiebaAllModeTokenizer);
+ SUITE_ADD_TEST(suite, testSimpleJiebaDefaultModeTokenizer);
+ SUITE_ADD_TEST(suite, testSimpleJiebaSearchModeTokenizer);
+ SUITE_ADD_TEST(suite, testSimpleJiebaAllModeTokenizer2);
+ SUITE_ADD_TEST(suite, testSimpleJiebaSearchModeTokenizer2);
return suite;
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]