This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch clucene-2.0
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene-2.0 by this push:
new 9d83fdf5 [Feature](tokenizer) add lowercase option for tokenizer
(#157) (#161)
9d83fdf5 is described below
commit 9d83fdf5a36e654ff806b991e8fa9ae794ef8a3d
Author: airborne12 <[email protected]>
AuthorDate: Mon Dec 25 11:03:30 2023 +0800
[Feature](tokenizer) add lowercase option for tokenizer (#157) (#161)
---
src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp | 5 +++--
.../CLucene/analysis/jieba/ChineseTokenizer.cpp | 13 +++++++++++++
src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h | 1 +
src/core/CLucene/analysis/AnalysisHeader.h | 7 +++++++
src/core/CLucene/analysis/Analyzers.cpp | 12 +++++++++++-
src/core/CLucene/analysis/Analyzers.h | 13 ++++++++-----
src/core/CLucene/analysis/standard95/StandardAnalyzer.h | 5 +++--
src/core/CLucene/analysis/standard95/StandardTokenizer.h | 13 ++++++++++---
8 files changed, 56 insertions(+), 13 deletions(-)
diff --git a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
index 23de239d..0bc03443 100644
--- a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
+++ b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
@@ -31,6 +31,7 @@ LanguageBasedAnalyzer::LanguageBasedAnalyzer(const TCHAR
*language, bool stem, A
_tcsncpy(lang, language, 100);
this->stem = stem;
this->mode = mode;
+ Analyzer::_lowercase = false;
}
LanguageBasedAnalyzer::~LanguageBasedAnalyzer() {
@@ -78,7 +79,7 @@ TokenStream *LanguageBasedAnalyzer::reusableTokenStream(const
TCHAR * /*fieldNam
streams->filteredTokenStream =
_CLNEW StopFilter(streams->tokenStream, true, stopSet);
} else if (_tcscmp(lang, _T("chinese")) == 0) {
- streams->tokenStream = _CLNEW CL_NS2(analysis,
jieba)::ChineseTokenizer(reader, mode);
+ streams->tokenStream = _CLNEW CL_NS2(analysis,
jieba)::ChineseTokenizer(reader, mode, Analyzer::_lowercase);
streams->filteredTokenStream = streams->tokenStream;
} else {
CL_NS(util)::BufferedReader* bufferedReader =
reader->__asBufferedReader();
@@ -111,7 +112,7 @@ TokenStream *LanguageBasedAnalyzer::tokenStream(const TCHAR
*fieldName, Reader *
if (_tcscmp(lang, _T("cjk")) == 0) {
ret = _CLNEW CL_NS2(analysis, cjk)::CJKTokenizer(reader);
} else if (_tcscmp(lang, _T("chinese")) == 0) {
- ret = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(reader, mode);
+ ret = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(reader, mode,
Analyzer::_lowercase);
} else {
CL_NS(util)::BufferedReader* bufferedReader =
reader->__asBufferedReader();
diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
index 2fd6f0a3..9a7f5edd 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
@@ -9,6 +9,12 @@ CL_NS_USE(util)
ChineseTokenizer::ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode
m) : Tokenizer(reader), mode(m) {
reset(reader);
+ Tokenizer::lowercase = false;
+}
+
+ChineseTokenizer::ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode
m, bool lowercase) : Tokenizer(reader), mode(m) {
+ reset(reader);
+ Tokenizer::lowercase = lowercase;
}
void ChineseTokenizer::init(const std::string &dictPath) {
@@ -22,6 +28,13 @@ CL_NS(analysis)::Token
*ChineseTokenizer::next(lucene::analysis::Token *token) {
std::string_view& token_text = tokens_text[bufferIndex++];
size_t size = std::min(token_text.size(),
static_cast<size_t>(LUCENE_MAX_WORD_LEN));
+ if (Tokenizer::lowercase) {
+ if (!token_text.empty() && token_text[0] < 0x80) {
+ std::transform(token_text.begin(), token_text.end(),
+ const_cast<char*>(token_text.data()),
+ [](char c) { return to_lower(c); });
+ }
+ }
token->setNoCopy(token_text.data(), 0, size);
return token;
}
diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
index 9bd34fb7..9fe33f58 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
+++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
@@ -45,6 +45,7 @@ private:
public:
// Constructor
explicit ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode mode);
+ explicit ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode mode,
bool lowercase);
static void init(const std::string& dictPath="");
// Destructor
diff --git a/src/core/CLucene/analysis/AnalysisHeader.h
b/src/core/CLucene/analysis/AnalysisHeader.h
index 46ab0020..578d8e00 100644
--- a/src/core/CLucene/analysis/AnalysisHeader.h
+++ b/src/core/CLucene/analysis/AnalysisHeader.h
@@ -293,6 +293,10 @@ public:
* performance.
*/
virtual TokenStream* reusableTokenStream(const TCHAR* fieldName,
CL_NS(util)::Reader* reader);
+
+ virtual void set_lowercase(bool lowercase) {
+ _lowercase = lowercase;
+ }
private:
DEFINE_MUTEX(THIS_LOCK)
@@ -309,6 +313,8 @@ protected:
* to save a TokenStream for later re-use by the same
* thread. */
virtual void setPreviousTokenStream(TokenStream* obj);
+ bool _lowercase = false;
+
public:
/**
* Invoked before indexing a Field instance if
@@ -343,6 +349,7 @@ class CLUCENE_EXPORT Tokenizer:public TokenStream {
protected:
/** The text source for this Tokenizer. */
CL_NS(util)::Reader* input;
+ bool lowercase = false;
public:
/** Construct a tokenizer with null input. */
diff --git a/src/core/CLucene/analysis/Analyzers.cpp
b/src/core/CLucene/analysis/Analyzers.cpp
index 3ea3e8b0..05a1c9e6 100644
--- a/src/core/CLucene/analysis/Analyzers.cpp
+++ b/src/core/CLucene/analysis/Analyzers.cpp
@@ -50,6 +50,12 @@ template class LowerCaseTokenizer<TCHAR>;
template<typename T>
SimpleTokenizer<T>::SimpleTokenizer(CL_NS(util)::Reader *in) :
LowerCaseTokenizer<T>(in) {
+ Tokenizer::lowercase = true;
+}
+
+template<typename T>
+SimpleTokenizer<T>::SimpleTokenizer(CL_NS(util)::Reader *in, bool lowercase) :
LowerCaseTokenizer<T>(in) {
+ Tokenizer::lowercase = lowercase;
}
template<typename T>
@@ -86,7 +92,11 @@ Token *SimpleTokenizer<char>::next(Token *token) {
if (length == 0)// start of token
start = offset - 1;
- buffer[length++] = to_lower(c); // buffer it, normalized
+ if (lowercase) {
+ buffer[length++] = to_lower(c); // buffer it, normalized
+ } else {
+ buffer[length++] = c; // buffer it as-is, original case preserved
+ }
if (length == LUCENE_MAX_WORD_LEN)// buffer overflow!
break;
diff --git a/src/core/CLucene/analysis/Analyzers.h
b/src/core/CLucene/analysis/Analyzers.h
index 432dde01..a06263cf 100644
--- a/src/core/CLucene/analysis/Analyzers.h
+++ b/src/core/CLucene/analysis/Analyzers.h
@@ -138,8 +138,9 @@ protected:
template<typename T>
class CLUCENE_EXPORT SimpleTokenizer:public LowerCaseTokenizer<T> {
public:
- /** Construct a new SimpleTokenizer. */
- SimpleTokenizer(CL_NS(util)::Reader* in);
+ /** Construct a new SimpleTokenizer. */
+ explicit SimpleTokenizer(CL_NS(util)::Reader* in);
+ SimpleTokenizer(CL_NS(util)::Reader* in, bool lowercase);
virtual ~SimpleTokenizer();
Token* next(Token* token) override {
@@ -179,16 +180,18 @@ public:
template <typename T>
class CLUCENE_EXPORT SimpleAnalyzer: public Analyzer {
public:
- SimpleAnalyzer(){}
+ SimpleAnalyzer(){
+ _lowercase = true;
+ }
bool isSDocOpt() override { return true; }
TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader*
reader) override{
- return _CLNEW SimpleTokenizer<T>(reader);
+ return _CLNEW SimpleTokenizer<T>(reader, _lowercase);
}
TokenStream* reusableTokenStream(const TCHAR* fieldName,
CL_NS(util)::Reader* reader) override{
if (tokenizer_ == nullptr) {
- tokenizer_ = new SimpleTokenizer<T>(reader);
+ tokenizer_ = new SimpleTokenizer<T>(reader, _lowercase);
} else {
tokenizer_->reset(reader);
}
diff --git a/src/core/CLucene/analysis/standard95/StandardAnalyzer.h
b/src/core/CLucene/analysis/standard95/StandardAnalyzer.h
index 7e29eec8..7460c811 100644
--- a/src/core/CLucene/analysis/standard95/StandardAnalyzer.h
+++ b/src/core/CLucene/analysis/standard95/StandardAnalyzer.h
@@ -6,17 +6,18 @@ namespace lucene::analysis::standard95 {
class StandardAnalyzer : public Analyzer {
public:
+ StandardAnalyzer() : Analyzer() { _lowercase = true; }
bool isSDocOpt() override { return true; }
TokenStream* tokenStream(const TCHAR* fieldName,
lucene::util::Reader* reader) override {
- return _CLNEW StandardTokenizer(reader, useStopWords_);
+ return _CLNEW StandardTokenizer(reader, useStopWords_, _lowercase);
}
TokenStream* reusableTokenStream(const TCHAR* fieldName,
lucene::util::Reader* reader) override {
if (tokenizer_ == nullptr) {
- tokenizer_ = new StandardTokenizer(reader, useStopWords_);
+ tokenizer_ = new StandardTokenizer(reader, useStopWords_, _lowercase);
} else {
tokenizer_->reset(reader);
}
diff --git a/src/core/CLucene/analysis/standard95/StandardTokenizer.h
b/src/core/CLucene/analysis/standard95/StandardTokenizer.h
index 67403ae8..1aac8671 100644
--- a/src/core/CLucene/analysis/standard95/StandardTokenizer.h
+++ b/src/core/CLucene/analysis/standard95/StandardTokenizer.h
@@ -22,6 +22,12 @@ class StandardTokenizer : public Tokenizer {
StandardTokenizer(lucene::util::Reader* in, bool useStopWords)
: Tokenizer(in), useStopWords_(useStopWords) {
scanner_ = std::make_unique<StandardTokenizerImpl>(in);
+ Tokenizer::lowercase = true;
+ }
+ StandardTokenizer(lucene::util::Reader* in, bool useStopWords, bool
lowercase)
+ : Tokenizer(in), useStopWords_(useStopWords) {
+ scanner_ = std::make_unique<StandardTokenizerImpl>(in);
+ Tokenizer::lowercase = lowercase;
}
Token* next(Token* token) override {
@@ -37,9 +43,10 @@ class StandardTokenizer : public Tokenizer {
if (scanner_->yylength() <= maxTokenLength) {
std::string_view term = scanner_->getText();
if (tokenType == StandardTokenizerImpl::WORD_TYPE) {
- std::transform(term.begin(), term.end(),
- const_cast<char*>(term.data()),
- [](char c) { return to_lower(c); });
+ if (Tokenizer::lowercase) {
+ std::transform(term.begin(), term.end(),
const_cast<char*>(term.data()),
+ [](char c) { return to_lower(c); });
+ }
if (useStopWords_ && stop_words.count(term)) {
skippedPositions++;
continue;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]