This is an automated email from the ASF dual-hosted git repository.
airborne pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new d810f78efb1 [fix](inverted_index) fix tokenization issues for some
characters in ik analyzer (#50141)
d810f78efb1 is described below
commit d810f78efb1b5606203ca41318aacbf12891aa60
Author: Ryan19929 <[email protected]>
AuthorDate: Thu Jun 19 14:44:26 2025 +0800
[fix](inverted_index) fix tokenization issues for some characters in ik
analyzer (#50141)
Problem Summary:
This PR fixes IK Analyzer's abnormal handling of full-width characters
and adds support for Emoji and rare-character tokenization, making its
behavior consistent with Elasticsearch's IK analyzer.
Co-authored-by: Ryan19929 <[email protected]>
---
.../inverted_index/analyzer/ik/IKTokenizer.cpp | 9 +-
.../inverted_index/analyzer/ik/cfg/Configuration.h | 1 +
.../analyzer/ik/core/AnalyzeContext.cpp | 11 +-
.../analyzer/ik/core/AnalyzeContext.h | 15 ++-
.../analyzer/ik/core/CN_QuantifierSegmenter.h | 1 -
.../analyzer/ik/core/CharacterUtil.cpp | 148 ++++++++-------------
.../analyzer/ik/core/CharacterUtil.h | 6 +-
.../analyzer/ik/core/IKSegmenter.cpp | 1 +
.../inverted_index/analyzer/ik/core/IKSegmenter.h | 1 +
.../analyzer/ik/core/LetterSegmenter.cpp | 52 ++++----
.../analyzer/ik/core/LetterSegmenter.h | 7 +-
.../analyzer/ik/core/SurrogatePairSegmenter.cpp | 37 ++++++
...ntifierSegmenter.h => SurrogatePairSegmenter.h} | 31 ++---
.../inverted_index/analyzer/ik_anayzer_test.cpp | 148 ++++++++++++++++++++-
.../{ => analyzer}/test_ik_analyzer.out | Bin
.../data/inverted_index_p0/test_tokenize.out | Bin 3555 -> 3771 bytes
.../{ => analyzer}/test_ik_analyzer.groovy | 10 +-
.../suites/inverted_index_p0/test_tokenize.groovy | 5 +
18 files changed, 322 insertions(+), 161 deletions(-)
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/IKTokenizer.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/IKTokenizer.cpp
index e1f451804e5..72b906fa4e1 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/IKTokenizer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/IKTokenizer.cpp
@@ -32,13 +32,10 @@ Token* IKTokenizer::next(Token* token) {
}
std::string& token_text = tokens_text_[buffer_index_++];
+ // full-width to half-width, and lowercase
+ // TODO(ryan19929): do regularizeString in fillBuffer.
+ CharacterUtil::regularizeString(token_text, this->lowercase);
size_t size = std::min(token_text.size(),
static_cast<size_t>(LUCENE_MAX_WORD_LEN));
- if (this->lowercase) {
- if (!token_text.empty() && static_cast<uint8_t>(token_text[0]) < 0x80)
{
- std::transform(token_text.begin(), token_text.end(),
token_text.begin(),
- [](char c) { return to_lower(c); });
- }
- }
token->setNoCopy(token_text.data(), 0, size);
return token;
}
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/cfg/Configuration.h
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/cfg/Configuration.h
index a9be1d76220..a0c9c894c5c 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/cfg/Configuration.h
+++
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/cfg/Configuration.h
@@ -25,6 +25,7 @@ namespace doris::segment_v2 {
class Configuration {
private:
bool use_smart_;
+ // TODO(ryan19929): delete config_->lower_case_, because it is always
true(java version is same)
bool enable_lowercase_;
std::string dict_path_;
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp
index d648f8e715a..3356210f20c 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp
+++
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp
@@ -68,7 +68,7 @@ size_t AnalyzeContext::fillBuffer(lucene::util::Reader*
reader) {
int32_t readCount = 0;
if (buffer_offset_ == 0) {
readCount = max(0, reader->readCopy(segment_buff_.data(), 0,
BUFF_SIZE));
- CharacterUtil::decodeStringToRunes(segment_buff_.c_str(),
readCount, typed_runes_,
+ CharacterUtil::decodeStringToRunes(segment_buff_.data(),
readCount, typed_runes_,
config_->isEnableLowercase());
} else {
size_t offset = available_ -
typed_runes_[cursor_].getNextBytePosition();
@@ -82,7 +82,7 @@ size_t AnalyzeContext::fillBuffer(lucene::util::Reader*
reader) {
} else {
readCount = std::max(0, reader->readCopy(segment_buff_.data(),
0, BUFF_SIZE));
}
- CharacterUtil::decodeStringToRunes(segment_buff_.c_str(),
readCount, typed_runes_,
+ CharacterUtil::decodeStringToRunes(segment_buff_.data(),
readCount, typed_runes_,
config_->isEnableLowercase());
}
// Ensure readCount is set to 0 in case of
@@ -172,7 +172,6 @@ bool AnalyzeContext::moveCursor() {
void AnalyzeContext::initCursor() {
cursor_ = 0;
- typed_runes_[cursor_].regularize(config_->isEnableLowercase());
}
bool AnalyzeContext::isBufferConsumed() const {
@@ -199,6 +198,9 @@ void AnalyzeContext::lockBuffer(SegmenterType type) {
case SegmenterType::LETTER_SEGMENTER:
buffer_locker_ |= LETTER_SEGMENTER_FLAG;
break;
+ case SegmenterType::SURROGATE_PAIR_SEGMENTER:
+ buffer_locker_ |= SURROGATE_PAIR_SEGMENTER_FLAG;
+ break;
}
}
@@ -213,6 +215,9 @@ void AnalyzeContext::unlockBuffer(SegmenterType type) {
case SegmenterType::LETTER_SEGMENTER:
buffer_locker_ &= ~LETTER_SEGMENTER_FLAG;
break;
+ case SegmenterType::SURROGATE_PAIR_SEGMENTER:
+ buffer_locker_ &= ~SURROGATE_PAIR_SEGMENTER_FLAG;
+ break;
}
}
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.h
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.h
index b6b363d9d1a..d9e947a713d 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.h
+++
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.h
@@ -41,10 +41,10 @@ private:
static const size_t BUFF_SIZE = 4096;
static const size_t BUFF_EXHAUST_CRITICAL = 100;
- static constexpr uint8_t CJK_SEGMENTER_FLAG = 0x01; // 0001
- static constexpr uint8_t CN_QUANTIFIER_FLAG = 0x02; // 0010
- static constexpr uint8_t LETTER_SEGMENTER_FLAG = 0x04; // 0100
-
+ static constexpr uint8_t CJK_SEGMENTER_FLAG = 0x01; // 0001
+ static constexpr uint8_t CN_QUANTIFIER_FLAG = 0x02; // 0010
+ static constexpr uint8_t LETTER_SEGMENTER_FLAG = 0x04; // 0100
+ static constexpr uint8_t SURROGATE_PAIR_SEGMENTER_FLAG = 0x08; // 1000
// String buffer
std::string segment_buff_;
// An array storing Unicode code points (runes)Character information array
@@ -73,7 +73,12 @@ private:
void compound(Lexeme& lexeme);
public:
- enum class SegmenterType { CJK_SEGMENTER, CN_QUANTIFIER, LETTER_SEGMENTER
};
+ enum class SegmenterType {
+ CJK_SEGMENTER,
+ CN_QUANTIFIER,
+ LETTER_SEGMENTER,
+ SURROGATE_PAIR_SEGMENTER
+ };
const CharacterUtil::TypedRuneArray& getTypedRuneArray() const { return
typed_runes_; }
explicit AnalyzeContext(IKMemoryPool<Cell>& pool,
std::shared_ptr<Configuration> config);
virtual ~AnalyzeContext();
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CN_QuantifierSegmenter.h
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CN_QuantifierSegmenter.h
index 27ccef61a83..a20341d3f3e 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CN_QuantifierSegmenter.h
+++
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CN_QuantifierSegmenter.h
@@ -29,7 +29,6 @@ class CN_QuantifierSegmenter : public ISegmenter {
public:
static constexpr AnalyzeContext::SegmenterType SEGMENTER_TYPE =
AnalyzeContext::SegmenterType::CN_QUANTIFIER;
- static const std::string SEGMENTER_NAME;
static const std::u32string CHINESE_NUMBERS;
static const std::unordered_set<char32_t> CHINESE_NUMBER_CHARS;
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp
index 808edc14039..a991967392c 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp
+++
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp
@@ -20,34 +20,34 @@
namespace doris::segment_v2 {
int32_t CharacterUtil::identifyCharType(int32_t rune) {
- // Numbers
- if (rune >= 0x30 && rune <= 0x39) {
+ if (rune >= '0' && rune <= '9') {
return CHAR_ARABIC;
}
-
- // English
- if ((rune >= 0x61 && rune <= 0x7a) || (rune >= 0x41 && rune <= 0x5a)) {
+ if ((rune >= 'a' && rune <= 'z') || (rune >= 'A' && rune <= 'Z')) {
return CHAR_ENGLISH;
}
- // CJK Unified Chinese Characters
- if ((rune >= 0x4E00 && rune <= 0x9FFF) || (rune >= 0x3400 && rune <=
0x4DBF) ||
- (rune >= 0x20000 && rune <= 0x2A6DF) || (rune >= 0x2A700 && rune <=
0x2B73F) ||
- (rune >= 0x2B740 && rune <= 0x2B81F) || (rune >= 0x2B820 && rune <=
0x2CEAF) ||
- (rune >= 0x2CEB0 && rune <= 0x2EBEF) || (rune >= 0x30000 && rune <=
0x3134F)) {
+ UBlockCode block = ublock_getCode(rune);
+
+ if (block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS || block ==
UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS ||
+ block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A ||
+ block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B ||
+ block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C ||
+ block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D ||
+ block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E ||
+ block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F ||
+ block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_G) {
return CHAR_CHINESE;
}
- // Japanese and Korean characters
- if ((rune >= 0x3040 && rune <= 0x309F) || (rune >= 0x30A0 && rune <=
0x30FF) ||
- (rune >= 0x31F0 && rune <= 0x31FF) || (rune >= 0xAC00 && rune <=
0xD7AF) ||
- (rune >= 0x1100 && rune <= 0x11FF)) {
+ if (block == UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS || block ==
UBLOCK_HANGUL_SYLLABLES ||
+ block == UBLOCK_HANGUL_JAMO || block ==
UBLOCK_HANGUL_COMPATIBILITY_JAMO ||
+ block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA ||
+ block == UBLOCK_KATAKANA_PHONETIC_EXTENSIONS) {
return CHAR_OTHER_CJK;
}
- // UTF-16 surrogate pairs and private zone
- if ((rune >= 0xD800 && rune <= 0xDBFF) || (rune >= 0xDC00 && rune <=
0xDFFF) ||
- (rune >= 0xE000 && rune <= 0xF8FF)) {
+ if (rune > 0xFFFF) {
return CHAR_SURROGATE;
}
@@ -55,42 +55,21 @@ int32_t CharacterUtil::identifyCharType(int32_t rune) {
}
int32_t CharacterUtil::regularize(int32_t rune, bool use_lowercase) {
- // Full-width to half-width
if (rune == 0x3000) {
- return 0x0020; // Convert full-width space to half-width
- }
-
- // Full-width numbers
- if (rune >= 0xFF10 && rune <= 0xFF19) {
- return rune - 0xFEE0; // Convert to half-width numbers
- }
-
- // Full-width letters
- if (rune >= 0xFF21 && rune <= 0xFF3A) {
+ return 0x0020;
+ } else if (rune > 0xFF00 && rune < 0xFF5F) {
rune = rune - 0xFEE0;
- if (use_lowercase) {
- rune += 32; // Convert to lowercase
- }
- return rune;
- }
- if (rune >= 0xFF41 && rune <= 0xFF5A) {
- return rune - 0xFEE0;
- }
-
- // Convert half-width uppercase letters to lowercase
- if (use_lowercase && rune >= 0x41 && rune <= 0x5A) {
- return rune + 32;
+ } else if (use_lowercase && rune >= 0x41 && rune <= 0x5A) {
+ // This else-if causes full-width letters unable to be converted to
lowercase
+ rune += 32;
}
return rune;
}
void CharacterUtil::TypedRune::regularize(bool use_lowercase) {
- CharacterUtil::regularizeCharInfo(*this, use_lowercase);
-}
-
-void CharacterUtil::regularizeCharInfo(TypedRune& typedRune, bool
use_lowercase) {
- typedRune.rune = regularize(typedRune.rune, use_lowercase);
+ this->rune = CharacterUtil::regularize(this->rune, use_lowercase);
+ this->char_type = CharacterUtil::identifyCharType(this->rune);
}
CharacterUtil::RuneStrLite CharacterUtil::decodeChar(const char* str, size_t
length) {
@@ -101,7 +80,7 @@ bool CharacterUtil::decodeString(const char* str, size_t
length, RuneStrArray& r
return cppjieba::DecodeRunesInString(str, length, runes);
}
-void CharacterUtil::decodeStringToRunes(const char* str, size_t length,
TypedRuneArray& typed_runes,
+void CharacterUtil::decodeStringToRunes(char* str, size_t length,
TypedRuneArray& typed_runes,
bool use_lowercase) {
typed_runes.clear();
size_t byte_pos = 0;
@@ -111,59 +90,48 @@ void CharacterUtil::decodeStringToRunes(const char* str,
size_t length, TypedRun
if (runeStr.len == 0) {
break;
}
+ if (runeStr.len == 1 && use_lowercase && str[byte_pos] >= 'A' &&
str[byte_pos] <= 'Z') {
+ str[byte_pos] += 32;
+ }
typed_runes.emplace_back(runeStr.rune, byte_pos, runeStr.len,
typed_runes.size(), 1);
- if (use_lowercase) {
- typed_runes.back().regularize(true);
- }
+ typed_runes.back().regularize(use_lowercase);
+
byte_pos += runeStr.len;
}
}
-// TODO: Maybe delete this function
-size_t CharacterUtil::adjustToCompleteChar(const char* buffer, size_t
buffer_length) {
- if (buffer_length == 0) return 0;
-
- unsigned char last_byte = buffer[buffer_length - 1];
-
- if (last_byte < 0x80) {
- return buffer_length;
- }
-
- if ((last_byte & 0xC0) == 0x80) {
- size_t adjustedLen = buffer_length - 1;
- while (adjustedLen > 0) {
- unsigned char byte = buffer[adjustedLen - 1];
- if ((byte & 0xC0) != 0x80) {
- int charLen = 0;
- if ((byte & 0xE0) == 0xC0)
- charLen = 2;
- else if ((byte & 0xF0) == 0xE0)
- charLen = 3;
- else if ((byte & 0xF8) == 0xF0)
- charLen = 4;
- if (buffer_length - adjustedLen + 1 < charLen) {
- return adjustedLen - 1;
+void CharacterUtil::regularizeString(std::string& input, bool use_lowercase) {
+ std::string temp;
+ size_t len = input.size();
+ temp.reserve(len);
+ for (size_t i = 0; i < len;) {
+ unsigned char c = input[i];
+ if ((c & 0xF0) == 0xE0 && i + 2 < len) {
+ int rune = ((c & 0x0F) << 12) | ((input[i + 1] & 0x3F) << 6) |
(input[i + 2] & 0x3F);
+ if (rune == 0x3000) {
+ temp += ' ';
+ } else if (rune >= 0xFF01 && rune <= 0xFF5E) {
+ char half = static_cast<char>(rune - 0xFEE0);
+ if (use_lowercase && half >= 'A' && half <= 'Z') {
+ half += 32;
}
- return buffer_length;
+ temp += half;
+ } else {
+ temp += input[i];
+ temp += input[i + 1];
+ temp += input[i + 2];
+ }
+ i += 3;
+ } else {
+ char ch = input[i];
+ if (use_lowercase && ch >= 'A' && ch <= 'Z') {
+ ch += 32;
}
- adjustedLen--;
+ temp += ch;
+ i += 1;
}
- return 0;
- }
-
- int charLen = 0;
- if ((last_byte & 0xE0) == 0xC0)
- charLen = 2;
- else if ((last_byte & 0xF0) == 0xE0)
- charLen = 3;
- else if ((last_byte & 0xF8) == 0xF0)
- charLen = 4;
-
- if (charLen > 1) {
- return buffer_length - 1;
}
-
- return buffer_length;
+ input = std::move(temp);
}
} // namespace doris::segment_v2
\ No newline at end of file
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h
index 2f3dac6a520..c60f8bb30ce 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h
+++
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h
@@ -17,6 +17,8 @@
#pragma once
+#include <unicode/uchar.h>
+
#include <functional>
#include <memory>
#include <vector>
@@ -70,7 +72,7 @@ public:
static int32_t identifyCharType(int32_t rune);
- static void decodeStringToRunes(const char* str, size_t length,
TypedRuneArray& typed_runes,
+ static void decodeStringToRunes(char* str, size_t length, TypedRuneArray&
typed_runes,
bool use_lowercase);
static int32_t regularize(int32_t rune, bool use_lowercase);
@@ -80,7 +82,7 @@ public:
static void regularizeCharInfo(TypedRune& type_rune, bool use_lowercase);
- static size_t adjustToCompleteChar(const char* buffer, size_t
buffer_length);
+ static void regularizeString(std::string& input, bool use_lowercase =
true);
};
} // namespace doris::segment_v2
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.cpp
index 6c1d049ac70..674f22dfd2d 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.cpp
+++
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.cpp
@@ -33,6 +33,7 @@ std::vector<std::unique_ptr<ISegmenter>>
IKSegmenter::loadSegmenters() {
segmenters.push_back(std::make_unique<LetterSegmenter>());
segmenters.push_back(std::make_unique<CN_QuantifierSegmenter>());
segmenters.push_back(std::make_unique<CJKSegmenter>());
+ segmenters.push_back(std::make_unique<SurrogatePairSegmenter>());
return segmenters;
}
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.h
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.h
index 4f94fa435db..33defbbe31a 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.h
+++
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.h
@@ -27,6 +27,7 @@
#include "IKArbitrator.h"
#include "ISegmenter.h"
#include "LetterSegmenter.h"
+#include "SurrogatePairSegmenter.h"
#include
"olap/rowset/segment_v2/inverted_index/analyzer/ik/cfg/Configuration.h"
namespace doris::segment_v2 {
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.cpp
index 2b631399132..c593a1ec63d 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.cpp
+++
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.cpp
@@ -53,7 +53,6 @@ void LetterSegmenter::reset() {
bool LetterSegmenter::processEnglishLetter(AnalyzeContext& context) {
bool need_lock = false;
- const auto& typed_runes = context.getTypedRuneArray();
if (english_start_ == -1) {
// The current tokenizer has not yet started processing English
characters
if (context.getCurrentCharType() == CharacterUtil::CHAR_ENGLISH) {
@@ -68,9 +67,8 @@ bool LetterSegmenter::processEnglishLetter(AnalyzeContext&
context) {
english_end_ = context.getCursor();
} else {
// Encounter non-English characters, output tokens
- Lexeme newLexeme(context.getBufferOffset(),
typed_runes[english_start_].offset,
- english_end_ - english_start_ + 1,
Lexeme::Type::English,
- english_start_, english_end_);
+ Lexeme newLexeme =
+ createLexeme(context, english_start_, english_end_,
Lexeme::Type::English);
context.addLexeme(newLexeme);
english_start_ = -1;
english_end_ = -1;
@@ -78,9 +76,8 @@ bool LetterSegmenter::processEnglishLetter(AnalyzeContext&
context) {
}
if (context.isBufferConsumed() && (english_start_ != -1 && english_end_ !=
-1)) {
- Lexeme newLexeme(context.getBufferOffset(),
typed_runes[english_start_].offset,
- english_end_ - english_start_ + 1,
Lexeme::Type::English, english_start_,
- english_end_);
+ Lexeme newLexeme =
+ createLexeme(context, english_start_, english_end_,
Lexeme::Type::English);
context.addLexeme(newLexeme);
english_start_ = -1;
english_end_ = -1;
@@ -96,7 +93,6 @@ bool LetterSegmenter::processEnglishLetter(AnalyzeContext&
context) {
bool LetterSegmenter::processArabicLetter(AnalyzeContext& context) {
bool need_lock = false;
- const auto& typed_runes = context.getTypedRuneArray();
if (arabic_start_ == -1) {
// The current tokenizer has not yet started processing numeric
characters
@@ -115,9 +111,8 @@ bool LetterSegmenter::processArabicLetter(AnalyzeContext&
context) {
// Do not output numbers, but do not mark the end
} else {
// Encounter non-Arabic characters, output tokens
- Lexeme newLexeme(context.getBufferOffset(),
typed_runes[arabic_start_].offset,
- arabic_end_ - arabic_start_ + 1,
Lexeme::Type::Arabic, arabic_start_,
- arabic_end_);
+ Lexeme newLexeme =
+ createLexeme(context, arabic_start_, arabic_end_,
Lexeme::Type::Arabic);
context.addLexeme(newLexeme);
arabic_start_ = -1;
arabic_end_ = -1;
@@ -125,9 +120,7 @@ bool LetterSegmenter::processArabicLetter(AnalyzeContext&
context) {
}
if (context.isBufferConsumed() && (arabic_start_ != -1 && arabic_end_ !=
-1)) {
- Lexeme newLexeme(context.getBufferOffset(),
typed_runes[arabic_start_].offset,
- arabic_end_ - arabic_start_ + 1,
Lexeme::Type::Arabic, arabic_start_,
- arabic_end_);
+ Lexeme newLexeme = createLexeme(context, arabic_start_, arabic_end_,
Lexeme::Type::Arabic);
context.addLexeme(newLexeme);
arabic_start_ = -1;
arabic_end_ = -1;
@@ -143,7 +136,6 @@ bool LetterSegmenter::processArabicLetter(AnalyzeContext&
context) {
bool LetterSegmenter::processMixLetter(AnalyzeContext& context) {
bool need_lock = false;
- const auto& typed_runes = context.getTypedRuneArray();
if (start_ == -1) {
// The current tokenizer has not yet started processing characters.
@@ -164,8 +156,7 @@ bool LetterSegmenter::processMixLetter(AnalyzeContext&
context) {
end_ = context.getCursor();
} else {
// Encounter non-letter characters, output a token
- Lexeme newLexeme(context.getBufferOffset(),
typed_runes[start_].offset,
- end_ - start_ + 1, Lexeme::Type::Letter, start_,
end_);
+ Lexeme newLexeme = createLexeme(context, start_, end_,
Lexeme::Type::Letter);
context.addLexeme(newLexeme);
start_ = -1;
end_ = -1;
@@ -173,8 +164,7 @@ bool LetterSegmenter::processMixLetter(AnalyzeContext&
context) {
}
if (context.isBufferConsumed() && (start_ != -1 && end_ != -1)) {
- Lexeme newLexeme(context.getBufferOffset(),
typed_runes[start_].offset, end_ - start_ + 1,
- Lexeme::Type::Letter, start_, end_);
+ Lexeme newLexeme = createLexeme(context, start_, end_,
Lexeme::Type::Letter);
context.addLexeme(newLexeme);
start_ = -1;
end_ = -1;
@@ -184,11 +174,27 @@ bool LetterSegmenter::processMixLetter(AnalyzeContext&
context) {
return need_lock;
}
-bool LetterSegmenter::isLetterConnector(char input) {
- return std::binary_search(std::begin(letter_connectors_),
std::end(letter_connectors_), input);
+bool LetterSegmenter::isLetterConnector(int32_t input) {
+ if (input < 128) {
+ return std::binary_search(std::begin(letter_connectors_),
std::end(letter_connectors_),
+ static_cast<char>(input));
+ }
+ return false;
+}
+
+bool LetterSegmenter::isNumConnector(int32_t input) {
+ if (input < 128) {
+ return std::binary_search(std::begin(num_connectors_),
std::end(num_connectors_),
+ static_cast<char>(input));
+ }
+ return false;
}
-bool LetterSegmenter::isNumConnector(char input) {
- return std::binary_search(std::begin(num_connectors_),
std::end(num_connectors_), input);
+Lexeme LetterSegmenter::createLexeme(AnalyzeContext& context, int start, int
end,
+ Lexeme::Type type) {
+ const auto& typed_runes = context.getTypedRuneArray();
+ return Lexeme(context.getBufferOffset(),
typed_runes[start].getBytePosition(),
+ typed_runes[end].getNextBytePosition() -
typed_runes[start].getBytePosition(),
+ type, start, end);
}
} // namespace doris::segment_v2
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.h
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.h
index 87b36e83fbf..70dc6b4988f 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.h
+++
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.h
@@ -30,7 +30,6 @@ class LetterSegmenter : public ISegmenter {
public:
static constexpr AnalyzeContext::SegmenterType SEGMENTER_TYPE =
AnalyzeContext::SegmenterType::LETTER_SEGMENTER;
- static const std::string SEGMENTER_NAME;
LetterSegmenter();
~LetterSegmenter() override = default;
@@ -41,8 +40,10 @@ private:
bool processEnglishLetter(AnalyzeContext& context);
bool processArabicLetter(AnalyzeContext& context);
bool processMixLetter(AnalyzeContext& context);
- bool isLetterConnector(char input);
- bool isNumConnector(char input);
+ bool isLetterConnector(int32_t input);
+ bool isNumConnector(int32_t input);
+
+ Lexeme createLexeme(AnalyzeContext& context, int start, int end,
Lexeme::Type type);
int start_ {-1};
int end_ {-1};
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.cpp
new file mode 100644
index 00000000000..0aea370a502
--- /dev/null
+++
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.cpp
@@ -0,0 +1,37 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "SurrogatePairSegmenter.h"
+
+namespace doris::segment_v2 {
+
+void SurrogatePairSegmenter::analyze(AnalyzeContext& context) {
+ const auto& current_char_type = context.getCurrentCharType();
+
+ if (current_char_type == CharacterUtil::CHAR_SURROGATE) {
+ Lexeme newLexeme(context.getBufferOffset(),
context.getCurrentCharOffset(),
+ context.getCurrentCharLen(), Lexeme::Type::CNChar,
context.getCursor(),
+ context.getCursor());
+ context.addLexeme(newLexeme);
+ }
+
+ context.unlockBuffer(SEGMENTER_TYPE);
+}
+
+void SurrogatePairSegmenter::reset() {}
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CN_QuantifierSegmenter.h
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.h
similarity index 59%
copy from
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CN_QuantifierSegmenter.h
copy to
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.h
index 27ccef61a83..bad22658b51 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CN_QuantifierSegmenter.h
+++
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.h
@@ -17,36 +17,23 @@
#pragma once
-#include <memory>
-#include <unordered_set>
-#include <vector>
-
#include "AnalyzeContext.h"
+#include "CharacterUtil.h"
#include "ISegmenter.h"
+#include "Lexeme.h"
+
namespace doris::segment_v2 {
-class CN_QuantifierSegmenter : public ISegmenter {
+class SurrogatePairSegmenter : public ISegmenter {
public:
static constexpr AnalyzeContext::SegmenterType SEGMENTER_TYPE =
- AnalyzeContext::SegmenterType::CN_QUANTIFIER;
- static const std::string SEGMENTER_NAME;
- static const std::u32string CHINESE_NUMBERS;
- static const std::unordered_set<char32_t> CHINESE_NUMBER_CHARS;
+ AnalyzeContext::SegmenterType::SURROGATE_PAIR_SEGMENTER;
- CN_QuantifierSegmenter();
- ~CN_QuantifierSegmenter() override = default;
+ SurrogatePairSegmenter() = default;
+ ~SurrogatePairSegmenter() override = default;
void analyze(AnalyzeContext& context) override;
void reset() override;
-
-private:
- void processCNumber(AnalyzeContext& context);
- void processCount(AnalyzeContext& context);
- bool needCountScan(AnalyzeContext& context);
- void outputNumLexeme(AnalyzeContext& context);
-
- int number_start_;
- int number_end_;
- std::vector<Hit> count_hits_;
};
-} // namespace doris::segment_v2
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git
a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/ik_anayzer_test.cpp
b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/ik_anayzer_test.cpp
index 18dc16b6925..690499830f5 100644
--- a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/ik_anayzer_test.cpp
+++ b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/ik_anayzer_test.cpp
@@ -264,7 +264,11 @@ TEST_F(IKTokenizerTest, TestSpecialCharacters) {
// Test with special characters
std::string specialText = "😊🚀👍测试特殊符号:@#¥%……&*()";
tokenize(specialText, datas, true);
- ASSERT_EQ(datas.size(), 2);
+ ASSERT_EQ(datas.size(), 5);
+ std::vector<std::string> expectedTokens = {"😊", "🚀", "👍", "测试", "特殊符号"};
+ for (size_t i = 0; i < datas.size(); i++) {
+ ASSERT_EQ(datas[i], expectedTokens[i]);
+ }
}
TEST_F(IKTokenizerTest, TestBufferBoundaryWithSpace) {
@@ -428,6 +432,148 @@ TEST_F(IKTokenizerTest, TestLongTextCompareWithJava) {
}
}
+TEST_F(IKTokenizerTest, TestFullWidthCharacters) {
+ std::vector<std::string> datas;
+
+ // test full width numbers
+ std::string fullWidthNumbersText = "4 3 2";
+ tokenize(fullWidthNumbersText, datas, true);
+ std::vector<std::string> expectedNumbers = {"4", "3", "2"}; // half width
numbers
+ ASSERT_EQ(datas.size(), expectedNumbers.size());
+ for (size_t i = 0; i < datas.size(); i++) {
+ ASSERT_EQ(datas[i], expectedNumbers[i]);
+ }
+ datas.clear();
+
+ fullWidthNumbersText = "432";
+ tokenize(fullWidthNumbersText, datas, false);
+ expectedNumbers = {"432"};
+ ASSERT_EQ(datas.size(), expectedNumbers.size());
+ for (size_t i = 0; i < datas.size(); i++) {
+ ASSERT_EQ(datas[i], expectedNumbers[i]);
+ }
+ datas.clear();
+
+ // test full width currency symbol
+ std::string currencyText = "¥";
+ tokenize(currencyText, datas, false);
+ ASSERT_EQ(datas.size(), 1);
+ ASSERT_EQ(datas[0], "¥");
+ datas.clear();
+
+ // test full width symbol in word
+ std::string mixedText = "High&Low";
+ tokenize(mixedText, datas, false);
+ std::vector<std::string> expectedMixed = {"high&low", "high", "low"};
+ ASSERT_EQ(datas.size(), expectedMixed.size());
+ for (size_t i = 0; i < datas.size(); i++) {
+ ASSERT_EQ(datas[i], expectedMixed[i]);
+ }
+ datas.clear();
+
+ // test special separator
+ std::string specialSeparatorText = "1・2";
+ tokenize(specialSeparatorText, datas, false);
+ std::vector<std::string> expectedSeparator = {"1", "・", "2"};
+ ASSERT_EQ(datas.size(), expectedSeparator.size());
+ for (size_t i = 0; i < datas.size(); i++) {
+ ASSERT_EQ(datas[i], expectedSeparator[i]);
+ }
+ datas.clear();
+
+ // test special character
+ std::string specialCharText = "﨑";
+ tokenize(specialCharText, datas, false);
+ ASSERT_EQ(datas.size(), 1);
+ ASSERT_EQ(datas[0], "﨑");
+ datas.clear();
+}
+
+TEST_F(IKTokenizerTest, TestEmojiAndSpecialCharacters) {
+ std::vector<std::string> datas;
+
+ // test emoji
+ std::string emojiText = "🐼";
+ tokenize(emojiText, datas, false);
+ ASSERT_EQ(datas.size(), 1);
+ ASSERT_EQ(datas[0], "🐼");
+ datas.clear();
+
+ std::string emojiText2 = "🝢";
+ tokenize(emojiText2, datas, false);
+ ASSERT_EQ(datas.size(), 1);
+ ASSERT_EQ(datas[0], "🝢");
+ datas.clear();
+
+ // test special latin character
+ std::string specialLatinText1 = "abcşabc";
+ tokenize(specialLatinText1, datas, false);
+ ASSERT_EQ(datas.size(), 2);
+ ASSERT_EQ(datas[0], "abc");
+ ASSERT_EQ(datas[1], "abc");
+ datas.clear();
+
+ std::string specialLatinText2 = "abcīabc";
+ tokenize(specialLatinText2, datas, false);
+ ASSERT_EQ(datas.size(), 2);
+ ASSERT_EQ(datas[0], "abc");
+ ASSERT_EQ(datas[1], "abc");
+ datas.clear();
+
+ std::string specialLatinText3 = "celebrity…get";
+ tokenize(specialLatinText3, datas, false);
+ std::vector<std::string> expectedEllipsis = {"celebrity", "get"};
+ ASSERT_EQ(datas.size(), expectedEllipsis.size());
+ for (size_t i = 0; i < datas.size(); i++) {
+ ASSERT_EQ(datas[i], expectedEllipsis[i]);
+ }
+ datas.clear();
+
+ // test mixed alphabet word
+ std::string mixedAlphabetText1 = "Hulyaiрole";
+ tokenize(mixedAlphabetText1, datas, false);
+ ASSERT_EQ(datas.size(), 2);
+ ASSERT_EQ(datas[0], "hulyai");
+ ASSERT_EQ(datas[1], "ole");
+ datas.clear();
+
+ std::string mixedAlphabetText2 = "Nisa Aşgabat";
+ tokenize(mixedAlphabetText2, datas, false);
+ std::vector<std::string> expectedName = {"nisa", "gabat"};
+ ASSERT_EQ(datas.size(), expectedName.size());
+ for (size_t i = 0; i < datas.size(); i++) {
+ ASSERT_EQ(datas[i], expectedName[i]);
+ }
+ datas.clear();
+
+ // test special connector
+ std::string specialConnectorText = "alـameer";
+ tokenize(specialConnectorText, datas, false);
+ ASSERT_EQ(datas.size(), 2);
+ ASSERT_EQ(datas[0], "al");
+ ASSERT_EQ(datas[1], "ameer");
+ datas.clear();
+
+ // test rare unicode character
+ std::string rareUnicodeText1 = "𐓚";
+ tokenize(rareUnicodeText1, datas, false);
+ ASSERT_EQ(datas.size(), 1);
+ ASSERT_EQ(datas[0], "𐓚");
+ datas.clear();
+
+ std::string rareUnicodeText2 = "𑪱";
+ tokenize(rareUnicodeText2, datas, false);
+ ASSERT_EQ(datas.size(), 1);
+ ASSERT_EQ(datas[0], "𑪱");
+ datas.clear();
+
+ std::string rareUnicodeText3 = "𐴗";
+ tokenize(rareUnicodeText3, datas, false);
+ ASSERT_EQ(datas.size(), 1);
+ ASSERT_EQ(datas[0], "𐴗");
+ datas.clear();
+}
+
// Test the exception handling capabilities of the IKTokenizer and
AnalyzeContext
TEST_F(IKTokenizerTest, TestExceptionHandling) {
// Common mock reader class for testing exception handling
diff --git a/regression-test/data/inverted_index_p0/test_ik_analyzer.out
b/regression-test/data/inverted_index_p0/analyzer/test_ik_analyzer.out
similarity index 100%
rename from regression-test/data/inverted_index_p0/test_ik_analyzer.out
rename to regression-test/data/inverted_index_p0/analyzer/test_ik_analyzer.out
diff --git a/regression-test/data/inverted_index_p0/test_tokenize.out
b/regression-test/data/inverted_index_p0/test_tokenize.out
index 32e7968cb8b..68f030b7276 100644
Binary files a/regression-test/data/inverted_index_p0/test_tokenize.out and
b/regression-test/data/inverted_index_p0/test_tokenize.out differ
diff --git a/regression-test/suites/inverted_index_p0/test_ik_analyzer.groovy
b/regression-test/suites/inverted_index_p0/analyzer/test_ik_analyzer.groovy
similarity index 95%
rename from regression-test/suites/inverted_index_p0/test_ik_analyzer.groovy
rename to
regression-test/suites/inverted_index_p0/analyzer/test_ik_analyzer.groovy
index c28aa68920b..b57d40a4a00 100644
--- a/regression-test/suites/inverted_index_p0/test_ik_analyzer.groovy
+++ b/regression-test/suites/inverted_index_p0/analyzer/test_ik_analyzer.groovy
@@ -22,7 +22,7 @@ suite("test_ik_analyzer", "p0") {
sql "DROP TABLE IF EXISTS ${tableNameSmart}"
sql "DROP TABLE IF EXISTS ${tableNameMaxWord}"
- // 创建smart模式测试表
+ // Create test table for smart mode
sql """
CREATE TABLE ${tableNameSmart} (
`id` int(11) NULL COMMENT "",
@@ -37,7 +37,7 @@ suite("test_ik_analyzer", "p0") {
);
"""
- // 创建max_word模式测试表
+ // Create test table for max_word mode
sql """
CREATE TABLE ${tableNameMaxWord} (
`id` int(11) NULL COMMENT "",
@@ -52,7 +52,7 @@ suite("test_ik_analyzer", "p0") {
);
"""
- // 插入测试数据
+ // Insert test data
def insertData = { table ->
sql """ INSERT INTO ${table} VALUES (1, "我爱北京天安门"); """
sql """ INSERT INTO ${table} VALUES (2, "Apache Doris是一个现代化的MPP数据库");
"""
@@ -68,14 +68,14 @@ suite("test_ik_analyzer", "p0") {
sql "sync"
sql """ set enable_common_expr_pushdown = true; """
- // 测试smart模式
+ // Testing ik smart mode
println "Testing ik smart mode:"
qt_sql """ select * from ${tableNameSmart} where content match_phrase
'北京'; """
qt_sql """ select * from ${tableNameSmart} where content match_phrase
'计算机科学'; """
qt_sql """ select * from ${tableNameSmart} where content match_phrase
'数据库管理系统'; """
qt_sql """ select * from ${tableNameSmart} where content match_phrase
'中华人民共和国'; """
- // 测试max_word模式
+ // Testing ik max_word mode
println "Testing ik max_word mode:"
qt_sql """ select * from ${tableNameMaxWord} where content
match_phrase '北京'; """
qt_sql """ select * from ${tableNameMaxWord} where content
match_phrase '计算机科学'; """
diff --git a/regression-test/suites/inverted_index_p0/test_tokenize.groovy
b/regression-test/suites/inverted_index_p0/test_tokenize.groovy
index f8066e6ad86..d0bdada2e31 100644
--- a/regression-test/suites/inverted_index_p0/test_tokenize.groovy
+++ b/regression-test/suites/inverted_index_p0/test_tokenize.groovy
@@ -123,4 +123,9 @@ suite("test_tokenize"){
qt_tokenize_sql """SELECT TOKENIZE('北京大学计算机科学与技术系',
'"parser"="ik","parser_mode"="ik_max_word"');"""
qt_tokenize_sql """SELECT TOKENIZE('中华人民共和国',
'"parser"="ik","parser_mode"="ik_max_word"');"""
+ qt_tokenize_sql """SELECT TOKENIZE('😊🚀👍测试特殊符号:@#¥%……&*()',
'"parser"="ik","parser_mode"="ik_max_word"');"""
+ qt_tokenize_sql """SELECT TOKENIZE('High&Low',
'"parser"="ik","parser_mode"="ik_max_word"');"""
+ qt_tokenize_sql """SELECT TOKENIZE('1・2',
'"parser"="ik","parser_mode"="ik_max_word"');"""
+ qt_tokenize_sql """SELECT TOKENIZE('abcşīabc',
'"parser"="ik","parser_mode"="ik_max_word"');"""
+
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]