This is an automated email from the ASF dual-hosted git repository.

airborne pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new d810f78efb1 [fix](inverted_index) fix tokenization issues for some 
characters in ik analyzer (#50141)
d810f78efb1 is described below

commit d810f78efb1b5606203ca41318aacbf12891aa60
Author: Ryan19929 <[email protected]>
AuthorDate: Thu Jun 19 14:44:26 2025 +0800

    [fix](inverted_index) fix tokenization issues for some characters in ik 
analyzer (#50141)
    
    Problem Summary:
    This PR fixes IK Analyzer's incorrect handling of full-width
    characters and adds support for Emoji and rare-character tokenization,
    making its behavior consistent with Elasticsearch's IK analyzer.
    
    Co-authored-by: Ryan19929 <[email protected]>
---
 .../inverted_index/analyzer/ik/IKTokenizer.cpp     |   9 +-
 .../inverted_index/analyzer/ik/cfg/Configuration.h |   1 +
 .../analyzer/ik/core/AnalyzeContext.cpp            |  11 +-
 .../analyzer/ik/core/AnalyzeContext.h              |  15 ++-
 .../analyzer/ik/core/CN_QuantifierSegmenter.h      |   1 -
 .../analyzer/ik/core/CharacterUtil.cpp             | 148 ++++++++-------------
 .../analyzer/ik/core/CharacterUtil.h               |   6 +-
 .../analyzer/ik/core/IKSegmenter.cpp               |   1 +
 .../inverted_index/analyzer/ik/core/IKSegmenter.h  |   1 +
 .../analyzer/ik/core/LetterSegmenter.cpp           |  52 ++++----
 .../analyzer/ik/core/LetterSegmenter.h             |   7 +-
 .../analyzer/ik/core/SurrogatePairSegmenter.cpp    |  37 ++++++
 ...ntifierSegmenter.h => SurrogatePairSegmenter.h} |  31 ++---
 .../inverted_index/analyzer/ik_anayzer_test.cpp    | 148 ++++++++++++++++++++-
 .../{ => analyzer}/test_ik_analyzer.out            | Bin
 .../data/inverted_index_p0/test_tokenize.out       | Bin 3555 -> 3771 bytes
 .../{ => analyzer}/test_ik_analyzer.groovy         |  10 +-
 .../suites/inverted_index_p0/test_tokenize.groovy  |   5 +
 18 files changed, 322 insertions(+), 161 deletions(-)

diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/IKTokenizer.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/IKTokenizer.cpp
index e1f451804e5..72b906fa4e1 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/IKTokenizer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/IKTokenizer.cpp
@@ -32,13 +32,10 @@ Token* IKTokenizer::next(Token* token) {
     }
 
     std::string& token_text = tokens_text_[buffer_index_++];
+    // full-width to half-width, and lowercase
+    // TODO(ryan19929): do regularizeString in fillBuffer.
+    CharacterUtil::regularizeString(token_text, this->lowercase);
     size_t size = std::min(token_text.size(), 
static_cast<size_t>(LUCENE_MAX_WORD_LEN));
-    if (this->lowercase) {
-        if (!token_text.empty() && static_cast<uint8_t>(token_text[0]) < 0x80) 
{
-            std::transform(token_text.begin(), token_text.end(), 
token_text.begin(),
-                           [](char c) { return to_lower(c); });
-        }
-    }
     token->setNoCopy(token_text.data(), 0, size);
     return token;
 }
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/cfg/Configuration.h 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/cfg/Configuration.h
index a9be1d76220..a0c9c894c5c 100644
--- 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/cfg/Configuration.h
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/cfg/Configuration.h
@@ -25,6 +25,7 @@ namespace doris::segment_v2 {
 class Configuration {
 private:
     bool use_smart_;
+    // TODO(ryan19929): delete config_->lower_case_, because it is always 
true(java version is same)
     bool enable_lowercase_;
     std::string dict_path_;
 
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp
 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp
index d648f8e715a..3356210f20c 100644
--- 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp
@@ -68,7 +68,7 @@ size_t AnalyzeContext::fillBuffer(lucene::util::Reader* 
reader) {
         int32_t readCount = 0;
         if (buffer_offset_ == 0) {
             readCount = max(0, reader->readCopy(segment_buff_.data(), 0, 
BUFF_SIZE));
-            CharacterUtil::decodeStringToRunes(segment_buff_.c_str(), 
readCount, typed_runes_,
+            CharacterUtil::decodeStringToRunes(segment_buff_.data(), 
readCount, typed_runes_,
                                                config_->isEnableLowercase());
         } else {
             size_t offset = available_ - 
typed_runes_[cursor_].getNextBytePosition();
@@ -82,7 +82,7 @@ size_t AnalyzeContext::fillBuffer(lucene::util::Reader* 
reader) {
             } else {
                 readCount = std::max(0, reader->readCopy(segment_buff_.data(), 
0, BUFF_SIZE));
             }
-            CharacterUtil::decodeStringToRunes(segment_buff_.c_str(), 
readCount, typed_runes_,
+            CharacterUtil::decodeStringToRunes(segment_buff_.data(), 
readCount, typed_runes_,
                                                config_->isEnableLowercase());
         }
         // Ensure readCount is set to 0 in case of
@@ -172,7 +172,6 @@ bool AnalyzeContext::moveCursor() {
 
 void AnalyzeContext::initCursor() {
     cursor_ = 0;
-    typed_runes_[cursor_].regularize(config_->isEnableLowercase());
 }
 
 bool AnalyzeContext::isBufferConsumed() const {
@@ -199,6 +198,9 @@ void AnalyzeContext::lockBuffer(SegmenterType type) {
     case SegmenterType::LETTER_SEGMENTER:
         buffer_locker_ |= LETTER_SEGMENTER_FLAG;
         break;
+    case SegmenterType::SURROGATE_PAIR_SEGMENTER:
+        buffer_locker_ |= SURROGATE_PAIR_SEGMENTER_FLAG;
+        break;
     }
 }
 
@@ -213,6 +215,9 @@ void AnalyzeContext::unlockBuffer(SegmenterType type) {
     case SegmenterType::LETTER_SEGMENTER:
         buffer_locker_ &= ~LETTER_SEGMENTER_FLAG;
         break;
+    case SegmenterType::SURROGATE_PAIR_SEGMENTER:
+        buffer_locker_ &= ~SURROGATE_PAIR_SEGMENTER_FLAG;
+        break;
     }
 }
 
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.h
 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.h
index b6b363d9d1a..d9e947a713d 100644
--- 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.h
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.h
@@ -41,10 +41,10 @@ private:
     static const size_t BUFF_SIZE = 4096;
     static const size_t BUFF_EXHAUST_CRITICAL = 100;
 
-    static constexpr uint8_t CJK_SEGMENTER_FLAG = 0x01;    // 0001
-    static constexpr uint8_t CN_QUANTIFIER_FLAG = 0x02;    // 0010
-    static constexpr uint8_t LETTER_SEGMENTER_FLAG = 0x04; // 0100
-
+    static constexpr uint8_t CJK_SEGMENTER_FLAG = 0x01;            // 0001
+    static constexpr uint8_t CN_QUANTIFIER_FLAG = 0x02;            // 0010
+    static constexpr uint8_t LETTER_SEGMENTER_FLAG = 0x04;         // 0100
+    static constexpr uint8_t SURROGATE_PAIR_SEGMENTER_FLAG = 0x08; // 1000
     // String buffer
     std::string segment_buff_;
     // An array storing Unicode code points (runes)Character information array
@@ -73,7 +73,12 @@ private:
     void compound(Lexeme& lexeme);
 
 public:
-    enum class SegmenterType { CJK_SEGMENTER, CN_QUANTIFIER, LETTER_SEGMENTER 
};
+    enum class SegmenterType {
+        CJK_SEGMENTER,
+        CN_QUANTIFIER,
+        LETTER_SEGMENTER,
+        SURROGATE_PAIR_SEGMENTER
+    };
     const CharacterUtil::TypedRuneArray& getTypedRuneArray() const { return 
typed_runes_; }
     explicit AnalyzeContext(IKMemoryPool<Cell>& pool, 
std::shared_ptr<Configuration> config);
     virtual ~AnalyzeContext();
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CN_QuantifierSegmenter.h
 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CN_QuantifierSegmenter.h
index 27ccef61a83..a20341d3f3e 100644
--- 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CN_QuantifierSegmenter.h
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CN_QuantifierSegmenter.h
@@ -29,7 +29,6 @@ class CN_QuantifierSegmenter : public ISegmenter {
 public:
     static constexpr AnalyzeContext::SegmenterType SEGMENTER_TYPE =
             AnalyzeContext::SegmenterType::CN_QUANTIFIER;
-    static const std::string SEGMENTER_NAME;
     static const std::u32string CHINESE_NUMBERS;
     static const std::unordered_set<char32_t> CHINESE_NUMBER_CHARS;
 
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp
 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp
index 808edc14039..a991967392c 100644
--- 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp
@@ -20,34 +20,34 @@
 namespace doris::segment_v2 {
 
 int32_t CharacterUtil::identifyCharType(int32_t rune) {
-    // Numbers
-    if (rune >= 0x30 && rune <= 0x39) {
+    if (rune >= '0' && rune <= '9') {
         return CHAR_ARABIC;
     }
-
-    // English
-    if ((rune >= 0x61 && rune <= 0x7a) || (rune >= 0x41 && rune <= 0x5a)) {
+    if ((rune >= 'a' && rune <= 'z') || (rune >= 'A' && rune <= 'Z')) {
         return CHAR_ENGLISH;
     }
 
-    // CJK Unified Chinese Characters
-    if ((rune >= 0x4E00 && rune <= 0x9FFF) || (rune >= 0x3400 && rune <= 
0x4DBF) ||
-        (rune >= 0x20000 && rune <= 0x2A6DF) || (rune >= 0x2A700 && rune <= 
0x2B73F) ||
-        (rune >= 0x2B740 && rune <= 0x2B81F) || (rune >= 0x2B820 && rune <= 
0x2CEAF) ||
-        (rune >= 0x2CEB0 && rune <= 0x2EBEF) || (rune >= 0x30000 && rune <= 
0x3134F)) {
+    UBlockCode block = ublock_getCode(rune);
+
+    if (block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS || block == 
UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS ||
+        block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A ||
+        block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B ||
+        block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C ||
+        block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D ||
+        block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E ||
+        block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F ||
+        block == UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_G) {
         return CHAR_CHINESE;
     }
 
-    // Japanese and Korean characters
-    if ((rune >= 0x3040 && rune <= 0x309F) || (rune >= 0x30A0 && rune <= 
0x30FF) ||
-        (rune >= 0x31F0 && rune <= 0x31FF) || (rune >= 0xAC00 && rune <= 
0xD7AF) ||
-        (rune >= 0x1100 && rune <= 0x11FF)) {
+    if (block == UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS || block == 
UBLOCK_HANGUL_SYLLABLES ||
+        block == UBLOCK_HANGUL_JAMO || block == 
UBLOCK_HANGUL_COMPATIBILITY_JAMO ||
+        block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA ||
+        block == UBLOCK_KATAKANA_PHONETIC_EXTENSIONS) {
         return CHAR_OTHER_CJK;
     }
 
-    // UTF-16 surrogate pairs and private zone
-    if ((rune >= 0xD800 && rune <= 0xDBFF) || (rune >= 0xDC00 && rune <= 
0xDFFF) ||
-        (rune >= 0xE000 && rune <= 0xF8FF)) {
+    if (rune > 0xFFFF) {
         return CHAR_SURROGATE;
     }
 
@@ -55,42 +55,21 @@ int32_t CharacterUtil::identifyCharType(int32_t rune) {
 }
 
 int32_t CharacterUtil::regularize(int32_t rune, bool use_lowercase) {
-    // Full-width to half-width
     if (rune == 0x3000) {
-        return 0x0020; // Convert full-width space to half-width
-    }
-
-    // Full-width numbers
-    if (rune >= 0xFF10 && rune <= 0xFF19) {
-        return rune - 0xFEE0; // Convert to half-width numbers
-    }
-
-    // Full-width letters
-    if (rune >= 0xFF21 && rune <= 0xFF3A) {
+        return 0x0020;
+    } else if (rune > 0xFF00 && rune < 0xFF5F) {
         rune = rune - 0xFEE0;
-        if (use_lowercase) {
-            rune += 32; // Convert to lowercase
-        }
-        return rune;
-    }
-    if (rune >= 0xFF41 && rune <= 0xFF5A) {
-        return rune - 0xFEE0;
-    }
-
-    // Convert half-width uppercase letters to lowercase
-    if (use_lowercase && rune >= 0x41 && rune <= 0x5A) {
-        return rune + 32;
+    } else if (use_lowercase && rune >= 0x41 && rune <= 0x5A) {
+        // This else-if causes full-width letters unable to be converted to 
lowercase
+        rune += 32;
     }
 
     return rune;
 }
 
 void CharacterUtil::TypedRune::regularize(bool use_lowercase) {
-    CharacterUtil::regularizeCharInfo(*this, use_lowercase);
-}
-
-void CharacterUtil::regularizeCharInfo(TypedRune& typedRune, bool 
use_lowercase) {
-    typedRune.rune = regularize(typedRune.rune, use_lowercase);
+    this->rune = CharacterUtil::regularize(this->rune, use_lowercase);
+    this->char_type = CharacterUtil::identifyCharType(this->rune);
 }
 
 CharacterUtil::RuneStrLite CharacterUtil::decodeChar(const char* str, size_t 
length) {
@@ -101,7 +80,7 @@ bool CharacterUtil::decodeString(const char* str, size_t 
length, RuneStrArray& r
     return cppjieba::DecodeRunesInString(str, length, runes);
 }
 
-void CharacterUtil::decodeStringToRunes(const char* str, size_t length, 
TypedRuneArray& typed_runes,
+void CharacterUtil::decodeStringToRunes(char* str, size_t length, 
TypedRuneArray& typed_runes,
                                         bool use_lowercase) {
     typed_runes.clear();
     size_t byte_pos = 0;
@@ -111,59 +90,48 @@ void CharacterUtil::decodeStringToRunes(const char* str, 
size_t length, TypedRun
         if (runeStr.len == 0) {
             break;
         }
+        if (runeStr.len == 1 && use_lowercase && str[byte_pos] >= 'A' && 
str[byte_pos] <= 'Z') {
+            str[byte_pos] += 32;
+        }
         typed_runes.emplace_back(runeStr.rune, byte_pos, runeStr.len, 
typed_runes.size(), 1);
 
-        if (use_lowercase) {
-            typed_runes.back().regularize(true);
-        }
+        typed_runes.back().regularize(use_lowercase);
+
         byte_pos += runeStr.len;
     }
 }
 
-// TODO: Maybe delete this function
-size_t CharacterUtil::adjustToCompleteChar(const char* buffer, size_t 
buffer_length) {
-    if (buffer_length == 0) return 0;
-
-    unsigned char last_byte = buffer[buffer_length - 1];
-
-    if (last_byte < 0x80) {
-        return buffer_length;
-    }
-
-    if ((last_byte & 0xC0) == 0x80) {
-        size_t adjustedLen = buffer_length - 1;
-        while (adjustedLen > 0) {
-            unsigned char byte = buffer[adjustedLen - 1];
-            if ((byte & 0xC0) != 0x80) {
-                int charLen = 0;
-                if ((byte & 0xE0) == 0xC0)
-                    charLen = 2;
-                else if ((byte & 0xF0) == 0xE0)
-                    charLen = 3;
-                else if ((byte & 0xF8) == 0xF0)
-                    charLen = 4;
-                if (buffer_length - adjustedLen + 1 < charLen) {
-                    return adjustedLen - 1;
+void CharacterUtil::regularizeString(std::string& input, bool use_lowercase) {
+    std::string temp;
+    size_t len = input.size();
+    temp.reserve(len);
+    for (size_t i = 0; i < len;) {
+        unsigned char c = input[i];
+        if ((c & 0xF0) == 0xE0 && i + 2 < len) {
+            int rune = ((c & 0x0F) << 12) | ((input[i + 1] & 0x3F) << 6) | 
(input[i + 2] & 0x3F);
+            if (rune == 0x3000) {
+                temp += ' ';
+            } else if (rune >= 0xFF01 && rune <= 0xFF5E) {
+                char half = static_cast<char>(rune - 0xFEE0);
+                if (use_lowercase && half >= 'A' && half <= 'Z') {
+                    half += 32;
                 }
-                return buffer_length;
+                temp += half;
+            } else {
+                temp += input[i];
+                temp += input[i + 1];
+                temp += input[i + 2];
+            }
+            i += 3;
+        } else {
+            char ch = input[i];
+            if (use_lowercase && ch >= 'A' && ch <= 'Z') {
+                ch += 32;
             }
-            adjustedLen--;
+            temp += ch;
+            i += 1;
         }
-        return 0;
-    }
-
-    int charLen = 0;
-    if ((last_byte & 0xE0) == 0xC0)
-        charLen = 2;
-    else if ((last_byte & 0xF0) == 0xE0)
-        charLen = 3;
-    else if ((last_byte & 0xF8) == 0xF0)
-        charLen = 4;
-
-    if (charLen > 1) {
-        return buffer_length - 1;
     }
-
-    return buffer_length;
+    input = std::move(temp);
 }
 } // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h
index 2f3dac6a520..c60f8bb30ce 100644
--- 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h
@@ -17,6 +17,8 @@
 
 #pragma once
 
+#include <unicode/uchar.h>
+
 #include <functional>
 #include <memory>
 #include <vector>
@@ -70,7 +72,7 @@ public:
 
     static int32_t identifyCharType(int32_t rune);
 
-    static void decodeStringToRunes(const char* str, size_t length, 
TypedRuneArray& typed_runes,
+    static void decodeStringToRunes(char* str, size_t length, TypedRuneArray& 
typed_runes,
                                     bool use_lowercase);
 
     static int32_t regularize(int32_t rune, bool use_lowercase);
@@ -80,7 +82,7 @@ public:
 
     static void regularizeCharInfo(TypedRune& type_rune, bool use_lowercase);
 
-    static size_t adjustToCompleteChar(const char* buffer, size_t 
buffer_length);
+    static void regularizeString(std::string& input, bool use_lowercase = 
true);
 };
 
 } // namespace doris::segment_v2
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.cpp
index 6c1d049ac70..674f22dfd2d 100644
--- 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.cpp
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.cpp
@@ -33,6 +33,7 @@ std::vector<std::unique_ptr<ISegmenter>> 
IKSegmenter::loadSegmenters() {
     segmenters.push_back(std::make_unique<LetterSegmenter>());
     segmenters.push_back(std::make_unique<CN_QuantifierSegmenter>());
     segmenters.push_back(std::make_unique<CJKSegmenter>());
+    segmenters.push_back(std::make_unique<SurrogatePairSegmenter>());
     return segmenters;
 }
 
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.h 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.h
index 4f94fa435db..33defbbe31a 100644
--- 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.h
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.h
@@ -27,6 +27,7 @@
 #include "IKArbitrator.h"
 #include "ISegmenter.h"
 #include "LetterSegmenter.h"
+#include "SurrogatePairSegmenter.h"
 #include 
"olap/rowset/segment_v2/inverted_index/analyzer/ik/cfg/Configuration.h"
 namespace doris::segment_v2 {
 
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.cpp
 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.cpp
index 2b631399132..c593a1ec63d 100644
--- 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.cpp
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.cpp
@@ -53,7 +53,6 @@ void LetterSegmenter::reset() {
 bool LetterSegmenter::processEnglishLetter(AnalyzeContext& context) {
     bool need_lock = false;
 
-    const auto& typed_runes = context.getTypedRuneArray();
     if (english_start_ == -1) {
         // The current tokenizer has not yet started processing English 
characters
         if (context.getCurrentCharType() == CharacterUtil::CHAR_ENGLISH) {
@@ -68,9 +67,8 @@ bool LetterSegmenter::processEnglishLetter(AnalyzeContext& 
context) {
             english_end_ = context.getCursor();
         } else {
             // Encounter non-English characters, output tokens
-            Lexeme newLexeme(context.getBufferOffset(), 
typed_runes[english_start_].offset,
-                             english_end_ - english_start_ + 1, 
Lexeme::Type::English,
-                             english_start_, english_end_);
+            Lexeme newLexeme =
+                    createLexeme(context, english_start_, english_end_, 
Lexeme::Type::English);
             context.addLexeme(newLexeme);
             english_start_ = -1;
             english_end_ = -1;
@@ -78,9 +76,8 @@ bool LetterSegmenter::processEnglishLetter(AnalyzeContext& 
context) {
     }
 
     if (context.isBufferConsumed() && (english_start_ != -1 && english_end_ != 
-1)) {
-        Lexeme newLexeme(context.getBufferOffset(), 
typed_runes[english_start_].offset,
-                         english_end_ - english_start_ + 1, 
Lexeme::Type::English, english_start_,
-                         english_end_);
+        Lexeme newLexeme =
+                createLexeme(context, english_start_, english_end_, 
Lexeme::Type::English);
         context.addLexeme(newLexeme);
         english_start_ = -1;
         english_end_ = -1;
@@ -96,7 +93,6 @@ bool LetterSegmenter::processEnglishLetter(AnalyzeContext& 
context) {
 
 bool LetterSegmenter::processArabicLetter(AnalyzeContext& context) {
     bool need_lock = false;
-    const auto& typed_runes = context.getTypedRuneArray();
 
     if (arabic_start_ == -1) {
         // The current tokenizer has not yet started processing numeric 
characters
@@ -115,9 +111,8 @@ bool LetterSegmenter::processArabicLetter(AnalyzeContext& 
context) {
             // Do not output numbers, but do not mark the end
         } else {
             // Encounter non-Arabic characters, output tokens
-            Lexeme newLexeme(context.getBufferOffset(), 
typed_runes[arabic_start_].offset,
-                             arabic_end_ - arabic_start_ + 1, 
Lexeme::Type::Arabic, arabic_start_,
-                             arabic_end_);
+            Lexeme newLexeme =
+                    createLexeme(context, arabic_start_, arabic_end_, 
Lexeme::Type::Arabic);
             context.addLexeme(newLexeme);
             arabic_start_ = -1;
             arabic_end_ = -1;
@@ -125,9 +120,7 @@ bool LetterSegmenter::processArabicLetter(AnalyzeContext& 
context) {
     }
 
     if (context.isBufferConsumed() && (arabic_start_ != -1 && arabic_end_ != 
-1)) {
-        Lexeme newLexeme(context.getBufferOffset(), 
typed_runes[arabic_start_].offset,
-                         arabic_end_ - arabic_start_ + 1, 
Lexeme::Type::Arabic, arabic_start_,
-                         arabic_end_);
+        Lexeme newLexeme = createLexeme(context, arabic_start_, arabic_end_, 
Lexeme::Type::Arabic);
         context.addLexeme(newLexeme);
         arabic_start_ = -1;
         arabic_end_ = -1;
@@ -143,7 +136,6 @@ bool LetterSegmenter::processArabicLetter(AnalyzeContext& 
context) {
 
 bool LetterSegmenter::processMixLetter(AnalyzeContext& context) {
     bool need_lock = false;
-    const auto& typed_runes = context.getTypedRuneArray();
 
     if (start_ == -1) {
         // The current tokenizer has not yet started processing characters.
@@ -164,8 +156,7 @@ bool LetterSegmenter::processMixLetter(AnalyzeContext& 
context) {
             end_ = context.getCursor();
         } else {
             // Encounter non-letter characters, output a token
-            Lexeme newLexeme(context.getBufferOffset(), 
typed_runes[start_].offset,
-                             end_ - start_ + 1, Lexeme::Type::Letter, start_, 
end_);
+            Lexeme newLexeme = createLexeme(context, start_, end_, 
Lexeme::Type::Letter);
             context.addLexeme(newLexeme);
             start_ = -1;
             end_ = -1;
@@ -173,8 +164,7 @@ bool LetterSegmenter::processMixLetter(AnalyzeContext& 
context) {
     }
 
     if (context.isBufferConsumed() && (start_ != -1 && end_ != -1)) {
-        Lexeme newLexeme(context.getBufferOffset(), 
typed_runes[start_].offset, end_ - start_ + 1,
-                         Lexeme::Type::Letter, start_, end_);
+        Lexeme newLexeme = createLexeme(context, start_, end_, 
Lexeme::Type::Letter);
         context.addLexeme(newLexeme);
         start_ = -1;
         end_ = -1;
@@ -184,11 +174,27 @@ bool LetterSegmenter::processMixLetter(AnalyzeContext& 
context) {
     return need_lock;
 }
 
-bool LetterSegmenter::isLetterConnector(char input) {
-    return std::binary_search(std::begin(letter_connectors_), 
std::end(letter_connectors_), input);
+bool LetterSegmenter::isLetterConnector(int32_t input) {
+    if (input < 128) {
+        return std::binary_search(std::begin(letter_connectors_), 
std::end(letter_connectors_),
+                                  static_cast<char>(input));
+    }
+    return false;
+}
+
+bool LetterSegmenter::isNumConnector(int32_t input) {
+    if (input < 128) {
+        return std::binary_search(std::begin(num_connectors_), 
std::end(num_connectors_),
+                                  static_cast<char>(input));
+    }
+    return false;
 }
 
-bool LetterSegmenter::isNumConnector(char input) {
-    return std::binary_search(std::begin(num_connectors_), 
std::end(num_connectors_), input);
+Lexeme LetterSegmenter::createLexeme(AnalyzeContext& context, int start, int 
end,
+                                     Lexeme::Type type) {
+    const auto& typed_runes = context.getTypedRuneArray();
+    return Lexeme(context.getBufferOffset(), 
typed_runes[start].getBytePosition(),
+                  typed_runes[end].getNextBytePosition() - 
typed_runes[start].getBytePosition(),
+                  type, start, end);
 }
 } // namespace doris::segment_v2
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.h
 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.h
index 87b36e83fbf..70dc6b4988f 100644
--- 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.h
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.h
@@ -30,7 +30,6 @@ class LetterSegmenter : public ISegmenter {
 public:
     static constexpr AnalyzeContext::SegmenterType SEGMENTER_TYPE =
             AnalyzeContext::SegmenterType::LETTER_SEGMENTER;
-    static const std::string SEGMENTER_NAME;
     LetterSegmenter();
     ~LetterSegmenter() override = default;
 
@@ -41,8 +40,10 @@ private:
     bool processEnglishLetter(AnalyzeContext& context);
     bool processArabicLetter(AnalyzeContext& context);
     bool processMixLetter(AnalyzeContext& context);
-    bool isLetterConnector(char input);
-    bool isNumConnector(char input);
+    bool isLetterConnector(int32_t input);
+    bool isNumConnector(int32_t input);
+
+    Lexeme createLexeme(AnalyzeContext& context, int start, int end, 
Lexeme::Type type);
 
     int start_ {-1};
     int end_ {-1};
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.cpp
 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.cpp
new file mode 100644
index 00000000000..0aea370a502
--- /dev/null
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.cpp
@@ -0,0 +1,37 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "SurrogatePairSegmenter.h"
+
+namespace doris::segment_v2 {
+
+void SurrogatePairSegmenter::analyze(AnalyzeContext& context) {
+    const auto& current_char_type = context.getCurrentCharType();
+
+    if (current_char_type == CharacterUtil::CHAR_SURROGATE) {
+        Lexeme newLexeme(context.getBufferOffset(), 
context.getCurrentCharOffset(),
+                         context.getCurrentCharLen(), Lexeme::Type::CNChar, 
context.getCursor(),
+                         context.getCursor());
+        context.addLexeme(newLexeme);
+    }
+
+    context.unlockBuffer(SEGMENTER_TYPE);
+}
+
+void SurrogatePairSegmenter::reset() {}
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CN_QuantifierSegmenter.h
 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.h
similarity index 59%
copy from 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CN_QuantifierSegmenter.h
copy to 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.h
index 27ccef61a83..bad22658b51 100644
--- 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CN_QuantifierSegmenter.h
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.h
@@ -17,36 +17,23 @@
 
 #pragma once
 
-#include <memory>
-#include <unordered_set>
-#include <vector>
-
 #include "AnalyzeContext.h"
+#include "CharacterUtil.h"
 #include "ISegmenter.h"
+#include "Lexeme.h"
+
 namespace doris::segment_v2 {
 
-class CN_QuantifierSegmenter : public ISegmenter {
+class SurrogatePairSegmenter : public ISegmenter {
 public:
     static constexpr AnalyzeContext::SegmenterType SEGMENTER_TYPE =
-            AnalyzeContext::SegmenterType::CN_QUANTIFIER;
-    static const std::string SEGMENTER_NAME;
-    static const std::u32string CHINESE_NUMBERS;
-    static const std::unordered_set<char32_t> CHINESE_NUMBER_CHARS;
+            AnalyzeContext::SegmenterType::SURROGATE_PAIR_SEGMENTER;
 
-    CN_QuantifierSegmenter();
-    ~CN_QuantifierSegmenter() override = default;
+    SurrogatePairSegmenter() = default;
+    ~SurrogatePairSegmenter() override = default;
 
     void analyze(AnalyzeContext& context) override;
     void reset() override;
-
-private:
-    void processCNumber(AnalyzeContext& context);
-    void processCount(AnalyzeContext& context);
-    bool needCountScan(AnalyzeContext& context);
-    void outputNumLexeme(AnalyzeContext& context);
-
-    int number_start_;
-    int number_end_;
-    std::vector<Hit> count_hits_;
 };
-} // namespace doris::segment_v2
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/ik_anayzer_test.cpp 
b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/ik_anayzer_test.cpp
index 18dc16b6925..690499830f5 100644
--- a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/ik_anayzer_test.cpp
+++ b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/ik_anayzer_test.cpp
@@ -264,7 +264,11 @@ TEST_F(IKTokenizerTest, TestSpecialCharacters) {
     // Test with special characters
     std::string specialText = "😊🚀👍测试特殊符号:@#¥%……&*()";
     tokenize(specialText, datas, true);
-    ASSERT_EQ(datas.size(), 2);
+    ASSERT_EQ(datas.size(), 5);
+    std::vector<std::string> expectedTokens = {"😊", "🚀", "👍", "测试", "特殊符号"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], expectedTokens[i]);
+    }
 }
 
 TEST_F(IKTokenizerTest, TestBufferBoundaryWithSpace) {
@@ -428,6 +432,148 @@ TEST_F(IKTokenizerTest, TestLongTextCompareWithJava) {
     }
 }
 
+TEST_F(IKTokenizerTest, TestFullWidthCharacters) {
+    std::vector<std::string> datas;
+
+    // test full width numbers
+    std::string fullWidthNumbersText = "4 3 2";
+    tokenize(fullWidthNumbersText, datas, true);
+    std::vector<std::string> expectedNumbers = {"4", "3", "2"}; // half width 
numbers
+    ASSERT_EQ(datas.size(), expectedNumbers.size());
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], expectedNumbers[i]);
+    }
+    datas.clear();
+
+    fullWidthNumbersText = "432";
+    tokenize(fullWidthNumbersText, datas, false);
+    expectedNumbers = {"432"};
+    ASSERT_EQ(datas.size(), expectedNumbers.size());
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], expectedNumbers[i]);
+    }
+    datas.clear();
+
+    // test full width currency symbol
+    std::string currencyText = "¥";
+    tokenize(currencyText, datas, false);
+    ASSERT_EQ(datas.size(), 1);
+    ASSERT_EQ(datas[0], "¥");
+    datas.clear();
+
+    // test full width symbol in word
+    std::string mixedText = "High&Low";
+    tokenize(mixedText, datas, false);
+    std::vector<std::string> expectedMixed = {"high&low", "high", "low"};
+    ASSERT_EQ(datas.size(), expectedMixed.size());
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], expectedMixed[i]);
+    }
+    datas.clear();
+
+    // test special separator
+    std::string specialSeparatorText = "1・2";
+    tokenize(specialSeparatorText, datas, false);
+    std::vector<std::string> expectedSeparator = {"1", "・", "2"};
+    ASSERT_EQ(datas.size(), expectedSeparator.size());
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], expectedSeparator[i]);
+    }
+    datas.clear();
+
+    // test special character
+    std::string specialCharText = "﨑";
+    tokenize(specialCharText, datas, false);
+    ASSERT_EQ(datas.size(), 1);
+    ASSERT_EQ(datas[0], "﨑");
+    datas.clear();
+}
+
+TEST_F(IKTokenizerTest, TestEmojiAndSpecialCharacters) {
+    std::vector<std::string> datas;
+
+    // test emoji
+    std::string emojiText = "🐼";
+    tokenize(emojiText, datas, false);
+    ASSERT_EQ(datas.size(), 1);
+    ASSERT_EQ(datas[0], "🐼");
+    datas.clear();
+
+    std::string emojiText2 = "🝢";
+    tokenize(emojiText2, datas, false);
+    ASSERT_EQ(datas.size(), 1);
+    ASSERT_EQ(datas[0], "🝢");
+    datas.clear();
+
+    // test special latin character
+    std::string specialLatinText1 = "abcşabc";
+    tokenize(specialLatinText1, datas, false);
+    ASSERT_EQ(datas.size(), 2);
+    ASSERT_EQ(datas[0], "abc");
+    ASSERT_EQ(datas[1], "abc");
+    datas.clear();
+
+    std::string specialLatinText2 = "abcīabc";
+    tokenize(specialLatinText2, datas, false);
+    ASSERT_EQ(datas.size(), 2);
+    ASSERT_EQ(datas[0], "abc");
+    ASSERT_EQ(datas[1], "abc");
+    datas.clear();
+
+    std::string specialLatinText3 = "celebrity…get";
+    tokenize(specialLatinText3, datas, false);
+    std::vector<std::string> expectedEllipsis = {"celebrity", "get"};
+    ASSERT_EQ(datas.size(), expectedEllipsis.size());
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], expectedEllipsis[i]);
+    }
+    datas.clear();
+
+    // test mixed alphabet word
+    std::string mixedAlphabetText1 = "Hulyaiрole";
+    tokenize(mixedAlphabetText1, datas, false);
+    ASSERT_EQ(datas.size(), 2);
+    ASSERT_EQ(datas[0], "hulyai");
+    ASSERT_EQ(datas[1], "ole");
+    datas.clear();
+
+    std::string mixedAlphabetText2 = "Nisa Aşgabat";
+    tokenize(mixedAlphabetText2, datas, false);
+    std::vector<std::string> expectedName = {"nisa", "gabat"};
+    ASSERT_EQ(datas.size(), expectedName.size());
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], expectedName[i]);
+    }
+    datas.clear();
+
+    // test special connector
+    std::string specialConnectorText = "alـameer";
+    tokenize(specialConnectorText, datas, false);
+    ASSERT_EQ(datas.size(), 2);
+    ASSERT_EQ(datas[0], "al");
+    ASSERT_EQ(datas[1], "ameer");
+    datas.clear();
+
+    // test rare unicode character
+    std::string rareUnicodeText1 = "𐓚";
+    tokenize(rareUnicodeText1, datas, false);
+    ASSERT_EQ(datas.size(), 1);
+    ASSERT_EQ(datas[0], "𐓚");
+    datas.clear();
+
+    std::string rareUnicodeText2 = "𑪱";
+    tokenize(rareUnicodeText2, datas, false);
+    ASSERT_EQ(datas.size(), 1);
+    ASSERT_EQ(datas[0], "𑪱");
+    datas.clear();
+
+    std::string rareUnicodeText3 = "𐴗";
+    tokenize(rareUnicodeText3, datas, false);
+    ASSERT_EQ(datas.size(), 1);
+    ASSERT_EQ(datas[0], "𐴗");
+    datas.clear();
+}
+
 // Test the exception handling capabilities of the IKTokenizer and 
AnalyzeContext
 TEST_F(IKTokenizerTest, TestExceptionHandling) {
     // Common mock reader class for testing exception handling
diff --git a/regression-test/data/inverted_index_p0/test_ik_analyzer.out 
b/regression-test/data/inverted_index_p0/analyzer/test_ik_analyzer.out
similarity index 100%
rename from regression-test/data/inverted_index_p0/test_ik_analyzer.out
rename to regression-test/data/inverted_index_p0/analyzer/test_ik_analyzer.out
diff --git a/regression-test/data/inverted_index_p0/test_tokenize.out 
b/regression-test/data/inverted_index_p0/test_tokenize.out
index 32e7968cb8b..68f030b7276 100644
Binary files a/regression-test/data/inverted_index_p0/test_tokenize.out and 
b/regression-test/data/inverted_index_p0/test_tokenize.out differ
diff --git a/regression-test/suites/inverted_index_p0/test_ik_analyzer.groovy 
b/regression-test/suites/inverted_index_p0/analyzer/test_ik_analyzer.groovy
similarity index 95%
rename from regression-test/suites/inverted_index_p0/test_ik_analyzer.groovy
rename to 
regression-test/suites/inverted_index_p0/analyzer/test_ik_analyzer.groovy
index c28aa68920b..b57d40a4a00 100644
--- a/regression-test/suites/inverted_index_p0/test_ik_analyzer.groovy
+++ b/regression-test/suites/inverted_index_p0/analyzer/test_ik_analyzer.groovy
@@ -22,7 +22,7 @@ suite("test_ik_analyzer", "p0") {
     sql "DROP TABLE IF EXISTS ${tableNameSmart}"
     sql "DROP TABLE IF EXISTS ${tableNameMaxWord}"
 
-    // 创建smart模式测试表
+    // Create test table for smart mode
     sql """
       CREATE TABLE ${tableNameSmart} (
       `id` int(11) NULL COMMENT "",
@@ -37,7 +37,7 @@ suite("test_ik_analyzer", "p0") {
       );
     """
 
-    // 创建max_word模式测试表
+    // Create test table for max_word mode
     sql """
       CREATE TABLE ${tableNameMaxWord} (
       `id` int(11) NULL COMMENT "",
@@ -52,7 +52,7 @@ suite("test_ik_analyzer", "p0") {
       );
     """
 
-    // 插入测试数据
+    // Insert test data
     def insertData = { table ->
         sql """ INSERT INTO ${table} VALUES (1, "我爱北京天安门"); """
         sql """ INSERT INTO ${table} VALUES (2, "Apache Doris是一个现代化的MPP数据库"); 
"""
@@ -68,14 +68,14 @@ suite("test_ik_analyzer", "p0") {
         sql "sync"
         sql """ set enable_common_expr_pushdown = true; """
 
-        // 测试smart模式
+        // Testing ik smart mode
         println "Testing ik smart mode:"
         qt_sql """ select * from ${tableNameSmart} where content match_phrase 
'北京'; """
         qt_sql """ select * from ${tableNameSmart} where content match_phrase 
'计算机科学'; """
         qt_sql """ select * from ${tableNameSmart} where content match_phrase 
'数据库管理系统'; """
         qt_sql """ select * from ${tableNameSmart} where content match_phrase 
'中华人民共和国'; """
 
-        // 测试max_word模式
+        // Testing ik max_word mode
         println "Testing ik max_word mode:"
         qt_sql """ select * from ${tableNameMaxWord} where content 
match_phrase '北京'; """
         qt_sql """ select * from ${tableNameMaxWord} where content 
match_phrase '计算机科学'; """
diff --git a/regression-test/suites/inverted_index_p0/test_tokenize.groovy 
b/regression-test/suites/inverted_index_p0/test_tokenize.groovy
index f8066e6ad86..d0bdada2e31 100644
--- a/regression-test/suites/inverted_index_p0/test_tokenize.groovy
+++ b/regression-test/suites/inverted_index_p0/test_tokenize.groovy
@@ -123,4 +123,9 @@ suite("test_tokenize"){
     qt_tokenize_sql """SELECT TOKENIZE('北京大学计算机科学与技术系', 
'"parser"="ik","parser_mode"="ik_max_word"');"""
     qt_tokenize_sql """SELECT TOKENIZE('中华人民共和国', 
'"parser"="ik","parser_mode"="ik_max_word"');"""
 
+    qt_tokenize_sql """SELECT TOKENIZE('😊🚀👍测试特殊符号:@#¥%……&*()', 
'"parser"="ik","parser_mode"="ik_max_word"');"""
+    qt_tokenize_sql """SELECT TOKENIZE('High&Low', 
'"parser"="ik","parser_mode"="ik_max_word"');"""
+    qt_tokenize_sql """SELECT TOKENIZE('1・2', 
'"parser"="ik","parser_mode"="ik_max_word"');"""
+    qt_tokenize_sql """SELECT TOKENIZE('abcşīabc', 
'"parser"="ik","parser_mode"="ik_max_word"');"""
+
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to