[doris-thirdparty] branch clucene updated: [fix] compatible with utf8 and invalid utf8 (#110)

jianliangqi Thu, 03 Aug 2023 00:58:54 -0700

This is an automated email from the ASF dual-hosted git repository.

jianliangqi pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git



The following commit(s) were added to refs/heads/clucene by this push:
     new dda894af [fix] compatible with utf8 and invalid utf8 (#110)
dda894af is described below

commit dda894af51024226f10336eea3d344cebeef310d
Author: zzzxl <[email protected]>
AuthorDate: Thu Aug 3 15:58:43 2023 +0800

    [fix] compatible with utf8 and invalid utf8 (#110)
    
    1. supports utf8 and non-utf8 strings
    2. optimize string_to_wstring function
---
 src/core/CLucene/index/TermInfosWriter.cpp |  61 +++++------------
 src/core/CLucene/store/IndexOutput.cpp     |  13 ----
 src/core/CLucene/store/IndexOutput.h       |   2 -
 src/core/CLucene/util/stringUtil.h         |  77 ++++++++++++++-------
 src/shared/CLucene/config/utf8.cpp         |   6 +-
 src/test/CMakeLists.txt                    |   1 +
 src/test/test.h                            |   1 +
 src/test/tests.cpp                         |   1 +
 src/test/util/testStrConvert.cpp           | 106 +++++++++++++++++++++++++++++
 9 files changed, 183 insertions(+), 85 deletions(-)

diff --git a/src/core/CLucene/index/TermInfosWriter.cpp b/src/core/CLucene/index/TermInfosWriter.cpp
index 9be81373..32b6a100 100644
--- a/src/core/CLucene/index/TermInfosWriter.cpp
+++ b/src/core/CLucene/index/TermInfosWriter.cpp
@@ -178,51 +178,26 @@ void STermInfosWriter<T>::close() {
 template <typename T>
 void STermInfosWriter<T>::writeTerm(int32_t fieldNumber, const T *termText, int32_t termTextLength) {
     if constexpr (std::is_same_v<T, char>) {
-        std::string_view utf8Str(termText, termTextLength);
-        if (StringUtil::validate_utf8(utf8Str) == 0) {
-            int32_t utf8Length = 0;
-            {
-                size_t i = 0;
-                for (; i < utf8Str.size();) {
-                  int32_t n = StringUtil::utf8_byte_count(utf8Str[i]);
-                  i += n;
-                  utf8Length++;
-                }
-                assert(i == utf8Str.size());
-            }
+        std::string_view newTermStr(termText, termTextLength);
+        std::wstring newTermWStr = StringUtil::string_to_wstring(newTermStr);
 
-            int32_t start = 0;
-            int32_t utf8Start = 0;
-            int32_t limit = termTextLength < lastTermTextLength ? termTextLength : lastTermTextLength;
-            auto prefixCompare = [this, &utf8Str, &termText](int32_t& start, int32_t& utf8Start, int32_t limit) {
-              while (start < limit) {
-                int32_t n = StringUtil::utf8_byte_count(utf8Str[start]);
-                for (int32_t j = 0; j < n; j++) {
-                  int32_t cur = start + j;
-                  if (termText[cur] != lastTermText.values[cur]) {
-                    return;
-                  }
-                }
-                start += n;
-                utf8Start++;
-              }
-            };
-
-            prefixCompare(start, utf8Start, limit);
-            assert(start <= termTextLength);
-            assert(utf8Start <= utf8Length);
-            int32_t length = termTextLength - start;
-            utf8Length -= utf8Start;
-
-            // std::cout << "term: " << utf8Str << ", utf8Start: " << utf8Start << ", utf8Length: " << utf8Length << ", length: " << length << std::endl;
-
-            output->writeVInt(utf8Start);
-            output->writeVInt(utf8Length);
-            output->writeU8SChars(termText + start, length);
-            output->writeVInt(fieldNumber);
-        } else {
-            _CLTHROWA(CL_ERR_Runtime, (std::string("Not utf8, the character encoding is abnormal: ") + std::string(utf8Str.data(), utf8Str.size())).c_str());
+        std::string_view oldTermStr(lastTermText.values, lastTermTextLength);
+        std::wstring oldTermWStr = StringUtil::string_to_wstring(oldTermStr);
+        
+        int32_t start = 0;
+        const int32_t limit = newTermWStr.length() < oldTermWStr.length() ? newTermWStr.length() : oldTermWStr.length();
+        while (start < limit) {
+            if (newTermWStr[start] != oldTermWStr[start])
+                break;
+            start++;
         }
+
+        int32_t length = newTermWStr.length() - start;
+
+        output->writeVInt(start);
+        output->writeVInt(length);
+        output->writeSChars(newTermWStr.data() + start, length);
+        output->writeVInt(fieldNumber);
     } else {
         int32_t start = 0;
         const int32_t limit = termTextLength < lastTermTextLength ? termTextLength : lastTermTextLength;
diff --git a/src/core/CLucene/store/IndexOutput.cpp b/src/core/CLucene/store/IndexOutput.cpp
index 1d44aff1..05e7695f 100644
--- a/src/core/CLucene/store/IndexOutput.cpp
+++ b/src/core/CLucene/store/IndexOutput.cpp
@@ -186,19 +186,6 @@ CL_NS_DEF(store)
       writeBytes((const uint8_t*)s, length);
   }
 
-  void IndexOutput::writeU8SChars(const char* s, const int32_t length) {
-    if ( length < 0 )
-      _CLTHROWA(CL_ERR_IllegalArgument, "IO Argument Error. Value must be a positive value.");
-
-    for (int32_t i = 0; i < length;) {
-      auto* chars = (const uint8_t*)s + i;
-      int32_t n = StringUtil::utf8_byte_count(*chars);
-      assert(n >= 1 && n <= 4);
-      writeBytes(chars, (n > 2 ? 3 : n));
-      i += n;
-    }
-  }
-
   void IndexOutput::writeChars(const TCHAR* s, const int32_t length){
     if ( length < 0 )
       _CLTHROWA(CL_ERR_IllegalArgument, "IO Argument Error. Value must be a positive value.");
diff --git a/src/core/CLucene/store/IndexOutput.h b/src/core/CLucene/store/IndexOutput.h
index e8eff025..6b6ca321 100644
--- a/src/core/CLucene/store/IndexOutput.h
+++ b/src/core/CLucene/store/IndexOutput.h
@@ -83,8 +83,6 @@ public:
        void writeChars(const TCHAR* s, const int32_t length);
     template<typename T>
     void writeSChars(const T* s, int32_t length);
-       
-       void writeU8SChars(const char* s, const int32_t length);
 
     /** Closes this stream to further operations. */
        virtual void close() = 0;
diff --git a/src/core/CLucene/util/stringUtil.h b/src/core/CLucene/util/stringUtil.h
index 2c39b8f0..8e8ca1e9 100644
--- a/src/core/CLucene/util/stringUtil.h
+++ b/src/core/CLucene/util/stringUtil.h
@@ -11,6 +11,8 @@
 #include <sse2neon.h>
 #endif
 
+#include <cstring>
+
 template <typename T>
 const T* LUCENE_BLANK_SSTRING();
 
@@ -234,35 +236,62 @@ public:
         int32_t surplus_bytes = 0;
         uint32_t codepoint = 0;
         for (uint8_t c : str) {
-        if (bytes_in_char == 0) {
-            if ((c & 0x80) == 0) {
-                codepoint = c;
-                continue;
-            } else if ((c & 0xE0) == 0xC0) {
-                codepoint = c & 0x1F;
-                bytes_in_char = 1;
-            } else if ((c & 0xF0) == 0xE0) {
-                codepoint = c & 0x0F;
-                bytes_in_char = 2;
-            } else if ((c & 0xF8) == 0xF0) {
-                codepoint = c & 0x07;
-                bytes_in_char = 3;
+            if (bytes_in_char == 0) {
+                if ((c & 0x80) == 0) {
+                    codepoint = c;
+                    continue;
+                } else if ((c & 0xE0) == 0xC0) {
+                    codepoint = c & 0x1F;
+                    bytes_in_char = 1;
+                } else if ((c & 0xF0) == 0xE0) {
+                    codepoint = c & 0x0F;
+                    bytes_in_char = 2;
+                } else if ((c & 0xF8) == 0xF0) {
+                    codepoint = c & 0x07;
+                    bytes_in_char = 3;
+                } else {
+                    return -1;
+                }
+                surplus_bytes = 1;
             } else {
-                return -1;
-            }
-            surplus_bytes = 1;
-        } else {
-            if ((c & 0xC0) != 0x80) return -1;
-            codepoint = (codepoint << 6) | (c & 0x3F);
-            if (!is_valid_codepoint(codepoint)) {
-                return -1;
+                if ((c & 0xC0) != 0x80) return -1;
+                codepoint = (codepoint << 6) | (c & 0x3F);
+                if (!is_valid_codepoint(codepoint)) {
+                    return -1;
+                }
+                bytes_in_char--;
+                surplus_bytes++;
             }
-            bytes_in_char--;
-            surplus_bytes++;
-        }
         }
         return bytes_in_char == 0 ? 0 : surplus_bytes;
     }
+
+    // utf8: 1-4 char = 1 wchar_t, invalid utf8: 1 char = 1 wchar_t
+    static inline std::wstring string_to_wstring(const std::string_view& utf8_str) {
+        std::wstring wstr;
+        wstr.reserve(utf8_str.size());
+        size_t i = 0;
+        while (i < utf8_str.size()) {
+            wchar_t wc = utf8_str[i];
+            int32_t n = utf8_byte_count(utf8_str[i]);
+            if ((n >= 1 && n <= 4) &&
+                (i + n <= utf8_str.size()) &&
+                validate_utf8(std::string_view(utf8_str.data() + i, n)) == 0) {
+                if (n == 2) {
+                    wc = ((utf8_str[i] & 0x1F) << 6) | (utf8_str[i + 1] & 0x3F);
+                } else if (n == 3) {
+                    wc = ((utf8_str[i] & 0x0F) << 12) | ((utf8_str[i + 1] & 0x3F) << 6) | (utf8_str[i + 2] & 0x3F);
+                } else if (n == 4) {
+                    wc = ((utf8_str[i] & 0x07) << 18) | ((utf8_str[i + 1] & 0x3F) << 12) | ((utf8_str[i + 2] & 0x3F) << 6) | (utf8_str[i + 3] & 0x3F);
+                }
+                i += n;
+            } else {
+                i += 1;
+            }
+            wstr.push_back(wc);
+        }
+        return wstr;
+    }
 };
 
 #endif//_lucene_util__stringutil_H
diff --git a/src/shared/CLucene/config/utf8.cpp b/src/shared/CLucene/config/utf8.cpp
index 4f9f7546..e8d97e50 100644
--- a/src/shared/CLucene/config/utf8.cpp
+++ b/src/shared/CLucene/config/utf8.cpp
@@ -259,7 +259,7 @@ std::string lucene_wcstoutf8string(const wchar_t* str, size_t strlen){
   return result;
 }
 
-std::wstring lucene_utf8stows(const std::string_view& s) {
+std::wstring lucene_utf8stows(const std::string_view &s) {
   std::wstring ws;
   size_t size = 0;
   for (size_t i = 0; i < s.size();) {
@@ -267,8 +267,8 @@ std::wstring lucene_utf8stows(const std::string_view& s) {
     size++;
     i += n;
   }
-  ws.resize(size + 1);
-  lucene_utf8towcs(ws.data(), s.data(), s.length());
+  ws.resize(size);
+  lucene_utf8towcs(ws.data(), s.data(), ws.size());
   return ws;
 }
 
diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt
index 26fe12ea..e70bf1b8 100644
--- a/src/test/CMakeLists.txt
+++ b/src/test/CMakeLists.txt
@@ -100,6 +100,7 @@ SET(test_files ./tests.cpp
         ./util/TestMSBRadixSorter.cpp
         ./util/TestStringBuffer.cpp
         ./util/English.cpp
+        ./util/testStrConvert.cpp
         ${test_HEADERS})
 IF (USE_SHARED_OBJECT_FILES)
     GET_SHARED_FILES(clucene_shared_Files)
diff --git a/src/test/test.h b/src/test/test.h
index 7b185682..ed57d9d2 100644
--- a/src/test/test.h
+++ b/src/test/test.h
@@ -80,6 +80,7 @@ CuSuite *testSpanQueries(void);
 CuSuite *testStringBuffer(void);
 CuSuite *testTermVectorsReader(void);
 CuSuite *teststandard95(void);
+CuSuite *testStrConvert(void);
 
 #ifdef TEST_CONTRIB_LIBS
 //CuSuite *testGermanAnalyzer(void);
diff --git a/src/test/tests.cpp b/src/test/tests.cpp
index 37465a6f..241cbe70 100644
--- a/src/test/tests.cpp
+++ b/src/test/tests.cpp
@@ -46,6 +46,7 @@ unittest tests[] = {
 //        {"spanqueries", testSpanQueries},
 //        {"stringbuffer", testStringBuffer},
 //        {"termvectorsreader", testTermVectorsReader},
+          {"strconvert", testStrConvert},      
 #ifdef TEST_CONTRIB_LIBS
         {"chinese", testchinese},
 #endif
diff --git a/src/test/util/testStrConvert.cpp b/src/test/util/testStrConvert.cpp
new file mode 100644
index 00000000..226ae64d
--- /dev/null
+++ b/src/test/util/testStrConvert.cpp
@@ -0,0 +1,106 @@
+#include <chrono>
+#include <random>
+
+#include "CLucene/util/stringUtil.h"
+#include "test.h"
+
+std::string ranUtf8String(uint32_t i) {
+  if (!StringUtil::is_valid_codepoint(i)) {
+    return "";
+  }
+
+  std::string utf8_str;
+  if (i <= 0x7F) {
+    utf8_str.push_back(static_cast<char>(i));
+  } else if (i <= 0x7FF) {
+    utf8_str.push_back(static_cast<char>(0xC0 | (i >> 6)));
+    utf8_str.push_back(static_cast<char>(0x80 | (i & 0x3F)));
+  } else if (i <= 0xFFFF) {
+    utf8_str.push_back(static_cast<char>(0xE0 | (i >> 12)));
+    utf8_str.push_back(static_cast<char>(0x80 | ((i >> 6) & 0x3F)));
+    utf8_str.push_back(static_cast<char>(0x80 | (i & 0x3F)));
+  } else {
+    utf8_str.push_back(static_cast<char>(0xF0 | (i >> 18)));
+    utf8_str.push_back(static_cast<char>(0x80 | ((i >> 12) & 0x3F)));
+    utf8_str.push_back(static_cast<char>(0x80 | ((i >> 6) & 0x3F)));
+    utf8_str.push_back(static_cast<char>(0x80 | (i & 0x3F)));
+  }
+  return utf8_str;
+}
+
+static void testSingleUtf8(CuTest *tc) {
+  for (uint32_t i = 1; i <= 0x10FFFF; ++i) {
+    std::string s = ranUtf8String(i);
+    if (s.empty()) continue;
+    std::wstring ws = StringUtil::string_to_wstring(s);
+    CLUCENE_ASSERT(ws.size() == 1);
+    CLUCENE_ASSERT(ws.size() == wcslen(ws.c_str()));
+  }
+}
+
+// day
+unsigned getSeed() {
+  auto now = std::chrono::system_clock::now();
+  auto now_time_t = std::chrono::system_clock::to_time_t(now);
+
+  std::tm now_tm = *std::localtime(&now_time_t);
+
+  now_tm.tm_hour = 0;
+  now_tm.tm_min = 0;
+  now_tm.tm_sec = 0;
+
+  auto today_time_t = std::mktime(&now_tm);
+  return static_cast<unsigned>(today_time_t);
+}
+
+static void testMultiUtf8(CuTest *tc) {
+  unsigned seed = getSeed();
+  std::mt19937 generator(seed);
+
+  for (int32_t i = 0; i < 10000; i++) {
+    std::string s;
+    int32_t k = 0;
+    for (int32_t j = 0; j < 3; j++) {
+      std::uniform_int_distribution<uint32_t> dis(1, 1114111);
+      uint32_t random_code_point = dis(generator);
+      std::string temp = ranUtf8String(random_code_point);
+      if (temp.empty()) continue;
+      s += temp;
+      k++;
+    }
+    std::wstring ws = StringUtil::string_to_wstring(s);
+    CLUCENE_ASSERT(ws.size() == k);
+    CLUCENE_ASSERT(ws.size() == wcslen(ws.c_str()));
+  }
+}
+
+string generateBlobString(int length) {
+  vector<char> data(length);
+
+  uint32_t seed = getSeed();
+  std::mt19937 generator(seed);
+
+  for (int i = 0; i < length; i++) {
+    std::uniform_int_distribution<uint32_t> dis(0, 256);
+    uint32_t code = dis(generator);
+    data[i] = (char)code;
+  }
+
+  return string(data.begin(), data.end());
+}
+
+static void testAll(CuTest *tc) {
+  for (int32_t i = 0; i < 10000; i++) {
+    string s = generateBlobString(100);
+    std::wstring ws = StringUtil::string_to_wstring(s);
+  }
+}
+
+CuSuite *testStrConvert(void) {
+  CuSuite *suite = CuSuiteNew(_T("CLucene str convert Test"));
+
+  SUITE_ADD_TEST(suite, testSingleUtf8);
+  SUITE_ADD_TEST(suite, testMultiUtf8);
+  SUITE_ADD_TEST(suite, testAll);
+  return suite;
+}
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[doris-thirdparty] branch clucene updated: [fix] compatible with utf8 and invalid utf8 (#110)

Reply via email to