This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
new dda894af [fix] compatible with utf8 and invalid utf8 (#110)
dda894af is described below
commit dda894af51024226f10336eea3d344cebeef310d
Author: zzzxl <[email protected]>
AuthorDate: Thu Aug 3 15:58:43 2023 +0800
[fix] compatible with utf8 and invalid utf8 (#110)
1. supports utf8 and non-utf8 strings
2. optimize string_to_wstring function
---
src/core/CLucene/index/TermInfosWriter.cpp | 61 +++++------------
src/core/CLucene/store/IndexOutput.cpp | 13 ----
src/core/CLucene/store/IndexOutput.h | 2 -
src/core/CLucene/util/stringUtil.h | 77 ++++++++++++++-------
src/shared/CLucene/config/utf8.cpp | 6 +-
src/test/CMakeLists.txt | 1 +
src/test/test.h | 1 +
src/test/tests.cpp | 1 +
src/test/util/testStrConvert.cpp | 106 +++++++++++++++++++++++++++++
9 files changed, 183 insertions(+), 85 deletions(-)
diff --git a/src/core/CLucene/index/TermInfosWriter.cpp b/src/core/CLucene/index/TermInfosWriter.cpp
index 9be81373..32b6a100 100644
--- a/src/core/CLucene/index/TermInfosWriter.cpp
+++ b/src/core/CLucene/index/TermInfosWriter.cpp
@@ -178,51 +178,26 @@ void STermInfosWriter<T>::close() {
template <typename T>
void STermInfosWriter<T>::writeTerm(int32_t fieldNumber, const T *termText,
int32_t termTextLength) {
if constexpr (std::is_same_v<T, char>) {
- std::string_view utf8Str(termText, termTextLength);
- if (StringUtil::validate_utf8(utf8Str) == 0) {
- int32_t utf8Length = 0;
- {
- size_t i = 0;
- for (; i < utf8Str.size();) {
- int32_t n = StringUtil::utf8_byte_count(utf8Str[i]);
- i += n;
- utf8Length++;
- }
- assert(i == utf8Str.size());
- }
+ std::string_view newTermStr(termText, termTextLength);
+ std::wstring newTermWStr = StringUtil::string_to_wstring(newTermStr);
- int32_t start = 0;
- int32_t utf8Start = 0;
- int32_t limit = termTextLength < lastTermTextLength ?
termTextLength : lastTermTextLength;
- auto prefixCompare = [this, &utf8Str, &termText](int32_t& start,
int32_t& utf8Start, int32_t limit) {
- while (start < limit) {
- int32_t n = StringUtil::utf8_byte_count(utf8Str[start]);
- for (int32_t j = 0; j < n; j++) {
- int32_t cur = start + j;
- if (termText[cur] != lastTermText.values[cur]) {
- return;
- }
- }
- start += n;
- utf8Start++;
- }
- };
-
- prefixCompare(start, utf8Start, limit);
- assert(start <= termTextLength);
- assert(utf8Start <= utf8Length);
- int32_t length = termTextLength - start;
- utf8Length -= utf8Start;
-
- // std::cout << "term: " << utf8Str << ", utf8Start: " <<
utf8Start << ", utf8Length: " << utf8Length << ", length: " << length <<
std::endl;
-
- output->writeVInt(utf8Start);
- output->writeVInt(utf8Length);
- output->writeU8SChars(termText + start, length);
- output->writeVInt(fieldNumber);
- } else {
- _CLTHROWA(CL_ERR_Runtime, (std::string("Not utf8, the character
encoding is abnormal: ") + std::string(utf8Str.data(),
utf8Str.size())).c_str());
+ std::string_view oldTermStr(lastTermText.values, lastTermTextLength);
+ std::wstring oldTermWStr = StringUtil::string_to_wstring(oldTermStr);
+
+ int32_t start = 0;
+ const int32_t limit = newTermWStr.length() < oldTermWStr.length() ?
newTermWStr.length() : oldTermWStr.length();
+ while (start < limit) {
+ if (newTermWStr[start] != oldTermWStr[start])
+ break;
+ start++;
}
+
+ int32_t length = newTermWStr.length() - start;
+
+ output->writeVInt(start);
+ output->writeVInt(length);
+ output->writeSChars(newTermWStr.data() + start, length);
+ output->writeVInt(fieldNumber);
} else {
int32_t start = 0;
const int32_t limit = termTextLength < lastTermTextLength ?
termTextLength : lastTermTextLength;
diff --git a/src/core/CLucene/store/IndexOutput.cpp b/src/core/CLucene/store/IndexOutput.cpp
index 1d44aff1..05e7695f 100644
--- a/src/core/CLucene/store/IndexOutput.cpp
+++ b/src/core/CLucene/store/IndexOutput.cpp
@@ -186,19 +186,6 @@ CL_NS_DEF(store)
writeBytes((const uint8_t*)s, length);
}
- void IndexOutput::writeU8SChars(const char* s, const int32_t length) {
- if ( length < 0 )
- _CLTHROWA(CL_ERR_IllegalArgument, "IO Argument Error. Value must be a
positive value.");
-
- for (int32_t i = 0; i < length;) {
- auto* chars = (const uint8_t*)s + i;
- int32_t n = StringUtil::utf8_byte_count(*chars);
- assert(n >= 1 && n <= 4);
- writeBytes(chars, (n > 2 ? 3 : n));
- i += n;
- }
- }
-
void IndexOutput::writeChars(const TCHAR* s, const int32_t length){
if ( length < 0 )
_CLTHROWA(CL_ERR_IllegalArgument, "IO Argument Error. Value must be a
positive value.");
diff --git a/src/core/CLucene/store/IndexOutput.h b/src/core/CLucene/store/IndexOutput.h
index e8eff025..6b6ca321 100644
--- a/src/core/CLucene/store/IndexOutput.h
+++ b/src/core/CLucene/store/IndexOutput.h
@@ -83,8 +83,6 @@ public:
void writeChars(const TCHAR* s, const int32_t length);
template<typename T>
void writeSChars(const T* s, int32_t length);
-
- void writeU8SChars(const char* s, const int32_t length);
/** Closes this stream to further operations. */
virtual void close() = 0;
diff --git a/src/core/CLucene/util/stringUtil.h b/src/core/CLucene/util/stringUtil.h
index 2c39b8f0..8e8ca1e9 100644
--- a/src/core/CLucene/util/stringUtil.h
+++ b/src/core/CLucene/util/stringUtil.h
@@ -11,6 +11,8 @@
#include <sse2neon.h>
#endif
+#include <cstring>
+
template <typename T>
const T* LUCENE_BLANK_SSTRING();
@@ -234,35 +236,62 @@ public:
int32_t surplus_bytes = 0;
uint32_t codepoint = 0;
for (uint8_t c : str) {
- if (bytes_in_char == 0) {
- if ((c & 0x80) == 0) {
- codepoint = c;
- continue;
- } else if ((c & 0xE0) == 0xC0) {
- codepoint = c & 0x1F;
- bytes_in_char = 1;
- } else if ((c & 0xF0) == 0xE0) {
- codepoint = c & 0x0F;
- bytes_in_char = 2;
- } else if ((c & 0xF8) == 0xF0) {
- codepoint = c & 0x07;
- bytes_in_char = 3;
+ if (bytes_in_char == 0) {
+ if ((c & 0x80) == 0) {
+ codepoint = c;
+ continue;
+ } else if ((c & 0xE0) == 0xC0) {
+ codepoint = c & 0x1F;
+ bytes_in_char = 1;
+ } else if ((c & 0xF0) == 0xE0) {
+ codepoint = c & 0x0F;
+ bytes_in_char = 2;
+ } else if ((c & 0xF8) == 0xF0) {
+ codepoint = c & 0x07;
+ bytes_in_char = 3;
+ } else {
+ return -1;
+ }
+ surplus_bytes = 1;
} else {
- return -1;
- }
- surplus_bytes = 1;
- } else {
- if ((c & 0xC0) != 0x80) return -1;
- codepoint = (codepoint << 6) | (c & 0x3F);
- if (!is_valid_codepoint(codepoint)) {
- return -1;
+ if ((c & 0xC0) != 0x80) return -1;
+ codepoint = (codepoint << 6) | (c & 0x3F);
+ if (!is_valid_codepoint(codepoint)) {
+ return -1;
+ }
+ bytes_in_char--;
+ surplus_bytes++;
}
- bytes_in_char--;
- surplus_bytes++;
- }
}
return bytes_in_char == 0 ? 0 : surplus_bytes;
}
+
+ // utf8: 1-4 char = 1 wchar_t, invalid utf8: 1 char = 1 wchar_t
+ static inline std::wstring string_to_wstring(const std::string_view&
utf8_str) {
+ std::wstring wstr;
+ wstr.reserve(utf8_str.size());
+ size_t i = 0;
+ while (i < utf8_str.size()) {
+ wchar_t wc = utf8_str[i];
+ int32_t n = utf8_byte_count(utf8_str[i]);
+ if ((n >= 1 && n <= 4) &&
+ (i + n <= utf8_str.size()) &&
+ validate_utf8(std::string_view(utf8_str.data() + i, n)) == 0) {
+ if (n == 2) {
+ wc = ((utf8_str[i] & 0x1F) << 6) | (utf8_str[i + 1] &
0x3F);
+ } else if (n == 3) {
+ wc = ((utf8_str[i] & 0x0F) << 12) | ((utf8_str[i + 1] &
0x3F) << 6) | (utf8_str[i + 2] & 0x3F);
+ } else if (n == 4) {
+ wc = ((utf8_str[i] & 0x07) << 18) | ((utf8_str[i + 1] &
0x3F) << 12) | ((utf8_str[i + 2] & 0x3F) << 6) | (utf8_str[i + 3] & 0x3F);
+ }
+ i += n;
+ } else {
+ i += 1;
+ }
+ wstr.push_back(wc);
+ }
+ return wstr;
+ }
};
#endif//_lucene_util__stringutil_H
diff --git a/src/shared/CLucene/config/utf8.cpp b/src/shared/CLucene/config/utf8.cpp
index 4f9f7546..e8d97e50 100644
--- a/src/shared/CLucene/config/utf8.cpp
+++ b/src/shared/CLucene/config/utf8.cpp
@@ -259,7 +259,7 @@ std::string lucene_wcstoutf8string(const wchar_t* str, size_t strlen){
return result;
}
-std::wstring lucene_utf8stows(const std::string_view& s) {
+std::wstring lucene_utf8stows(const std::string_view &s) {
std::wstring ws;
size_t size = 0;
for (size_t i = 0; i < s.size();) {
@@ -267,8 +267,8 @@ std::wstring lucene_utf8stows(const std::string_view& s) {
size++;
i += n;
}
- ws.resize(size + 1);
- lucene_utf8towcs(ws.data(), s.data(), s.length());
+ ws.resize(size);
+ lucene_utf8towcs(ws.data(), s.data(), ws.size());
return ws;
}
diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt
index 26fe12ea..e70bf1b8 100644
--- a/src/test/CMakeLists.txt
+++ b/src/test/CMakeLists.txt
@@ -100,6 +100,7 @@ SET(test_files ./tests.cpp
./util/TestMSBRadixSorter.cpp
./util/TestStringBuffer.cpp
./util/English.cpp
+ ./util/testStrConvert.cpp
${test_HEADERS})
IF (USE_SHARED_OBJECT_FILES)
GET_SHARED_FILES(clucene_shared_Files)
diff --git a/src/test/test.h b/src/test/test.h
index 7b185682..ed57d9d2 100644
--- a/src/test/test.h
+++ b/src/test/test.h
@@ -80,6 +80,7 @@ CuSuite *testSpanQueries(void);
CuSuite *testStringBuffer(void);
CuSuite *testTermVectorsReader(void);
CuSuite *teststandard95(void);
+CuSuite *testStrConvert(void);
#ifdef TEST_CONTRIB_LIBS
//CuSuite *testGermanAnalyzer(void);
diff --git a/src/test/tests.cpp b/src/test/tests.cpp
index 37465a6f..241cbe70 100644
--- a/src/test/tests.cpp
+++ b/src/test/tests.cpp
@@ -46,6 +46,7 @@ unittest tests[] = {
// {"spanqueries", testSpanQueries},
// {"stringbuffer", testStringBuffer},
// {"termvectorsreader", testTermVectorsReader},
+ {"strconvert", testStrConvert},
#ifdef TEST_CONTRIB_LIBS
{"chinese", testchinese},
#endif
diff --git a/src/test/util/testStrConvert.cpp b/src/test/util/testStrConvert.cpp
new file mode 100644
index 00000000..226ae64d
--- /dev/null
+++ b/src/test/util/testStrConvert.cpp
@@ -0,0 +1,106 @@
+#include <chrono>
+#include <random>
+
+#include "CLucene/util/stringUtil.h"
+#include "test.h"
+
+std::string ranUtf8String(uint32_t i) {
+ if (!StringUtil::is_valid_codepoint(i)) {
+ return "";
+ }
+
+ std::string utf8_str;
+ if (i <= 0x7F) {
+ utf8_str.push_back(static_cast<char>(i));
+ } else if (i <= 0x7FF) {
+ utf8_str.push_back(static_cast<char>(0xC0 | (i >> 6)));
+ utf8_str.push_back(static_cast<char>(0x80 | (i & 0x3F)));
+ } else if (i <= 0xFFFF) {
+ utf8_str.push_back(static_cast<char>(0xE0 | (i >> 12)));
+ utf8_str.push_back(static_cast<char>(0x80 | ((i >> 6) & 0x3F)));
+ utf8_str.push_back(static_cast<char>(0x80 | (i & 0x3F)));
+ } else {
+ utf8_str.push_back(static_cast<char>(0xF0 | (i >> 18)));
+ utf8_str.push_back(static_cast<char>(0x80 | ((i >> 12) & 0x3F)));
+ utf8_str.push_back(static_cast<char>(0x80 | ((i >> 6) & 0x3F)));
+ utf8_str.push_back(static_cast<char>(0x80 | (i & 0x3F)));
+ }
+ return utf8_str;
+}
+
+static void testSingleUtf8(CuTest *tc) {
+ for (uint32_t i = 1; i <= 0x10FFFF; ++i) {
+ std::string s = ranUtf8String(i);
+ if (s.empty()) continue;
+ std::wstring ws = StringUtil::string_to_wstring(s);
+ CLUCENE_ASSERT(ws.size() == 1);
+ CLUCENE_ASSERT(ws.size() == wcslen(ws.c_str()));
+ }
+}
+
+// day
+unsigned getSeed() {
+ auto now = std::chrono::system_clock::now();
+ auto now_time_t = std::chrono::system_clock::to_time_t(now);
+
+ std::tm now_tm = *std::localtime(&now_time_t);
+
+ now_tm.tm_hour = 0;
+ now_tm.tm_min = 0;
+ now_tm.tm_sec = 0;
+
+ auto today_time_t = std::mktime(&now_tm);
+ return static_cast<unsigned>(today_time_t);
+}
+
+static void testMultiUtf8(CuTest *tc) {
+ unsigned seed = getSeed();
+ std::mt19937 generator(seed);
+
+ for (int32_t i = 0; i < 10000; i++) {
+ std::string s;
+ int32_t k = 0;
+ for (int32_t j = 0; j < 3; j++) {
+ std::uniform_int_distribution<uint32_t> dis(1, 1114111);
+ uint32_t random_code_point = dis(generator);
+ std::string temp = ranUtf8String(random_code_point);
+ if (temp.empty()) continue;
+ s += temp;
+ k++;
+ }
+ std::wstring ws = StringUtil::string_to_wstring(s);
+ CLUCENE_ASSERT(ws.size() == k);
+ CLUCENE_ASSERT(ws.size() == wcslen(ws.c_str()));
+ }
+}
+
+string generateBlobString(int length) {
+ vector<char> data(length);
+
+ uint32_t seed = getSeed();
+ std::mt19937 generator(seed);
+
+ for (int i = 0; i < length; i++) {
+ std::uniform_int_distribution<uint32_t> dis(0, 256);
+ uint32_t code = dis(generator);
+ data[i] = (char)code;
+ }
+
+ return string(data.begin(), data.end());
+}
+
+static void testAll(CuTest *tc) {
+ for (int32_t i = 0; i < 10000; i++) {
+ string s = generateBlobString(100);
+ std::wstring ws = StringUtil::string_to_wstring(s);
+ }
+}
+
+CuSuite *testStrConvert(void) {
+ CuSuite *suite = CuSuiteNew(_T("CLucene str convert Test"));
+
+ SUITE_ADD_TEST(suite, testSingleUtf8);
+ SUITE_ADD_TEST(suite, testMultiUtf8);
+ SUITE_ADD_TEST(suite, testAll);
+ return suite;
+}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]