This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
new a24fa95a [Fix] fix compile and unit test problems (#100)
a24fa95a is described below
commit a24fa95aa8935c980e3040dda8f948bf3a1b73a3
Author: airborne12 <[email protected]>
AuthorDate: Wed Jul 12 15:58:48 2023 +0800
[Fix] fix compile and unit test problems (#100)
1. Fix CMake configuration when building the CLucene tests alone.
2. Revise and add more Chinese unit tests.
---
src/contribs-lib/CMakeLists.txt | 12 +++----
src/test/contribs-lib/analysis/testChinese.cpp | 48 ++++++++++++++++++++------
2 files changed, 44 insertions(+), 16 deletions(-)
diff --git a/src/contribs-lib/CMakeLists.txt b/src/contribs-lib/CMakeLists.txt
index df959fee..afc752e8 100644
--- a/src/contribs-lib/CMakeLists.txt
+++ b/src/contribs-lib/CMakeLists.txt
@@ -88,12 +88,12 @@ ENDIF()
file(GLOB_RECURSE HEADERS ${clucene-contribs-lib_SOURCE_DIR}/*.h)
#add extra capabilities
-find_package(ZLIB)
-IF ( NOT ZLIB_FOUND )
- MESSAGE ( FATAL "ZLib not found" )
-ENDIF ( NOT ZLIB_FOUND )
-INCLUDE_DIRECTORIES( ${ZLIB_INCLUDE_DIR} )
-SET ( clucene_contrib_extra_libs "${clucene_contrib_extra_libs}" ${ZLIB_LIBRARIES} )
+#find_package(ZLIB)
+#IF ( NOT ZLIB_FOUND )
+# MESSAGE ( FATAL "ZLib not found" )
+#ENDIF ( NOT ZLIB_FOUND )
+#INCLUDE_DIRECTORIES( ${ZLIB_INCLUDE_DIR} )
+#SET ( clucene_contrib_extra_libs "${clucene_contrib_extra_libs}" ${ZLIB_LIBRARIES} )
find_package(Iconv)
#find_package(Strigi)
diff --git a/src/test/contribs-lib/analysis/testChinese.cpp b/src/test/contribs-lib/analysis/testChinese.cpp
index 95f0c24b..7e47aa2d 100644
--- a/src/test/contribs-lib/analysis/testChinese.cpp
+++ b/src/test/contribs-lib/analysis/testChinese.cpp
@@ -212,8 +212,6 @@ void testSimpleJiebaAllModeTokenizer(CuTest* tc) {
a.initDict(get_dict_path());
ts = a.tokenStream(_T("contents"), stringReader);
- CLUCENE_ASSERT(ts->next(&t) != NULL);
- CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("我")) == 0);
CLUCENE_ASSERT(ts->next(&t) != NULL);
CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("来到")) == 0);
CLUCENE_ASSERT(ts->next(&t) != NULL);
@@ -230,6 +228,43 @@ void testSimpleJiebaAllModeTokenizer(CuTest* tc) {
_CLDELETE(ts);
}
+void testSimpleJiebaDefaultModeTokenizer2(CuTest* tc) {
+ LanguageBasedAnalyzer a;
+ const char* field_value_data = "中国的科技发展在世界上处于领先";
+ auto stringReader =
+            _CLNEW lucene::util::SStringReader<char>(field_value_data, strlen(field_value_data), false);
+ TokenStream* ts;
+ Token t;
+
+ //test with chinese
+ a.setLanguage(_T("chinese"));
+ a.setStem(false);
+ a.setMode(lucene::analysis::AnalyzerMode::Default);
+ a.initDict(get_dict_path());
+ ts = a.tokenStream(_T("contents"), stringReader);
+
+ /*char tmp[255] = {};
+ while(ts->next(&t) != nullptr) {
+ lucene_wcstoutf8(tmp, t.termBuffer<TCHAR>(), 254);
+ std::cout << tmp << std::endl;
+ }*/
+
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("中国")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("科技")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("发展")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("在世界上")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("处于")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("领先")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) == NULL);
+ _CLDELETE(ts);
+}
+
void testSimpleJiebaDefaultModeTokenizer(CuTest* tc) {
LanguageBasedAnalyzer a;
const char* field_value_data = "我来到北京清华大学";
@@ -245,8 +280,6 @@ void testSimpleJiebaDefaultModeTokenizer(CuTest* tc) {
a.initDict(get_dict_path());
ts = a.tokenStream(_T("contents"), stringReader);
- CLUCENE_ASSERT(ts->next(&t) != NULL);
- CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("我")) == 0);
CLUCENE_ASSERT(ts->next(&t) != NULL);
CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("来到")) == 0);
CLUCENE_ASSERT(ts->next(&t) != NULL);
@@ -330,14 +363,10 @@ void testSimpleJiebaTokenizer2(CuTest* tc) {
CLUCENE_ASSERT(ts->next(&t) != NULL);
CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("人民")) == 0);
CLUCENE_ASSERT(ts->next(&t) != NULL);
- CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("可以")) == 0);
- CLUCENE_ASSERT(ts->next(&t) != NULL);
CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("得到")) == 0);
CLUCENE_ASSERT(ts->next(&t) != NULL);
CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("更")) == 0);
CLUCENE_ASSERT(ts->next(&t) != NULL);
- CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("多")) == 0);
- CLUCENE_ASSERT(ts->next(&t) != NULL);
CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("实惠")) == 0);
CLUCENE_ASSERT(ts->next(&t) == NULL);
_CLDELETE(ts);
@@ -380,8 +409,6 @@ void testSimpleJiebaTokenizer4(CuTest* tc) {
CLUCENE_ASSERT(ts->next(&t) != NULL);
CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("人民")) == 0);
CLUCENE_ASSERT(ts->next(&t) != NULL);
- CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T(",")) == 0);
- CLUCENE_ASSERT(ts->next(&t) != NULL);
CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("银行")) == 0);
CLUCENE_ASSERT(ts->next(&t) == NULL);
_CLDELETE(ts);
@@ -1280,6 +1307,7 @@ CuSuite *testchinese(void) {
SUITE_ADD_TEST(suite, testJiebaMatchHuge);
SUITE_ADD_TEST(suite, testSimpleJiebaAllModeTokenizer);
SUITE_ADD_TEST(suite, testSimpleJiebaDefaultModeTokenizer);
+ SUITE_ADD_TEST(suite, testSimpleJiebaDefaultModeTokenizer2);
SUITE_ADD_TEST(suite, testSimpleJiebaSearchModeTokenizer);
SUITE_ADD_TEST(suite, testSimpleJiebaAllModeTokenizer2);
SUITE_ADD_TEST(suite, testSimpleJiebaSearchModeTokenizer2);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]