This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
new a24fa95a [Fix] fix compile and unit test problems (#100)
a24fa95a is described below
commit a24fa95aa8935c980e3040dda8f948bf3a1b73a3
Author: airborne12 <[email protected]>
AuthorDate: Wed Jul 12 15:58:48 2023 +0800
[Fix] fix compile and unit test problems (#100)
1. Fix CMake configuration when building the CLucene tests alone.
2. Revise and add more Chinese unit tests.
---
src/contribs-lib/CMakeLists.txt | 12 +++----
src/test/contribs-lib/analysis/testChinese.cpp | 48 ++++++++++++++++++++------
2 files changed, 44 insertions(+), 16 deletions(-)
diff --git a/src/contribs-lib/CMakeLists.txt b/src/contribs-lib/CMakeLists.txt
index df959fee..afc752e8 100644
--- a/src/contribs-lib/CMakeLists.txt
+++ b/src/contribs-lib/CMakeLists.txt
@@ -88,12 +88,12 @@ ENDIF()
file(GLOB_RECURSE HEADERS ${clucene-contribs-lib_SOURCE_DIR}/*.h)
#add extra capabilities
-find_package(ZLIB)
-IF ( NOT ZLIB_FOUND )
- MESSAGE ( FATAL "ZLib not found" )
-ENDIF ( NOT ZLIB_FOUND )
-INCLUDE_DIRECTORIES( ${ZLIB_INCLUDE_DIR} )
-SET ( clucene_contrib_extra_libs "${clucene_contrib_extra_libs}" ${ZLIB_LIBRARIES} )
+#find_package(ZLIB)
+#IF ( NOT ZLIB_FOUND )
+# MESSAGE ( FATAL "ZLib not found" )
+#ENDIF ( NOT ZLIB_FOUND )
+#INCLUDE_DIRECTORIES( ${ZLIB_INCLUDE_DIR} )
+#SET ( clucene_contrib_extra_libs "${clucene_contrib_extra_libs}" ${ZLIB_LIBRARIES} )
find_package(Iconv)
#find_package(Strigi)
diff --git a/src/test/contribs-lib/analysis/testChinese.cpp b/src/test/contribs-lib/analysis/testChinese.cpp
index 95f0c24b..7e47aa2d 100644
--- a/src/test/contribs-lib/analysis/testChinese.cpp
+++ b/src/test/contribs-lib/analysis/testChinese.cpp
@@ -212,8 +212,6 @@ void testSimpleJiebaAllModeTokenizer(CuTest* tc) {
a.initDict(get_dict_path());
ts = a.tokenStream(_T("contents"), stringReader);
- CLUCENE_ASSERT(ts->next(&t) != NULL);
- CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("我")) == 0);
CLUCENE_ASSERT(ts->next(&t) != NULL);
CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("来到")) == 0);
CLUCENE_ASSERT(ts->next(&t) != NULL);
@@ -230,6 +228,43 @@ void testSimpleJiebaAllModeTokenizer(CuTest* tc) {
_CLDELETE(ts);
}
+void testSimpleJiebaDefaultModeTokenizer2(CuTest* tc) {
+ LanguageBasedAnalyzer a;
+ const char* field_value_data = "中国的科技发展在世界上处于领先";
+ auto stringReader =
+            _CLNEW lucene::util::SStringReader<char>(field_value_data, strlen(field_value_data), false);
+ TokenStream* ts;
+ Token t;
+
+ //test with chinese
+ a.setLanguage(_T("chinese"));
+ a.setStem(false);
+ a.setMode(lucene::analysis::AnalyzerMode::Default);
+ a.initDict(get_dict_path());
+ ts = a.tokenStream(_T("contents"), stringReader);
+
+ /*char tmp[255] = {};
+ while(ts->next(&t) != nullptr) {
+ lucene_wcstoutf8(tmp, t.termBuffer<TCHAR>(), 254);
+ std::cout << tmp << std::endl;
+ }*/
+
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("中国")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("科技")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("发展")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("在世界上")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("处于")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) != NULL);
+ CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("领先")) == 0);
+ CLUCENE_ASSERT(ts->next(&t) == NULL);
+ _CLDELETE(ts);
+}
+
void testSimpleJiebaDefaultModeTokenizer(CuTest* tc) {
LanguageBasedAnalyzer a;
const char* field_value_data = "我来到北京清华大学";
@@ -245,8 +280,6 @@ void testSimpleJiebaDefaultModeTokenizer(CuTest* tc) {
a.initDict(get_dict_path());
ts = a.tokenStream(_T("contents"), stringReader);
- CLUCENE_ASSERT(ts->next(&t) != NULL);
- CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("我")) == 0);
CLUCENE_ASSERT(ts->next(&t) != NULL);
CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("来到")) == 0);
CLUCENE_ASSERT(ts->next(&t) != NULL);
@@ -330,14 +363,10 @@ void testSimpleJiebaTokenizer2(CuTest* tc) {
CLUCENE_ASSERT(ts->next(&t) != NULL);
CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("人民")) == 0);
CLUCENE_ASSERT(ts->next(&t) != NULL);
- CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("可以")) == 0);
- CLUCENE_ASSERT(ts->next(&t) != NULL);
CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("得到")) == 0);
CLUCENE_ASSERT(ts->next(&t) != NULL);
CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("更")) == 0);
CLUCENE_ASSERT(ts->next(&t) != NULL);
- CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("多")) == 0);
- CLUCENE_ASSERT(ts->next(&t) != NULL);
CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("实惠")) == 0);
CLUCENE_ASSERT(ts->next(&t) == NULL);
_CLDELETE(ts);
@@ -380,8 +409,6 @@ void testSimpleJiebaTokenizer4(CuTest* tc) {
CLUCENE_ASSERT(ts->next(&t) != NULL);
CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("人民")) == 0);
CLUCENE_ASSERT(ts->next(&t) != NULL);
- CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T(",")) == 0);
- CLUCENE_ASSERT(ts->next(&t) != NULL);
CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("银行")) == 0);
CLUCENE_ASSERT(ts->next(&t) == NULL);
_CLDELETE(ts);
@@ -1280,6 +1307,7 @@ CuSuite *testchinese(void) {
SUITE_ADD_TEST(suite, testJiebaMatchHuge);
SUITE_ADD_TEST(suite, testSimpleJiebaAllModeTokenizer);
SUITE_ADD_TEST(suite, testSimpleJiebaDefaultModeTokenizer);
+ SUITE_ADD_TEST(suite, testSimpleJiebaDefaultModeTokenizer2);
SUITE_ADD_TEST(suite, testSimpleJiebaSearchModeTokenizer);
SUITE_ADD_TEST(suite, testSimpleJiebaAllModeTokenizer2);
SUITE_ADD_TEST(suite, testSimpleJiebaSearchModeTokenizer2);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]