[incubator-nlpcraft] 02/03: WIP.

sergeykamov Mon, 19 Dec 2022 00:34:09 -0800

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git


commit 4b1d67b7910107371da42bb2b8d641dcced52ab8
Author: Sergey Kamov <[email protected]>
AuthorDate: Mon Dec 19 11:19:10 2022 +0400

    WIP.
---
 .../main/resources/stopwords/first_words.txt.gz    | Bin 4024880 -> 0 bytes
 .../src/main/resources/stopwords/noun_words.txt.gz | Bin 862 -> 0 bytes
 .../nlp/enrichers/NCStopWordsEnricherSpec.scala    |  23 ++++++++++++++-------
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/nlpcraft/src/main/resources/stopwords/first_words.txt.gz b/nlpcraft/src/main/resources/stopwords/first_words.txt.gz
deleted file mode 100644
index e92748b4..00000000
Binary files a/nlpcraft/src/main/resources/stopwords/first_words.txt.gz and /dev/null differ
diff --git a/nlpcraft/src/main/resources/stopwords/noun_words.txt.gz b/nlpcraft/src/main/resources/stopwords/noun_words.txt.gz
deleted file mode 100644
index bfeb6fac..00000000
Binary files a/nlpcraft/src/main/resources/stopwords/noun_words.txt.gz and /dev/null differ
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala
index 142c16b4..b81ee116 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala
@@ -56,43 +56,50 @@ class NCStopWordsEnricherSpec extends AnyFunSuite:
             false
         )
         test(
-            new NCEnStopWordsTokenEnricher(Set("test"), Set("the")),
+            new NCEnStopWordsTokenEnricher(addSet = Set("test"), exclSet = Set("the")),
             "the test",
             false,
             true
         )
         // The synonym is defined as lemma => all kind of input words should be found.
         test(
-            new NCEnStopWordsTokenEnricher(Set("woman")),
+            new NCEnStopWordsTokenEnricher(addSet = Set("woman")),
             "woman women",
             true,
             true
         )
         // The synonym is defined in some form => only in the same form input words should be found.
         test(
-            new NCEnStopWordsTokenEnricher(Set("women")),
+            new NCEnStopWordsTokenEnricher(addSet = Set("women")),
             "woman women",
             false,
             true
         )
         // The synonym is defined in some form, but stemmer is very rough =>  all kind of input words should be found.
         test(
-            new NCEnStopWordsTokenEnricher(addStopsSet = Set("women"), stemmer = _.take(3)),
+            new NCEnStopWordsTokenEnricher(addSet = Set("women"), stemmer = _.take(3)),
             "woman women",
             true,
             true
         )
         // The synonym is defined as lemma => all kind of input words should be found, but excluded set is defined.
         test(
-            new NCEnStopWordsTokenEnricher(Set("woman"), Set("women")),
+            new NCEnStopWordsTokenEnricher(addSet = Set("woman"), exclSet = Set("women")),
             "woman women",
             true,
             false
         )
-        // Very rough stemmer defined.
+        // Very rough stemmers defined.
         test(
-            new NCEnStopWordsTokenEnricher(addStopsSet = Set("women"), stemmer = _.head.toString),
-            "weather windows",
+            new NCEnStopWordsTokenEnricher(addSet = Set("women"), stemmer = _.head.toString),
+            "weather windows noun",
+            true,
+            true,
+            false
+        )
+        test(
+            new NCEnStopWordsTokenEnricher(stemmer = _ => ""),
+            "weather noun",
             true,
             true
         )

[incubator-nlpcraft] 02/03: WIP.

Reply via email to