This is an automated email from the ASF dual-hosted git repository. sergeykamov pushed a commit to branch NLPCRAFT-520 in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 4b1d67b7910107371da42bb2b8d641dcced52ab8 Author: Sergey Kamov <[email protected]> AuthorDate: Mon Dec 19 11:19:10 2022 +0400 WIP. --- .../main/resources/stopwords/first_words.txt.gz | Bin 4024880 -> 0 bytes .../src/main/resources/stopwords/noun_words.txt.gz | Bin 862 -> 0 bytes .../nlp/enrichers/NCStopWordsEnricherSpec.scala | 23 ++++++++++++++------- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/nlpcraft/src/main/resources/stopwords/first_words.txt.gz b/nlpcraft/src/main/resources/stopwords/first_words.txt.gz deleted file mode 100644 index e92748b4..00000000 Binary files a/nlpcraft/src/main/resources/stopwords/first_words.txt.gz and /dev/null differ diff --git a/nlpcraft/src/main/resources/stopwords/noun_words.txt.gz b/nlpcraft/src/main/resources/stopwords/noun_words.txt.gz deleted file mode 100644 index bfeb6fac..00000000 Binary files a/nlpcraft/src/main/resources/stopwords/noun_words.txt.gz and /dev/null differ diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala index 142c16b4..b81ee116 100644 --- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala +++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala @@ -56,43 +56,50 @@ class NCStopWordsEnricherSpec extends AnyFunSuite: false ) test( - new NCEnStopWordsTokenEnricher(Set("test"), Set("the")), + new NCEnStopWordsTokenEnricher(addSet = Set("test"), exclSet = Set("the")), "the test", false, true ) // The synonym is defined as lemma => all kind of input words should be found. test( - new NCEnStopWordsTokenEnricher(Set("woman")), + new NCEnStopWordsTokenEnricher(addSet = Set("woman")), "woman women", true, true ) // The synonym is defined in some form => only in the same form input words should be found. test( - new NCEnStopWordsTokenEnricher(Set("women")), + new NCEnStopWordsTokenEnricher(addSet = Set("women")), "woman women", false, true ) // The synonym is defined in some form, but stemmer is very rough => all kind of input words should be found. test( - new NCEnStopWordsTokenEnricher(addStopsSet = Set("women"), stemmer = _.take(3)), + new NCEnStopWordsTokenEnricher(addSet = Set("women"), stemmer = _.take(3)), "woman women", true, true ) // The synonym is defined as lemma => all kind of input words should be found, but excluded set is defined. test( - new NCEnStopWordsTokenEnricher(Set("woman"), Set("women")), + new NCEnStopWordsTokenEnricher(addSet = Set("woman"), exclSet = Set("women")), "woman women", true, false ) - // Very rough stemmer defined. + // Very rough stemmers defined. test( - new NCEnStopWordsTokenEnricher(addStopsSet = Set("women"), stemmer = _.head.toString), - "weather windows", + new NCEnStopWordsTokenEnricher(addSet = Set("women"), stemmer = _.head.toString), + "weather windows noun", + true, + true, + false + ) + test( + new NCEnStopWordsTokenEnricher(stemmer = _ => ""), + "weather noun", true, true )
