This is an automated email from the ASF dual-hosted git repository. sergeykamov pushed a commit to branch NLPCRAFT-520 in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 8f8e9cf487f250ae9b2ec5d3b47d0cbc8a38c165 Author: Sergey Kamov <[email protected]> AuthorDate: Mon Dec 19 12:34:14 2022 +0400 WIP. --- .../nlp/enrichers/impl/NCEnStopWordGenerator.scala | 4 +- .../nlp/enrichers/NCStopWordsEnricherSpec.scala | 57 ++++++++++++++-------- 2 files changed, 40 insertions(+), 21 deletions(-) diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/impl/NCEnStopWordGenerator.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/impl/NCEnStopWordGenerator.scala index b90e0567..3295738a 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/impl/NCEnStopWordGenerator.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/impl/NCEnStopWordGenerator.scala @@ -156,7 +156,7 @@ import org.apache.nlpcraft.nlp.enrichers.impl.NCEnStopWordGenerator.* */ private[enrichers] class NCEnStopWordGenerator(stemmer: NCStemmer): def mkNounWords(): Set[String] = - val buf = new mutable.HashSet[String]() + val buf = new mutable.ArrayBuffer[String]() for (w1 <- NOUN_WORDS) buf += s"$w1" @@ -167,7 +167,7 @@ private[enrichers] class NCEnStopWordGenerator(stemmer: NCStemmer): buf.map(stem).toSet def mkFirstWords(): Set[String] = - val buf = new mutable.HashSet[String]() + val buf = new mutable.ArrayBuffer[String]() // is there for (w1 <- QWORDS2) diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala index b81ee116..b5b0ee25 100644 --- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala +++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala @@ -18,11 +18,13 @@ package org.apache.nlpcraft.nlp.enrichers import org.apache.nlpcraft.* -import internal.util.NCResourceReader import nlp.util.* import nlp.enrichers.NCEnStopWordsTokenEnricher -import org.apache.nlpcraft.nlp.stemmer.NCStemmer import org.scalatest.funsuite.AnyFunSuite +import org.apache.nlpcraft.internal.util.NCUtils + +import scala.collection.* +import scala.concurrent.ExecutionContext /** * @@ -34,73 +36,90 @@ class NCStopWordsEnricherSpec extends AnyFunSuite: * @param txt * @param boolVals */ - private def test(stopEnricher: NCEnStopWordsTokenEnricher, txt: String, boolVals: Boolean*): Unit = - val toks = EN_TOK_PARSER.tokenize(txt) - require(toks.size == boolVals.size) + private def add(stopEnricher: => NCEnStopWordsTokenEnricher, txt: String, boolVals: Boolean*) + (using bodies: mutable.ArrayBuffer[() => Unit], errs: mutable.ArrayBuffer[Throwable]): Unit = + val body: () => Unit = () => + try + val toks = EN_TOK_PARSER.tokenize(txt) + require(toks.size == boolVals.size) + + toks.foreach(tok => require(tok.get[Boolean]("stopword").isEmpty)) - toks.foreach(tok => require(tok.get[Boolean]("stopword").isEmpty)) + val req = NCTestRequest(txt) - val req = NCTestRequest(txt) + EN_TOK_LEMMA_POS_ENRICHER.enrich(req, CFG, toks) + stopEnricher.enrich(req, CFG, toks) - EN_TOK_LEMMA_POS_ENRICHER.enrich(req, CFG, toks) - stopEnricher.enrich(req, CFG, toks) + NCTestUtils.printTokens(toks) + toks.zip(boolVals).foreach { (tok, boolVal) => require(tok[Boolean]("stopword") == boolVal) } + catch + case e: Throwable => errs.synchronized { errs += e } - NCTestUtils.printTokens(toks) - toks.zip(boolVals).foreach { (tok, boolVal) => require(tok[Boolean]("stopword") == boolVal) } + bodies += body test("test") { - test( + val errs = mutable.ArrayBuffer.empty[Throwable] + val bodies = mutable.ArrayBuffer.empty[() => Unit] + + given mutable.ArrayBuffer[Throwable] = errs + given mutable.ArrayBuffer[() => Unit] = bodies + + add( EN_TOK_STOP_ENRICHER, "the test", true, false ) - test( + add( new NCEnStopWordsTokenEnricher(addSet = Set("test"), exclSet = Set("the")), "the test", false, true ) // The synonym is defined as lemma => all kind of input words should be found. - test( + add( new NCEnStopWordsTokenEnricher(addSet = Set("woman")), "woman women", true, true ) // The synonym is defined in some form => only in the same form input words should be found. - test( + add( new NCEnStopWordsTokenEnricher(addSet = Set("women")), "woman women", false, true ) // The synonym is defined in some form, but stemmer is very rough => all kind of input words should be found. - test( + add( new NCEnStopWordsTokenEnricher(addSet = Set("women"), stemmer = _.take(3)), "woman women", true, true ) // The synonym is defined as lemma => all kind of input words should be found, but excluded set is defined. - test( + add( new NCEnStopWordsTokenEnricher(addSet = Set("woman"), exclSet = Set("women")), "woman women", true, false ) // Very rough stemmers defined. - test( + add( new NCEnStopWordsTokenEnricher(addSet = Set("women"), stemmer = _.head.toString), "weather windows noun", true, true, false ) - test( + add( new NCEnStopWordsTokenEnricher(stemmer = _ => ""), "weather noun", true, true ) + + NCUtils.execPar(bodies)(ExecutionContext.Implicits.global) + errs.foreach(_.printStackTrace) + require(errs.isEmpty) } \ No newline at end of file
