This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit b9f683a1d77a5e54052a58e9f8f0c3dd45fe57d4
Author: Sergey Kamov <[email protected]>
AuthorDate: Wed Dec 14 14:19:39 2022 +0400

    WIP.
---
 .../nlp/enrichers/NCEnStopWordsTokenEnricher.scala | 54 ++++++++++++++--------
 .../nlp/enrichers/NCStopWordsEnricherSpec.scala    | 36 +++++++++++++++
 2 files changed, 71 insertions(+), 19 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
index 3fc1e57a..3ac14f84 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
@@ -198,11 +198,14 @@ class NCEnStopWordsTokenEnricher(
     private var stopWords: StopWordHolder = _
     private var exceptions: StopWordHolder = _
 
+    private case class TokenExtra(lemma: String, stem: String):
+        val lemmaStem: String = getStem(lemma)
+
     init()
 
     private def read(path: String): Set[String] = NCUtils.readTextGzipResource(path, "UTF-8", logger).toSet
-    private def stem(s: String): String = stemmer.stem(s.toLowerCase)
-    private def toStemKey(toks: Seq[NCToken]): String = toks.map(_.getText).map(stem).mkString(" ")
+    private def getStem(s: String): String = stemmer.stem(s.toLowerCase)
+    private def toStemKey(toks: Seq[NCToken]): String = toks.map(_.getText).map(getStem).mkString(" ")
 
     /**
       * Stop words holder, used for hash search.
@@ -293,11 +296,11 @@ class NCEnStopWordsTokenEnricher(
         wildcardsOrigins.matches(toOriginalKey(toks), posOpt)
 
     /**
-      *
+      *
       */
     private def init(): Unit =
-        addStems = if addStopsSet == null then Set.empty else addStopsSet.map(stem)
-        exclStems = if exclStopsSet == null then Set.empty else exclStopsSet.map(stem)
+        addStems = if addStopsSet == null then Set.empty else addStopsSet.map(getStem)
+        exclStems = if exclStopsSet == null then Set.empty else exclStopsSet.map(getStem)
 
         def check(name: String, set: Set[String]): Unit =
             if set.exists(_.exists(_.isWhitespace)) then throw E(s"$name contain a string with whitespaces.")
@@ -308,7 +311,7 @@ class NCEnStopWordsTokenEnricher(
         val dups = addStems.intersect(exclStems)
         if dups.nonEmpty then E(s"Duplicate stems detected between additional and excluded stopwords [dups=${dups.mkString(",")}]")
 
-        percents = PERCENTS.map(stem)
+        percents = PERCENTS.map(getStem)
 
         // Stemmatization is done already by generator.
         NCUtils.execPar(
@@ -429,7 +432,7 @@ class NCEnStopWordsTokenEnricher(
                     val (word, form) =
                         if isCase then (s, ORIG)
                         else
-                            if !hasPoses then (stem(s), STEM) else (stem(s), LEM)
+                            if !hasPoses then (getStem(s), STEM) else (getStem(s), LEM)
                     mHash((isExc, form)).addCondition(word, poses)
                 else
                     val b = s.take(idxWild)
@@ -506,23 +509,29 @@ class NCEnStopWordsTokenEnricher(
     /**
       * Marks as stopwords, words with POS from configured list, which also placed before another stop words.
       */
-    private def processCommonStops(ns: Seq[NCToken], stops: mutable.HashSet[NCToken]): Unit =
+    private def processCommonStops(ns: Seq[NCToken], extraToks: Map[NCToken, TokenExtra], stops: mutable.HashSet[NCToken]): Unit =
         /**
           * Marks as stopwords, words with POS from configured list, which also placed before another stop words.
           */
        @tailrec
-        def processCommonStops0(ns: Seq[NCToken]): Unit =
+        def processCommonStops0(ns: Seq[NCToken], extraToks: Map[NCToken, TokenExtra]): Unit =
             val max = ns.size - 1
             var stop = true
 
-            for ((tok, idx) <- ns.zipWithIndex if idx != max && !isStopWord(tok) && !exclStems.contains(stem(tok.getText)) &&
-                POSES.contains(getPos(tok)) && isStopWord(ns(idx + 1)))
+            for (
+                (tok, idx) <- ns.zipWithIndex;
+                extra = extraToks(tok)
+                if
+                    idx != max && !isStopWord(tok) &&
+                    !exclStems.contains(extra.stem) &&
+                    !exclStems.contains(extra.lemmaStem) &&
+                    POSES.contains(getPos(tok)) && isStopWord(ns(idx + 1)))
                 stops += tok
                 stop = false
 
-            if !stop then processCommonStops0(ns)
+            if !stop then processCommonStops0(ns, extraToks)
 
-        processCommonStops0(ns)
+        processCommonStops0(ns, extraToks)
 
     /** @inheritdoc */
     override def enrich(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): Unit =
@@ -535,11 +544,15 @@ class NCEnStopWordsTokenEnricher(
 
         val stops = mutable.HashSet.empty[NCToken]
 
-        for (tok <- toks)
+        val extraToks =
+            scala.collection.mutable.LinkedHashMap.empty[NCToken, TokenExtra] ++=
+                toks.map(t => t -> TokenExtra(getLemma(t), getStem(t.getText)))
+
+        for ((tok, extra) <- extraToks)
             val idx = tok.getIndex
             val pos = getPos(tok)
-            val lemma = getLemma(tok)
-            val st = stem(tok.getText)
+            val lemma = extra.lemma
+            val st = extra.stem
 
             def isFirst: Boolean = idx == 0
             def isLast: Boolean = idx == toks.length - 1
@@ -615,10 +628,13 @@ class NCEnStopWordsTokenEnricher(
         // |                     Pass #6.                    |
         // | Processing additional and excluded stop words.  |
         // +-------------------------------------------------+
-        for (t <- toks if addStems.contains(stem(t.getText)))
+        for ((t, extra) <- extraToks if addStems.contains(extra.stem) || addStems.contains(extra.lemmaStem))
            stops += t
 
-        for (t <- stops.filter(t => exclStems.contains(stem(t.getText))))
+        for (t <- stops.filter( t =>
+            val extra = extraToks(t)
+            exclStems.contains(extra.stem) || exclStems.contains(extra.lemmaStem))
+        )
            stops -= t
 
         // +-------------------------------------------------+
         // |                     Pass #7.                    |
         // | Marks as stopwords, words with POS from         |
         // | configured list, which also placed before       |
         // | another stop words.                             |
         // +-------------------------------------------------+
-        processCommonStops(toks, stops)
+        processCommonStops(toks, extraToks, stops)
 
         // +-------------------------------------------------+
         // |                     Pass #8.                    |
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala
index d0c92d40..142c16b4 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala
@@ -21,6 +21,7 @@ import org.apache.nlpcraft.*
 import internal.util.NCResourceReader
 import nlp.util.*
 import nlp.enrichers.NCEnStopWordsTokenEnricher
+import org.apache.nlpcraft.nlp.stemmer.NCStemmer
 import org.scalatest.funsuite.AnyFunSuite
 
 /**
@@ -60,4 +61,39 @@ class NCStopWordsEnricherSpec extends AnyFunSuite:
         false,
         true
     )
+    // The synonym is defined as lemma => all kind of input words should be found.
+    test(
+        new NCEnStopWordsTokenEnricher(Set("woman")),
+        "woman women",
+        true,
+        true
+    )
+    // The synonym is defined in some form => only in the same form input words should be found.
+    test(
+        new NCEnStopWordsTokenEnricher(Set("women")),
+        "woman women",
+        false,
+        true
+    )
+    // The synonym is defined in some form, but stemmer is very rough => all kind of input words should be found.
+    test(
+        new NCEnStopWordsTokenEnricher(addStopsSet = Set("women"), stemmer = _.take(3)),
+        "woman women",
+        true,
+        true
+    )
+    // The synonym is defined as lemma => all kind of input words should be found, but excluded set is defined.
+    test(
+        new NCEnStopWordsTokenEnricher(Set("woman"), Set("women")),
+        "woman women",
+        true,
+        false
+    )
+    // Very rough stemmer defined.
+    test(
+        new NCEnStopWordsTokenEnricher(addStopsSet = Set("women"), stemmer = _.head.toString),
+        "weather windows",
+        true,
+        true
+    )
 }
\ No newline at end of file
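
For context, the behavior exercised by the new tests can be summarized with a small, self-contained Scala 3 sketch. It does not use the NLPCraft API; getStem, lemmas, TokenExtra-like fields and isAddedStop below are illustrative stand-ins. The point of the patch is that a token now counts as an additional stop word when either the stem of its text or the stem of its lemma is in the configured set, which is why Set("woman") is expected to cover both "woman" and "women".

// Illustrative sketch only (not the NLPCraft API). The "stemmer" here is a
// hypothetical stand-in that just lowercases; the lemma table covers only the
// example words.
def getStem(s: String): String = s.toLowerCase

val lemmas = Map("women" -> "woman")

// Mirrors the patch's TokenExtra: stem of the token text plus stem of its lemma.
case class TokenExtra(stem: String, lemmaStem: String)

def extraOf(word: String): TokenExtra =
    TokenExtra(getStem(word), getStem(lemmas.getOrElse(word.toLowerCase, word)))

// A word is treated as an additional stop word when either stem is configured.
def isAddedStop(word: String, addStems: Set[String]): Boolean =
    val e = extraOf(word)
    addStems.contains(e.stem) || addStems.contains(e.lemmaStem)

@main def lemmaStemSketch(): Unit =
    val addStems = Set("woman").map(getStem) // stop word configured in lemma form
    println(isAddedStop("woman", addStems))  // true
    println(isAddedStop("women", addStems))  // true, matched via the lemma's stem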
