This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit e15b5cc6469e0d753c00931a49bcaf8b5c82ee6c
Author: Sergey Kamov <[email protected]>
AuthorDate: Tue Sep 14 16:49:51 2021 +0300

    WIP.
---
 .../nlpcraft/common/nlp/NCNlpSentenceToken.scala   |   4 +-
 .../apache/nlpcraft/model/NCModelFileAdapter.java  |   5 +
 .../org/apache/nlpcraft/model/NCModelView.java     |  11 ++
 .../nlpcraft/model/impl/json/NCModelJson.java      |   7 +
 .../apache/nlpcraft/probe/mgrs/NCProbeModel.scala  |   8 +-
 .../mgrs/nlp/enrichers/limit/NCLimitEnricher.scala |   2 +-
 .../enrichers/relation/NCRelationEnricher.scala    |   2 +-
 .../mgrs/nlp/enrichers/sort/NCSortEnricher.scala   |   2 +-
 .../enrichers/stopword/NCStopWordEnricher.scala    |  92 ++++++++++--
 .../probe/mgrs/sentence/NCSentenceManager.scala    |   3 +-
 .../nlp/enrichers/NCServerEnrichmentManager.scala  |  12 +-
 .../nlp/enrichers/numeric/NCNumericEnricher.scala  | 158 ++++++++++-----------
 .../enrichers/stopword/NCStopWordEnricher.scala    |   1 -
 .../abstract/NCAbstractTokensIntentsSpec.scala     |   6 +
 .../model/abstract/NCAbstractTokensModel.scala     |   2 +
 .../model/properties/NCTokensPropertiesSpec.scala  |   2 +
 .../model/stop/NCStopWordsAllowedSpec.scala        | 124 ++++++++++++++++
 .../model/NCEnricherNestedModelSpec.scala          |  15 +-
 .../model/NCEnricherNestedModelSpec4.scala         |   2 +
 19 files changed, 352 insertions(+), 106 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceToken.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceToken.scala
index 4b94b98..00f1dd0 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceToken.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceToken.scala
@@ -194,7 +194,9 @@ case class NCNlpSentenceToken(
      * @param reason
      */
     def addStopReason(reason: NCNlpSentenceNote): Unit = stopsReasons += reason
-
+    /**
+      *
+      */
     override def toString: String =
         notes.toSeq.sortBy(t => (if (t.isNlp) 0 else 1, t.noteType)).mkString("NLP token [", "|", "]")
 }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelFileAdapter.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelFileAdapter.java
index efa2b68..61cb84d 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelFileAdapter.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelFileAdapter.java
@@ -559,6 +559,11 @@ abstract public class NCModelFileAdapter extends NCModelAdapter {
     }
 
     @Override
+    public boolean isStopWordsAllowed() {
+        return proxy.isStopWordsAllowed();
+    }
+
+    @Override
     public Map<String, Set<String>> getRestrictedCombinations() {
         return restrictedCombinations;
     }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelView.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelView.java
index 30a2b40..19046d8 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelView.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelView.java
@@ -278,6 +278,9 @@ public interface NCModelView extends NCMetadata {
      */
     boolean DFLT_IS_NO_USER_TOKENS_ALLOWED = true;
 
+    // TODO:
+    boolean DFLT_IS_STOPWORDS_ALLOWED = true;
+
     /**
      * Default set of enabled built-in tokens. The following built-in tokens are enabled by default:
      * <ul>
@@ -1235,4 +1238,12 @@ public interface NCModelView extends NCMetadata {
     default Map<String, Set<String>> getRestrictedCombinations() {
         return Collections.emptyMap();
     }
+
+    /**
+     * TODO:
+     * @return
+     */
+    default boolean isStopWordsAllowed() {
+        return DFLT_IS_STOPWORDS_ALLOWED;
+    }
 }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/json/NCModelJson.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/json/NCModelJson.java
index f332e08..043297c 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/json/NCModelJson.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/json/NCModelJson.java
@@ -63,6 +63,7 @@ public class NCModelJson {
     private boolean maxSynonymsThresholdError = DFLT_MAX_SYNONYMS_THRESHOLD_ERROR;
     private long conversationTimeout = DFLT_CONV_TIMEOUT_MS;
     private int conversationDepth = DFLT_CONV_DEPTH;
+    private boolean isStopWordsAllowed = DFLT_IS_STOPWORDS_ALLOWED;
 
     public String getId() {
         return id;
@@ -278,4 +279,10 @@ public class NCModelJson {
         return restrictedCombinations;
     }
     public void setRestrictedCombinations(Map<String, String[]> restrictedCombinations) { this.restrictedCombinations = restrictedCombinations;}
+    public boolean isStopWordsAllowed() {
+        return isStopWordsAllowed;
+    }
+    public void setStopWordsAllowed(boolean stopWordsAllowed) {
+        isStopWordsAllowed = stopWordsAllowed;
+    }
 }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
index 75ae18b..ea41793 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
@@ -45,9 +45,13 @@ case class NCProbeModel(
     solver: NCIntentSolver,
     intents: Seq[NCIdlIntent],
     callbacks: Map[String /* Intent ID */, NCProbeModelCallback],
-    continuousSynonyms: Map[String /*Element ID*/ , Map[Int /*Synonym length*/ , NCProbeSynonymsWrapper]], // Fast access map.
+    continuousSynonyms:
+        Map[
+            String /*Element ID*/,
+            /*Fast access map.*/ Map[Int /*Synonym length*/ , NCProbeSynonymsWrapper]
+        ],
     sparseSynonyms: Map[String /*Element ID*/, Seq[NCProbeSynonym]],
-    idlSynonyms: Map[String /*Element ID*/ , Seq[NCProbeSynonym]], // Fast access map.
+    idlSynonyms: Map[String /*Element ID*/ , Seq[NCProbeSynonym]],
     addStopWordsStems: Set[String],
     exclStopWordsStems: Set[String],
     suspWordsStems: Set[String],
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
index 7bad3c5..0286db3 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
@@ -262,7 +262,7 @@ object NCLimitEnricher extends NCProbeEnricher {
 
             // Tries to grab tokens reverse way.
             // Example: A, B, C => ABC, BC, AB .. (BC will be processed first)
-            for (toks <- ns.tokenMixWithStopWords().sortBy(p => (-p.size, -p.head.index)) if validImportant(ns, toks)) {
+            for (toks <- ns.tokenMix().sortBy(p => (-p.size, -p.head.index)) if validImportant(ns, toks)) {
                 if (numsMap == null) {
                     numsMap = NCNumericManager.find(ns).map(p => p.tokens -> p).toMap
                     groupsMap = groupNums(ns, numsMap.values)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
index fa564b9..d44b4cb 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
@@ -163,7 +163,7 @@ object NCRelationEnricher extends NCProbeEnricher {
             // Example: A, B, C => ABC, AB, BC .. (AB will be processed first)
             val notes = mutable.HashSet.empty[NCNlpSentenceNote]
 
-            for (toks <- ns.tokenMixWithStopWords() if validImportant(ns, toks))
+            for (toks <- ns.tokenMix() if validImportant(ns, toks))
                 tryToMatch(toks) match {
                     case Some(m) =>
                         for (refNote <- m.refNotes if !restricted.contains(refNote)) {
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index 286c8b4..fdb6d9a 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -454,7 +454,7 @@ object NCSortEnricher extends NCProbeEnricher {
             val notes = mutable.HashSet.empty[NCNlpSentenceNote]
             val matches = mutable.ArrayBuffer.empty[Match]
 
-            for (toks <- ns.tokenMixWithStopWords() if validImportant(ns, toks)) {
+            for (toks <- ns.tokenMix() if validImportant(ns, toks)) {
                 tryToMatch(toks) match {
                     case Some(m) =>
                         if (!matches.exists(_.isSubCase(m)) && !m.intersect(restricted)) {
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/stopword/NCStopWordEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/stopword/NCStopWordEnricher.scala
index fc904d2..03e0ec9 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/stopword/NCStopWordEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/stopword/NCStopWordEnricher.scala
@@ -17,16 +17,17 @@
 
 package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.stopword
 
-import java.io.Serializable
-
 import io.opencensus.trace.Span
+import org.apache.nlpcraft.common.nlp.NCNlpSentenceToken.notes
 import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager
 import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceToken}
 import org.apache.nlpcraft.common.{NCE, NCService, U}
-import org.apache.nlpcraft.probe.mgrs.NCProbeModel
 import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
+import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeSynonymsWrapper}
+import java.io.Serializable
 
 import scala.annotation.tailrec
+import scala.collection.mutable
 
 /**
  * Stop words enricher.
@@ -215,6 +216,66 @@ object NCStopWordEnricher extends NCProbeEnricher {
         processCommonStops0(mdl, ns)
     }
 
+    /**
+      *
+      * @param mdl
+      * @param ns
+      */
+    private def eraseNlpStops(mdl: NCProbeModel, ns: NCNlpSentence): Unit = {
+        val impStops = mutable.HashSet.empty[NCNlpSentenceToken]
+
+        val allContSyns: Map[Int, Iterable[NCProbeSynonymsWrapper]] =
+            mdl.continuousSynonyms.values.flatMap(_.toSeq).groupBy(_._1).map(p => p._1 -> p._2.map(_._2))
+
+        for (toks <- ns.tokenMix(stopWords = true) if toks.exists(t => t.isStopWord && !impStops.contains(t))) {
+            allContSyns.get(toks.size) match {
+                case Some(ws) =>
+                    val stems = toks.map(_.stem).mkString(" ")
+
+                    if (ws.exists(w => w.txtDirectSynonyms.contains(stems) || w.txtNotDirectSynonyms.contains(stems)))
+                        impStops ++= toks.filter(_.isStopWord)
+
+                case None => // No-op.
+            }
+        }
+
+        val del = ns.tokens.filter(t => t.isStopWord && !impStops.contains(t))
+
+        impStops.foreach(t => ns.fixNote(t.getNlpNote, "stopWord" -> false))
+
+        if (del.nonEmpty) {
+            del.foreach(t => require(t.isNlp))
+
+            // TODO:
+            logger.info(
+                s"Stopwords deleted from sentence [" +
+                s"srvReqId=${ns.srvReqId}, " +
+                s"text=${ns.text}, " +
+                s"stopWords=${del.map(p => s"${p.origText}(index=${p.wordIndexes.head})").mkString("|")}" +
+                s"]"
+            )
+
+            val delIdxs = del.flatMap(_.wordIndexes).sorted
+
+            val old = ns.tokens.clone()
+
+            ns.tokens.clear()
+            ns.tokens ++= old.filter(t => !del.contains(t)).zipWithIndex.map { case (t, idx) => t.clone(idx) }
+
+            ns.tokens.foreach(t => {
+                val tokNotes = notes(t)
+
+                tokNotes.foreach(n => {
+                    val tokIdxs = n.tokenIndexes.map(i => i - delIdxs.count(_ < i))
+                    val wordIdxs = n.wordIndexes.map(i => i - delIdxs.count(_ < i))
+
+                    t.remove(n)
+                    t.add(n.clone(tokIdxs, wordIdxs))
+                })
+            })
+        }
+    }
+
     @throws[NCE]
     override def enrich(mdl: NCProbeModel, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = {
         require(isStarted)
@@ -225,12 +286,25 @@ object NCStopWordEnricher extends NCProbeEnricher {
         startScopedSpan(
             "enrich", parent, "srvReqId" -> ns.srvReqId, "mdlId" -> mdl.model.getId, "txt" -> ns.text
         ) { _ =>
-            mark(mdl.exclStopWordsStems, f = false)
-            mark(mdl.addStopWordsStems, f = true)
-            processGeo(ns)
-            processDate(ns)
-            processNums(ns)
-            processCommonStops(mdl, ns)
+            if (mdl.model.isStopWordsAllowed) {
+                mark(mdl.exclStopWordsStems, f = false)
+                mark(mdl.addStopWordsStems, f = true)
+
+                // If a stop word is swallowed by any built token (numeric, date, etc.), its stop word marking is dropped.
+                ns.filter(t => t.isStopWord && !t.isNlp).foreach(t => ns.fixNote(t.getNlpNote, "stopWord" -> false))
+
+                processGeo(ns)
+                processDate(ns)
+                processNums(ns)
+
+                eraseNlpStops(mdl, ns)
+
+                processCommonStops(mdl, ns)
+
+                eraseNlpStops(mdl, ns)
+            }
+            else
+                ns.filter(_.isStopWord).foreach(t => ns.fixNote(t.getNlpNote, "stopWord" -> false))
         }
     }
 }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index d5dfc1e..f6855ea 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -213,7 +213,8 @@ object NCSentenceManager extends NCService {
     private def simpleCopy(
         ns: NCNlpSentence,
         history: mutable.ArrayBuffer[(Int, Int)],
-        toksCopy: NCNlpSentence, i: Int
+        toksCopy: NCNlpSentence,
+        i: Int
     ): Seq[NCNlpSentenceToken] = {
         val tokCopy = toksCopy(i)
 
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala
index 636b263..12b21bd 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala
@@ -153,12 +153,12 @@ object NCServerEnrichmentManager extends NCService with NCIgniteInstance {
             catching(wrapIE) {
                 cache(normTxt) match {
                     case Some(h) =>
-                        if (h.enabledBuiltInTokens == normEnabledBuiltInToks) {
-                            prepareAsciiTable(h.sentence).info(logger, Some(s"Sentence enriched (from cache): '$normTxt'"))
-
-                            h.sentence
-                        }
-                        else
+//                        if (h.enabledBuiltInTokens == normEnabledBuiltInToks) {
+//                            prepareAsciiTable(h.sentence).info(logger, Some(s"Sentence enriched (from cache): '$normTxt'"))
+//
+//                            h.sentence
+//                        }
+//                        else
                             process(srvReqId, normTxt, enabledBuiltInToks, span)
                     case None =>
                         process(srvReqId, normTxt, enabledBuiltInToks, span)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/numeric/NCNumericEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/numeric/NCNumericEnricher.scala
index 670a4dc..cf39575 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/numeric/NCNumericEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/numeric/NCNumericEnricher.scala
@@ -207,7 +207,7 @@ object NCNumericEnricher extends NCServerEnricher {
         toIncl: Boolean,
         toFractional: Boolean,
         unitDataOpt: Option[NCNumericUnitData],
-    ): Seq[NCNlpSentenceNote] = {
+    ): Unit= {
         val params =
             mutable.ArrayBuffer.empty[(String, Any)] ++
             Seq(
@@ -223,7 +223,7 @@ ...
                 "isToPositiveInfinity" -> (to == MAX_VALUE)
             )
 
-        def mkAndAssign(toks: Seq[NCNlpSentenceToken], typ: String, params: (String, Any)*):NCNlpSentenceNote = {
+        def mkAndAssign(toks: Seq[NCNlpSentenceToken], params: (String, Any)*):NCNlpSentenceNote = {
            val note = NCNlpSentenceNote(toks.map(_.index), "nlpcraft:num", params:_*)
 
            toks.foreach(_.add(note))
@@ -241,17 +241,17 @@
                 }
 
                 if (unitData.tokens == toks)
-                    Seq(mkAndAssign(toks, "nlpcraft:num", extend():_*))
+                    Seq(mkAndAssign(toks, extend():_*))
                 else {
                     Seq(
                         mkAndAssign(
-                            toks.filter(t => !unitData.tokens.contains(t)), "nlpcraft:num", params.clone():_*
+                            toks.filter(t => !unitData.tokens.contains(t)), params.clone():_*
                         ),
-                        mkAndAssign(toks, "nlpcraft:num", extend():_*)
+                        mkAndAssign(toks, extend():_*)
                     )
                 }
 
-            case None => Seq(mkAndAssign(toks, "nlpcraft:num", params:_*))
+            case None => Seq(mkAndAssign(toks, params:_*))
         }
     }
 
@@ -316,7 +316,7 @@ ...
                     Some(NCNumericUnitData(num1.unitData.get.unit, num1.tokens ++ num2.tokens))
             }
 
-            val notes = p._2 match {
+            p._2 match {
                 case BETWEEN_EXCLUSIVE =>
                     mkNotes(
                         prepToks,
@@ -364,79 +364,75 @@ ...
 
             processed ++= toks
 
-            val notes =
-                prep.prepositionType match {
-                    case MORE =>
-                        mkNotes(
-                            toks,
-                            num.value,
-                            fromIncl = false,
-                            fromFractional = num.isFractional,
-                            to = MAX_VALUE,
-                            toIncl = true,
-                            toFractional = num.isFractional,
-                            num.unitData
-                        )
-                    case MORE_OR_EQUAL =>
-                        mkNotes(
-                            toks,
-                            num.value,
-                            fromIncl = true,
-                            fromFractional = num.isFractional,
-                            to = MAX_VALUE,
-                            toIncl = true,
-                            toFractional = num.isFractional,
-                            num.unitData
-                        )
-                    case LESS =>
-                        mkNotes(
-                            toks,
-                            MIN_VALUE,
-                            fromIncl = true,
-                            fromFractional = num.isFractional,
-                            to = num.value,
-                            toIncl = false,
-                            toFractional = num.isFractional,
-                            num.unitData
-                        )
-                    case LESS_OR_EQUAL =>
-                        mkNotes(
-                            toks,
-                            MIN_VALUE,
-                            fromIncl = true,
-                            fromFractional = num.isFractional,
-                            to = num.value,
-                            toIncl = true,
-                            toFractional = num.isFractional,
-                            num.unitData
-                        )
-                    case EQUAL =>
-                        mkNotes(
-                            toks,
-                            num.value,
-                            fromIncl = true,
-                            fromFractional = num.isFractional,
-                            to = num.value,
-                            toIncl = true,
-                            toFractional = num.isFractional,
-                            num.unitData
-                        )
-                    case NOT_EQUAL =>
-                        mkNotes(
-                            toks,
-                            num.value,
-                            fromIncl = false,
-                            fromFractional = num.isFractional,
-                            to = num.value,
-                            toIncl = false,
-                            toFractional = num.isFractional,
-                            num.unitData
-                        )
-                    case _ => throw new AssertionError(s"Illegal note type: ${prep.prepositionType}.")
-                }
-
-            for (note <- notes)
-                toks.foreach(_.add(note))
+            prep.prepositionType match {
+                case MORE =>
+                    mkNotes(
+                        toks,
+                        num.value,
+                        fromIncl = false,
+                        fromFractional = num.isFractional,
+                        to = MAX_VALUE,
+                        toIncl = true,
+                        toFractional = num.isFractional,
+                        num.unitData
+                    )
+                case MORE_OR_EQUAL =>
+                    mkNotes(
+                        toks,
+                        num.value,
+                        fromIncl = true,
+                        fromFractional = num.isFractional,
+                        to = MAX_VALUE,
+                        toIncl = true,
+                        toFractional = num.isFractional,
+                        num.unitData
+                    )
+                case LESS =>
+                    mkNotes(
+                        toks,
+                        MIN_VALUE,
+                        fromIncl = true,
+                        fromFractional = num.isFractional,
+                        to = num.value,
+                        toIncl = false,
+                        toFractional = num.isFractional,
+                        num.unitData
+                    )
+                case LESS_OR_EQUAL =>
+                    mkNotes(
+                        toks,
+                        MIN_VALUE,
+                        fromIncl = true,
+                        fromFractional = num.isFractional,
+                        to = num.value,
+                        toIncl = true,
+                        toFractional = num.isFractional,
+                        num.unitData
+                    )
+                case EQUAL =>
+                    mkNotes(
+                        toks,
+                        num.value,
+                        fromIncl = true,
+                        fromFractional = num.isFractional,
+                        to = num.value,
+                        toIncl = true,
+                        toFractional = num.isFractional,
+                        num.unitData
+                    )
+                case NOT_EQUAL =>
+                    mkNotes(
+                        toks,
+                        num.value,
+                        fromIncl = false,
+                        fromFractional = num.isFractional,
+                        to = num.value,
+                        toIncl = false,
+                        toFractional = num.isFractional,
+                        num.unitData
+                    )
+                case _ => throw new AssertionError(s"Illegal note type: ${prep.prepositionType}.")
+            }
         }
     }
@@ -448,7 +444,7 @@ ...
 
         // Numeric without conditions.
         for (num <- nums if !processed.exists(num.tokens.contains)) {
-            val notes = mkNotes(
+            mkNotes(
                 num.tokens,
                 num.value,
                 fromIncl = true,
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/stopword/NCStopWordEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/stopword/NCStopWordEnricher.scala
index a4e396f..5a9169d 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/stopword/NCStopWordEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/stopword/NCStopWordEnricher.scala
@@ -683,7 +683,6 @@ object NCStopWordEnricher extends NCServerEnricher {
             "percent"
         ).map(NCNlpCoreManager.stem)
 
-        // Stemmatization is done already by generator.
         possessiveWords = U.readTextGzipResource("stopwords/possessive_words.txt.gz", "UTF-8", logger).toSet
         firstWords = U.readTextGzipResource("stopwords/first_words.txt.gz", "UTF-8", logger).toSet
 
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensIntentsSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensIntentsSpec.scala
index 33ab3c3..c47661f 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensIntentsSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensIntentsSpec.scala
@@ -21,6 +21,9 @@ import org.apache.nlpcraft.model.{NCIntent, NCIntentMatch, NCResult}
 import org.apache.nlpcraft.{NCTestContext, NCTestEnvironment}
 import org.junit.jupiter.api.Test
 
+import java.util
+import scala.jdk.CollectionConverters.{SetHasAsJava, SetHasAsScala}
+
 class NCAbstractTokensModelIntents extends NCAbstractTokensModel {
     @NCIntent("intent=wrapAnyWordIntent term(t)={# == 'wrapAnyWord'}")
     private def onWrapInternal(ctx: NCIntentMatch): NCResult = NCResult.text("OK")
@@ -33,6 +36,9 @@ class NCAbstractTokensModelIntents extends NCAbstractTokensModel {
 
     @NCIntent("intent=wrapWrapLimit term(t1)={# == 'wrapWrapLimit'} term(t2)={# == 'wrapAnyWord'}")
     private def wrapWrapLimit(ctx: NCIntentMatch): NCResult = NCResult.text("OK")
+
+    // TODO: w1 and w2 are stopwords according to src/main/resources/stopwords/stop_words.txt
+    override def getExcludedStopWords: util.Set[String] = (Set("w1", "w2") ++ super.getExcludedStopWords.asScala).asJava
 }
 
 @NCTestEnvironment(model = classOf[NCAbstractTokensModelIntents], startClient = true)
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensModel.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensModel.scala
index 3fb8319..15700fe 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensModel.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensModel.scala
@@ -38,4 +38,6 @@ class NCAbstractTokensModel extends NCModelAdapter(
     override def getAbstractTokens: util.Set[String] = Set("nlpcraft:num", "anyWord").asJava
     override def isPermutateSynonyms: Boolean = false
     override def isSparse: Boolean = false
+
+    override def getExcludedStopWords: util.Set[String] = Set("the").asJava
 }
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/properties/NCTokensPropertiesSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/properties/NCTokensPropertiesSpec.scala
index 0dd39bf..a60d762 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/properties/NCTokensPropertiesSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/properties/NCTokensPropertiesSpec.scala
@@ -38,6 +38,8 @@ abstract class NCTokenPropertiesModelAbstract extends NCModelAdapter(
 
     override def isPermutateSynonyms: Boolean = true
     override def isSparse: Boolean = true
+
+    override def isStopWordsAllowed: Boolean = false
 }
 
 case class NCPropTestElement(
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsAllowedSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsAllowedSpec.scala
new file mode 100644
index 0000000..7ec0386
--- /dev/null
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsAllowedSpec.scala
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.model.stop
+
+import org.apache.nlpcraft.model.{NCContext, NCElement, NCModelAdapter, NCResult}
+import org.apache.nlpcraft.{NCTestContext, NCTestElement, NCTestEnvironment}
+import org.junit.jupiter.api.Test
+
+import java.util
+import scala.jdk.CollectionConverters.CollectionHasAsScala
+import scala.language.implicitConversions
+
+/**
+  *
+  */
+class NCStopWordsAllowedModelAdapter extends NCModelAdapter("nlpcraft.test", "Test Model", "1.0") {
+    override def getElements: util.Set[NCElement] = Set(NCTestElement("a", "the test"))
+}
+
+/**
+  *
+  */
+class NCStopWordsAllowedModel extends NCStopWordsAllowedModelAdapter {
+    override def isStopWordsAllowed: Boolean = true
+
+    override def onContext(ctx: NCContext): NCResult = {
+        ctx.getRequest.getNormalizedText match {
+            case "the" =>
+                // One empty variant.
+                require(ctx.getVariants.size() == 1)
+                require(ctx.getVariants.asScala.head.size() == 0)
+            // Should be processed same way.
+            case "the test" | "the the test" =>
+                // One variant.
+                require(ctx.getVariants.size() == 1)
+
+                // One token (user token)
+                require(ctx.getVariants.asScala.head.size() == 1)
+                require(ctx.getVariants.asScala.head.asScala.head.getId == "a")
+
+            case "test the the test" =>
+                // One variant.
+                require(ctx.getVariants.size() == 1)
+
+                require(ctx.getVariants.asScala.head.size() == 2)
+                require(ctx.getVariants.asScala.head.asScala.head.getId == "nlpcraft:nlp")
+                require(ctx.getVariants.asScala.head.asScala.last.getId == "a")
+
+            case _ => throw new IllegalStateException(s"Unsupported test: ${ctx.getRequest.getNormalizedText}")
+        }
+
+        NCResult.text("OK")
+    }
+}
+
+/**
+  *
+  */
+class NCStopWordsNotAllowedModel extends NCStopWordsAllowedModelAdapter {
+    override def isStopWordsAllowed: Boolean = false
+
+    override def onContext(ctx: NCContext): NCResult = {
+        ctx.getRequest.getNormalizedText match {
+            case "the" =>
+                // One variant.
+                require(ctx.getVariants.size() == 1)
+
+                // One free token (nlp)
+                require(ctx.getVariants.asScala.head.size() == 1)
+                require(ctx.getVariants.asScala.head.asScala.head.getId == "nlpcraft:nlp")
+
+            case "the test" =>
+                // One variant.
+                require(ctx.getVariants.size() == 1)
+
+                // One token (user token)
+                require(ctx.getVariants.asScala.head.size() == 1)
+                require(ctx.getVariants.asScala.head.asScala.head.getId == "a")
+
+            case "the the test" | "test the the test" =>
+                // There shouldn't be any stop words.
+                ctx.getVariants.asScala.foreach(v => require(v.getStopWordTokens.asScala.isEmpty))
+
+            case _ => throw new IllegalStateException(s"Unsupported test: ${ctx.getRequest.getNormalizedText}")
+        }
+
+        NCResult.text("OK")
+    }
+}
+
+/**
+  *
+  */
+@NCTestEnvironment(model = classOf[NCStopWordsAllowedModel], startClient = true)
+class NCStopWordsAllowedSpec extends NCTestContext {
+    @Test
+    def test(): Unit = {
+        checkResult("the", "OK")
+        checkResult("the test", "OK")
+        checkResult("the the test", "OK")
+        checkResult("test the the test", "OK")
+    }
+}
+
+/**
+  *
+  */
+@NCTestEnvironment(model = classOf[NCStopWordsNotAllowedModel], startClient = true)
+class NCStopWordsNotAllowedSpec extends NCStopWordsAllowedSpec
\ No newline at end of file
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala
index 4d5d991..bf4d6f1 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala
@@ -92,12 +92,23 @@ class NCEnricherNestedModelSpec2 extends NCEnricherNestedModelSpec1 {
                 usr(text = "test tomorrow", id = "x3"),
                 nlp(text = "xxx"),
             ),
+
             _ => checkExists(
                 "y the y",
-                usr(text = "y y", id = "y3"),
-                nlp(text = "the", isStop = true)
+                usr(text = "y y", id = "y3")
+            ),
+            _ => checkExists(
+                "y the y",
+                usr(text = "y", id = "y1"),
+                usr(text = "y", id = "y1")
             ),
             _ => checkExists(
+                "y the y",
+                usr(text = "y", id = "y2"),
+                usr(text = "y", id = "y2")
+            ),
+
+            _ => checkExists(
                 "y xxx y",
                 usr(text = "y y", id = "y3"),
                 nlp(text = "xxx")
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
index 27082f1..be643d5 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
@@ -42,6 +42,8 @@ class NCNestedTestModel41 extends NCModelAdapter("nlpcraft.nested4.test.mdl", "N
 
     override def isPermutateSynonyms: Boolean = false
     override def isSparse: Boolean = false
+
+    override def getExcludedStopWords: util.Set[String] = Set("the", "a").asJava
 }
 
 /**
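
Note for reviewers: the central model-facing change in this patch is the new isStopWordsAllowed() property on NCModelView (default true via DFLT_IS_STOPWORDS_ALLOWED). A minimal usage sketch is shown below; it is illustrative only and not part of this patch, and the model id/name are made up. It relies only on the API visible in this diff.

    import org.apache.nlpcraft.model.NCModelAdapter

    // Hypothetical model that opts out of stop word handling. With this override
    // the probe-side NCStopWordEnricher clears the "stopWord" flag on all tokens
    // instead of marking and erasing stop words (see the enrich() change above).
    class MyNoStopWordsModel extends NCModelAdapter("my.test.mdl", "My Test Model", "1.0") {
        override def isStopWordsAllowed: Boolean = false
    }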
