This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit e15b5cc6469e0d753c00931a49bcaf8b5c82ee6c
Author: Sergey Kamov <[email protected]>
AuthorDate: Tue Sep 14 16:49:51 2021 +0300

    WIP.
---
 .../nlpcraft/common/nlp/NCNlpSentenceToken.scala   |   4 +-
 .../apache/nlpcraft/model/NCModelFileAdapter.java  |   5 +
 .../org/apache/nlpcraft/model/NCModelView.java     |  11 ++
 .../nlpcraft/model/impl/json/NCModelJson.java      |   7 +
 .../apache/nlpcraft/probe/mgrs/NCProbeModel.scala  |   8 +-
 .../mgrs/nlp/enrichers/limit/NCLimitEnricher.scala |   2 +-
 .../enrichers/relation/NCRelationEnricher.scala    |   2 +-
 .../mgrs/nlp/enrichers/sort/NCSortEnricher.scala   |   2 +-
 .../enrichers/stopword/NCStopWordEnricher.scala    |  92 ++++++++++--
 .../probe/mgrs/sentence/NCSentenceManager.scala    |   3 +-
 .../nlp/enrichers/NCServerEnrichmentManager.scala  |  12 +-
 .../nlp/enrichers/numeric/NCNumericEnricher.scala  | 158 ++++++++++-----------
 .../enrichers/stopword/NCStopWordEnricher.scala    |   1 -
 .../abstract/NCAbstractTokensIntentsSpec.scala     |   6 +
 .../model/abstract/NCAbstractTokensModel.scala     |   2 +
 .../model/properties/NCTokensPropertiesSpec.scala  |   2 +
 .../model/stop/NCStopWordsAllowedSpec.scala        | 124 ++++++++++++++++
 .../model/NCEnricherNestedModelSpec.scala          |  15 +-
 .../model/NCEnricherNestedModelSpec4.scala         |   2 +
 19 files changed, 352 insertions(+), 106 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceToken.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceToken.scala
index 4b94b98..00f1dd0 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceToken.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceToken.scala
@@ -194,7 +194,9 @@ case class NCNlpSentenceToken(
      * @param reason
      */
     def addStopReason(reason: NCNlpSentenceNote): Unit = stopsReasons += reason
-
+    /**
+      *
+      */
     override def toString: String =
         notes.toSeq.sortBy(t => (if (t.isNlp) 0 else 1, t.noteType)).mkString("NLP token [", "|", "]")
 }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelFileAdapter.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelFileAdapter.java
index efa2b68..61cb84d 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelFileAdapter.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelFileAdapter.java
@@ -559,6 +559,11 @@ abstract public class NCModelFileAdapter extends NCModelAdapter {
     }
 
     @Override
+    public boolean isStopWordsAllowed() {
+        return proxy.isStopWordsAllowed();
+    }
+
+    @Override
     public Map<String, Set<String>> getRestrictedCombinations() {
         return restrictedCombinations;
     }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelView.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelView.java
index 30a2b40..19046d8 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelView.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelView.java
@@ -278,6 +278,9 @@ public interface NCModelView extends NCMetadata {
      */
     boolean DFLT_IS_NO_USER_TOKENS_ALLOWED = true;
 
+    // TODO:
+    boolean DFLT_IS_STOPWORDS_ALLOWED = true;
+
     /**
      * Default set of enabled built-in tokens. The following built-in tokens are enabled by default:
      * <ul>
@@ -1235,4 +1238,12 @@ public interface NCModelView extends NCMetadata {
     default Map<String, Set<String>> getRestrictedCombinations() {
         return Collections.emptyMap();
     }
+
+    /**
+     * TODO:
+     * @return
+     */
+    default boolean isStopWordsAllowed() {
+        return DFLT_IS_STOPWORDS_ALLOWED;
+    }
 }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/json/NCModelJson.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/json/NCModelJson.java
index f332e08..043297c 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/json/NCModelJson.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/json/NCModelJson.java
@@ -63,6 +63,7 @@ public class NCModelJson {
     private boolean maxSynonymsThresholdError = DFLT_MAX_SYNONYMS_THRESHOLD_ERROR;
     private long conversationTimeout = DFLT_CONV_TIMEOUT_MS;
     private int conversationDepth = DFLT_CONV_DEPTH;
+    private boolean isStopWordsAllowed = DFLT_IS_STOPWORDS_ALLOWED;
 
     public String getId() {
         return id;
@@ -278,4 +279,10 @@ public class NCModelJson {
         return restrictedCombinations;
     }
     public void setRestrictedCombinations(Map<String, String[]> restrictedCombinations) { this.restrictedCombinations = restrictedCombinations;}
+    public boolean isStopWordsAllowed() {
+        return isStopWordsAllowed;
+    }
+    public void setStopWordsAllowed(boolean stopWordsAllowed) {
+        isStopWordsAllowed = stopWordsAllowed;
+    }
 }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
index 75ae18b..ea41793 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
@@ -45,9 +45,13 @@ case class NCProbeModel(
     solver: NCIntentSolver,
     intents: Seq[NCIdlIntent],
     callbacks: Map[String /* Intent ID */, NCProbeModelCallback],
-    continuousSynonyms: Map[String /*Element ID*/ , Map[Int /*Synonym length*/ , NCProbeSynonymsWrapper]], // Fast access map.
+    continuousSynonyms:
+        Map[
+            String /*Element ID*/,
+            /*Fast access map.*/ Map[Int /*Synonym length*/ , NCProbeSynonymsWrapper]
+        ],
     sparseSynonyms: Map[String /*Element ID*/, Seq[NCProbeSynonym]],
-    idlSynonyms: Map[String /*Element ID*/ , Seq[NCProbeSynonym]], // Fast access map.
+    idlSynonyms: Map[String /*Element ID*/ , Seq[NCProbeSynonym]],
     addStopWordsStems: Set[String],
     exclStopWordsStems: Set[String],
     suspWordsStems: Set[String],
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
index 7bad3c5..0286db3 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
@@ -262,7 +262,7 @@ object NCLimitEnricher extends NCProbeEnricher {
 
             // Tries to grab tokens reverse way.
             // Example: A, B, C => ABC, BC, AB .. (BC will be processed first)
-            for (toks <- ns.tokenMixWithStopWords().sortBy(p => (-p.size, -p.head.index)) if validImportant(ns, toks)) {
+            for (toks <- ns.tokenMix().sortBy(p => (-p.size, -p.head.index)) if validImportant(ns, toks)) {
                 if (numsMap == null) {
                     numsMap = NCNumericManager.find(ns).map(p => p.tokens -> p).toMap
                     groupsMap = groupNums(ns, numsMap.values)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
index fa564b9..d44b4cb 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
@@ -163,7 +163,7 @@ object NCRelationEnricher extends NCProbeEnricher {
             // Example: A, B, C => ABC, AB, BC .. (AB will be processed first)
             val notes = mutable.HashSet.empty[NCNlpSentenceNote]
 
-            for (toks <- ns.tokenMixWithStopWords() if validImportant(ns, toks))
+            for (toks <- ns.tokenMix() if validImportant(ns, toks))
                 tryToMatch(toks) match {
                     case Some(m) =>
                         for (refNote <- m.refNotes if !restricted.contains(refNote)) {
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index 286c8b4..fdb6d9a 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -454,7 +454,7 @@ object NCSortEnricher extends NCProbeEnricher {
             val notes = mutable.HashSet.empty[NCNlpSentenceNote]
             val matches = mutable.ArrayBuffer.empty[Match]
 
-            for (toks <- ns.tokenMixWithStopWords() if validImportant(ns, toks)) {
+            for (toks <- ns.tokenMix() if validImportant(ns, toks)) {
                 tryToMatch(toks) match {
                     case Some(m) =>
                         if (!matches.exists(_.isSubCase(m)) && !m.intersect(restricted)) {
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/stopword/NCStopWordEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/stopword/NCStopWordEnricher.scala
index fc904d2..03e0ec9 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/stopword/NCStopWordEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/stopword/NCStopWordEnricher.scala
@@ -17,16 +17,17 @@
 
 package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.stopword
 
-import java.io.Serializable
-
 import io.opencensus.trace.Span
+import org.apache.nlpcraft.common.nlp.NCNlpSentenceToken.notes
 import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager
 import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceToken}
 import org.apache.nlpcraft.common.{NCE, NCService, U}
-import org.apache.nlpcraft.probe.mgrs.NCProbeModel
 import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
+import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeSynonymsWrapper}
+import java.io.Serializable
 
 import scala.annotation.tailrec
+import scala.collection.mutable
 
 /**
  * Stop words enricher.
@@ -215,6 +216,66 @@ object NCStopWordEnricher extends NCProbeEnricher {
         processCommonStops0(mdl, ns)
     }
 
+    /**
+      *
+      * @param mdl
+      * @param ns
+      */
+    private def eraseNlpStops(mdl: NCProbeModel, ns: NCNlpSentence): Unit = {
+        val impStops = mutable.HashSet.empty[NCNlpSentenceToken]
+
+        val allContSyns: Map[Int, Iterable[NCProbeSynonymsWrapper]] =
+            mdl.continuousSynonyms.values.flatMap(_.toSeq).groupBy(_._1).map(p => p._1 -> p._2.map(_._2))
+
+        for (toks <- ns.tokenMix(stopWords = true) if toks.exists(t => t.isStopWord && !impStops.contains(t))) {
+            allContSyns.get(toks.size) match {
+                case Some(ws) =>
+                    val stems = toks.map(_.stem).mkString(" ")
+
+                    if (ws.exists(w => w.txtDirectSynonyms.contains(stems) || w.txtNotDirectSynonyms.contains(stems)))
+                        impStops ++= toks.filter(_.isStopWord)
+
+                case None => // No-op.
+            }
+        }
+
+        val del = ns.tokens.filter(t => t.isStopWord && !impStops.contains(t))
+
+        impStops.foreach(t => ns.fixNote(t.getNlpNote, "stopWord" -> false))
+
+        if (del.nonEmpty) {
+            del.foreach(t => require(t.isNlp))
+
+            // TODO:
+            logger.info(
+                s"Stopwords deleted from sentence [" +
+                s"srvReqId=${ns.srvReqId}, " +
+                s"text=${ns.text}, " +
+                s"stopWords=${del.map(p => s"${p.origText}(index=${p.wordIndexes.head})").mkString("|")}" +
+                s"]"
+            )
+
+            val delIdxs = del.flatMap(_.wordIndexes).sorted
+
+            val old = ns.tokens.clone()
+
+            ns.tokens.clear()
+            ns.tokens ++= old.filter(t => !del.contains(t)).zipWithIndex.map { case (t, idx) => t.clone(idx) }
+
+            ns.tokens.foreach(t => {
+                val tokNotes = notes(t)
+
+                tokNotes.foreach(n => {
+                    val tokIdxs = n.tokenIndexes.map(i => i - delIdxs.count(_ < i))
+                    val wordIdxs = n.wordIndexes.map(i => i - delIdxs.count(_ < i))
+
+                    t.remove(n)
+                    t.add(n.clone(tokIdxs, wordIdxs))
+                })
+            })
+        }
+    }
+
     @throws[NCE]
     override def enrich(mdl: NCProbeModel, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = {
         require(isStarted)
@@ -225,12 +286,25 @@ object NCStopWordEnricher extends NCProbeEnricher {
         startScopedSpan(
             "enrich", parent, "srvReqId" -> ns.srvReqId, "mdlId" -> mdl.model.getId, "txt" -> ns.text
         ) { _ =>
-            mark(mdl.exclStopWordsStems, f = false)
-            mark(mdl.addStopWordsStems, f = true)
-            processGeo(ns)
-            processDate(ns)
-            processNums(ns)
-            processCommonStops(mdl, ns)
+            if (mdl.model.isStopWordsAllowed) {
+                mark(mdl.exclStopWordsStems, f = false)
+                mark(mdl.addStopWordsStems, f = true)
+
+                // If a stop word is swallowed by any built token (numeric, date, etc.), its stop word marking is dropped.
+                ns.filter(t => t.isStopWord && !t.isNlp).foreach(t => ns.fixNote(t.getNlpNote, "stopWord" -> false))
+
+                processGeo(ns)
+                processDate(ns)
+                processNums(ns)
+
+                eraseNlpStops(mdl, ns)
+
+                processCommonStops(mdl, ns)
+
+                eraseNlpStops(mdl, ns)
+            }
+            else
+                ns.filter(_.isStopWord).foreach(t => ns.fixNote(t.getNlpNote, "stopWord" -> false))
         }
     }
 }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index d5dfc1e..f6855ea 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -213,7 +213,8 @@ object NCSentenceManager extends NCService {
     private def simpleCopy(
         ns: NCNlpSentence,
         history: mutable.ArrayBuffer[(Int, Int)],
-        toksCopy: NCNlpSentence, i: Int
+        toksCopy: NCNlpSentence,
+        i: Int
     ): Seq[NCNlpSentenceToken] = {
         val tokCopy = toksCopy(i)
 
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala
index 636b263..12b21bd 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala
@@ -153,12 +153,12 @@ object NCServerEnrichmentManager extends NCService with NCIgniteInstance {
             catching(wrapIE) {
                 cache(normTxt) match {
                     case Some(h) =>
-                        if (h.enabledBuiltInTokens == normEnabledBuiltInToks) {
-                            prepareAsciiTable(h.sentence).info(logger, Some(s"Sentence enriched (from cache): '$normTxt'"))
-
-                            h.sentence
-                        }
-                        else
+//                        if (h.enabledBuiltInTokens == normEnabledBuiltInToks) {
+//                            prepareAsciiTable(h.sentence).info(logger, Some(s"Sentence enriched (from cache): '$normTxt'"))
+//
+//                            h.sentence
+//                        }
+//                        else
                             process(srvReqId, normTxt, enabledBuiltInToks, span)
                     case None =>
                         process(srvReqId, normTxt, enabledBuiltInToks, span)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/numeric/NCNumericEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/numeric/NCNumericEnricher.scala
index 670a4dc..cf39575 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/numeric/NCNumericEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/numeric/NCNumericEnricher.scala
@@ -207,7 +207,7 @@ object NCNumericEnricher extends NCServerEnricher {
         toIncl: Boolean,
         toFractional: Boolean,
         unitDataOpt: Option[NCNumericUnitData],
-    ): Seq[NCNlpSentenceNote] = {
+    ): Unit= {
         val params =
             mutable.ArrayBuffer.empty[(String, Any)] ++
             Seq(
@@ -223,7 +223,7 @@ ...
                 "isToPositiveInfinity" -> (to == MAX_VALUE)
             )
 
-        def mkAndAssign(toks: Seq[NCNlpSentenceToken], typ: String, params: (String, Any)*):NCNlpSentenceNote = {
+        def mkAndAssign(toks: Seq[NCNlpSentenceToken], params: (String, Any)*):NCNlpSentenceNote = {
            val note = NCNlpSentenceNote(toks.map(_.index), "nlpcraft:num", params:_*)
 
            toks.foreach(_.add(note))
@@ -241,17 +241,17 @@
                 }
 
                 if (unitData.tokens == toks)
-                    Seq(mkAndAssign(toks, "nlpcraft:num", extend():_*))
+                    Seq(mkAndAssign(toks, extend():_*))
                 else {
                     Seq(
                         mkAndAssign(
-                            toks.filter(t => !unitData.tokens.contains(t)), "nlpcraft:num", params.clone():_*
+                            toks.filter(t => !unitData.tokens.contains(t)), params.clone():_*
                         ),
-                        mkAndAssign(toks, "nlpcraft:num", extend():_*)
+                        mkAndAssign(toks, extend():_*)
                     )
                 }
 
-            case None => Seq(mkAndAssign(toks, "nlpcraft:num", params:_*))
+            case None => Seq(mkAndAssign(toks, params:_*))
         }
     }
 
@@ -316,7 +316,7 @@ ...
                     Some(NCNumericUnitData(num1.unitData.get.unit, num1.tokens ++ num2.tokens))
             }
 
-            val notes = p._2 match {
+            p._2 match {
                 case BETWEEN_EXCLUSIVE =>
                     mkNotes(
                         prepToks,
@@ -364,79 +364,75 @@ ...
 
             processed ++= toks
 
-            val notes =
-                prep.prepositionType match {
-                    case MORE =>
-                        mkNotes(
-                            toks,
-                            num.value,
-                            fromIncl = false,
-                            fromFractional = num.isFractional,
-                            to = MAX_VALUE,
-                            toIncl = true,
-                            toFractional = num.isFractional,
-                            num.unitData
-                        )
-                    case MORE_OR_EQUAL =>
-                        mkNotes(
-                            toks,
-                            num.value,
-                            fromIncl = true,
-                            fromFractional = num.isFractional,
-                            to = MAX_VALUE,
-                            toIncl = true,
-                            toFractional = num.isFractional,
-                            num.unitData
-                        )
-                    case LESS =>
-                        mkNotes(
-                            toks,
-                            MIN_VALUE,
-                            fromIncl = true,
-                            fromFractional = num.isFractional,
-                            to = num.value,
-                            toIncl = false,
-                            toFractional = num.isFractional,
-                            num.unitData
-                        )
-                    case LESS_OR_EQUAL =>
-                        mkNotes(
-                            toks,
-                            MIN_VALUE,
-                            fromIncl = true,
-                            fromFractional = num.isFractional,
-                            to = num.value,
-                            toIncl = true,
-                            toFractional = num.isFractional,
-                            num.unitData
-                        )
-                    case EQUAL =>
-                        mkNotes(
-                            toks,
-                            num.value,
-                            fromIncl = true,
-                            fromFractional = num.isFractional,
-                            to = num.value,
-                            toIncl = true,
-                            toFractional = num.isFractional,
-                            num.unitData
-                        )
-                    case NOT_EQUAL =>
-                        mkNotes(
-                            toks,
-                            num.value,
-                            fromIncl = false,
-                            fromFractional = num.isFractional,
-                            to = num.value,
-                            toIncl = false,
-                            toFractional = num.isFractional,
-                            num.unitData
-                        )
-                    case _ => throw new AssertionError(s"Illegal note type: ${prep.prepositionType}.")
-                }
-
-            for (note <- notes)
-                toks.foreach(_.add(note))
+            prep.prepositionType match {
+                case MORE =>
+                    mkNotes(
+                        toks,
+                        num.value,
+                        fromIncl = false,
+                        fromFractional = num.isFractional,
+                        to = MAX_VALUE,
+                        toIncl = true,
+                        toFractional = num.isFractional,
+                        num.unitData
+                    )
+                case MORE_OR_EQUAL =>
+                    mkNotes(
+                        toks,
+                        num.value,
+                        fromIncl = true,
+                        fromFractional = num.isFractional,
+                        to = MAX_VALUE,
+                        toIncl = true,
+                        toFractional = num.isFractional,
+                        num.unitData
+                    )
+                case LESS =>
+                    mkNotes(
+                        toks,
+                        MIN_VALUE,
+                        fromIncl = true,
+                        fromFractional = num.isFractional,
+                        to = num.value,
+                        toIncl = false,
+                        toFractional = num.isFractional,
+                        num.unitData
+                    )
+                case LESS_OR_EQUAL =>
+                    mkNotes(
+                        toks,
+                        MIN_VALUE,
+                        fromIncl = true,
+                        fromFractional = num.isFractional,
+                        to = num.value,
+                        toIncl = true,
+                        toFractional = num.isFractional,
+                        num.unitData
+                    )
+                case EQUAL =>
+                    mkNotes(
+                        toks,
+                        num.value,
+                        fromIncl = true,
+                        fromFractional = num.isFractional,
+                        to = num.value,
+                        toIncl = true,
+                        toFractional = num.isFractional,
+                        num.unitData
+                    )
+                case NOT_EQUAL =>
+                    mkNotes(
+                        toks,
+                        num.value,
+                        fromIncl = false,
+                        fromFractional = num.isFractional,
+                        to = num.value,
+                        toIncl = false,
+                        toFractional = num.isFractional,
+                        num.unitData
+                    )
+                case _ => throw new AssertionError(s"Illegal note type: ${prep.prepositionType}.")
+            }
         }
     }
@@ -448,7 +444,7 @@ ...
 
         // Numeric without conditions.
         for (num <- nums if !processed.exists(num.tokens.contains)) {
-            val notes = mkNotes(
+            mkNotes(
                 num.tokens,
                 num.value,
                 fromIncl = true,
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/stopword/NCStopWordEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/stopword/NCStopWordEnricher.scala
index a4e396f..5a9169d 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/stopword/NCStopWordEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/stopword/NCStopWordEnricher.scala
@@ -683,7 +683,6 @@ object NCStopWordEnricher extends NCServerEnricher {
             "percent"
         ).map(NCNlpCoreManager.stem)
 
-        // Stemmatization is done already by generator.
         possessiveWords = U.readTextGzipResource("stopwords/possessive_words.txt.gz", "UTF-8", logger).toSet
         firstWords = U.readTextGzipResource("stopwords/first_words.txt.gz", "UTF-8", logger).toSet
 
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensIntentsSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensIntentsSpec.scala
index 33ab3c3..c47661f 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensIntentsSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensIntentsSpec.scala
@@ -21,6 +21,9 @@ import org.apache.nlpcraft.model.{NCIntent, NCIntentMatch, NCResult}
 import org.apache.nlpcraft.{NCTestContext, NCTestEnvironment}
 import org.junit.jupiter.api.Test
 
+import java.util
+import scala.jdk.CollectionConverters.{SetHasAsJava, SetHasAsScala}
+
 class NCAbstractTokensModelIntents extends NCAbstractTokensModel {
     @NCIntent("intent=wrapAnyWordIntent term(t)={# == 'wrapAnyWord'}")
     private def onWrapInternal(ctx: NCIntentMatch): NCResult = NCResult.text("OK")
@@ -33,6 +36,9 @@ class NCAbstractTokensModelIntents extends NCAbstractTokensModel {
 
     @NCIntent("intent=wrapWrapLimit term(t1)={# == 'wrapWrapLimit'} term(t2)={# == 'wrapAnyWord'}")
     private def wrapWrapLimit(ctx: NCIntentMatch): NCResult = NCResult.text("OK")
+
+    // TODO: w1 and w2 are stopwords according to src/main/resources/stopwords/stop_words.txt
+    override def getExcludedStopWords: util.Set[String] = (Set("w1", "w2") ++ super.getExcludedStopWords.asScala).asJava
 }
 
 @NCTestEnvironment(model = classOf[NCAbstractTokensModelIntents], startClient = true)
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensModel.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensModel.scala
index 3fb8319..15700fe 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensModel.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensModel.scala
@@ -38,4 +38,6 @@ class NCAbstractTokensModel extends NCModelAdapter(
     override def getAbstractTokens: util.Set[String] = Set("nlpcraft:num", "anyWord").asJava
     override def isPermutateSynonyms: Boolean = false
     override def isSparse: Boolean = false
+
+    override def getExcludedStopWords: util.Set[String] = Set("the").asJava
 }
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/properties/NCTokensPropertiesSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/properties/NCTokensPropertiesSpec.scala
index 0dd39bf..a60d762 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/properties/NCTokensPropertiesSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/properties/NCTokensPropertiesSpec.scala
@@ -38,6 +38,8 @@ abstract class NCTokenPropertiesModelAbstract extends NCModelAdapter(
 
     override def isPermutateSynonyms: Boolean = true
     override def isSparse: Boolean = true
+
+    override def isStopWordsAllowed: Boolean = false
 }
 
 case class NCPropTestElement(
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsAllowedSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsAllowedSpec.scala
new file mode 100644
index 0000000..7ec0386
--- /dev/null
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsAllowedSpec.scala
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.model.stop
+
+import org.apache.nlpcraft.model.{NCContext, NCElement, NCModelAdapter, NCResult}
+import org.apache.nlpcraft.{NCTestContext, NCTestElement, NCTestEnvironment}
+import org.junit.jupiter.api.Test
+
+import java.util
+import scala.jdk.CollectionConverters.CollectionHasAsScala
+import scala.language.implicitConversions
+
+/**
+  *
+  */
+class NCStopWordsAllowedModelAdapter extends NCModelAdapter("nlpcraft.test", "Test Model", "1.0") {
+    override def getElements: util.Set[NCElement] = Set(NCTestElement("a", "the test"))
+}
+
+/**
+  *
+  */
+class NCStopWordsAllowedModel extends NCStopWordsAllowedModelAdapter {
+    override def isStopWordsAllowed: Boolean = true
+
+    override def onContext(ctx: NCContext): NCResult = {
+        ctx.getRequest.getNormalizedText match {
+            case "the" =>
+                // One empty variant.
+                require(ctx.getVariants.size() == 1)
+                require(ctx.getVariants.asScala.head.size() == 0)
+            // Should be processed same way.
+            case "the test" | "the the test" =>
+                // One variant.
+                require(ctx.getVariants.size() == 1)
+
+                // One token (user token)
+                require(ctx.getVariants.asScala.head.size() == 1)
+                require(ctx.getVariants.asScala.head.asScala.head.getId == "a")
+
+            case "test the the test" =>
+                // One variant.
+                require(ctx.getVariants.size() == 1)
+
+                require(ctx.getVariants.asScala.head.size() == 2)
+                require(ctx.getVariants.asScala.head.asScala.head.getId == "nlpcraft:nlp")
+                require(ctx.getVariants.asScala.head.asScala.last.getId == "a")
+
+            case _ => throw new IllegalStateException(s"Unsupported test: ${ctx.getRequest.getNormalizedText}")
+        }
+
+        NCResult.text("OK")
+    }
+}
+
+/**
+  *
+  */
+class NCStopWordsNotAllowedModel extends NCStopWordsAllowedModelAdapter {
+    override def isStopWordsAllowed: Boolean = false
+
+    override def onContext(ctx: NCContext): NCResult = {
+        ctx.getRequest.getNormalizedText match {
+            case "the" =>
+                // One variant.
+                require(ctx.getVariants.size() == 1)
+
+                // One free token (nlp)
+                require(ctx.getVariants.asScala.head.size() == 1)
+                require(ctx.getVariants.asScala.head.asScala.head.getId == "nlpcraft:nlp")
+
+            case "the test" =>
+                // One variant.
+                require(ctx.getVariants.size() == 1)
+
+                // One token (user token)
+                require(ctx.getVariants.asScala.head.size() == 1)
+                require(ctx.getVariants.asScala.head.asScala.head.getId == "a")
+
+            case "the the test" | "test the the test" =>
+                // There shouldn't be any stop words.
+                ctx.getVariants.asScala.foreach(v => require(v.getStopWordTokens.asScala.isEmpty))
+
+            case _ => throw new IllegalStateException(s"Unsupported test: ${ctx.getRequest.getNormalizedText}")
+        }
+
+        NCResult.text("OK")
+    }
+}
+
+/**
+  *
+  */
+@NCTestEnvironment(model = classOf[NCStopWordsAllowedModel], startClient = true)
+class NCStopWordsAllowedSpec extends NCTestContext {
+    @Test
+    def test(): Unit = {
+        checkResult("the", "OK")
+        checkResult("the test", "OK")
+        checkResult("the the test", "OK")
+        checkResult("test the the test", "OK")
+    }
+}
+
+/**
+  *
+  */
+@NCTestEnvironment(model = classOf[NCStopWordsNotAllowedModel], startClient = true)
+class NCStopWordsNotAllowedSpec extends NCStopWordsAllowedSpec
\ No newline at end of file
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala
index 4d5d991..bf4d6f1 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala
@@ -92,12 +92,23 @@ class NCEnricherNestedModelSpec2 extends NCEnricherNestedModelSpec1 {
                 usr(text = "test tomorrow", id = "x3"),
                 nlp(text = "xxx"),
             ),
+
             _ => checkExists(
                 "y the y",
-                usr(text = "y y", id = "y3"),
-                nlp(text = "the", isStop = true)
+                usr(text = "y y", id = "y3")
+            ),
+            _ => checkExists(
+                "y the y",
+                usr(text = "y", id = "y1"),
+                usr(text = "y", id = "y1")
             ),
             _ => checkExists(
+                "y the y",
+                usr(text = "y", id = "y2"),
+                usr(text = "y", id = "y2")
+            ),
+
+            _ => checkExists(
                 "y xxx y",
                 usr(text = "y y", id = "y3"),
                 nlp(text = "xxx")
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
index 27082f1..be643d5 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
@@ -42,6 +42,8 @@ class NCNestedTestModel41 extends NCModelAdapter("nlpcraft.nested4.test.mdl", "N
 
     override def isPermutateSynonyms: Boolean = false
     override def isSparse: Boolean = false
+
+    override def getExcludedStopWords: util.Set[String] = Set("the", "a").asJava
 }
 
 /**
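
Note for reviewers: the central model-facing change in this patch is the new isStopWordsAllowed() property on NCModelView (default true via DFLT_IS_STOPWORDS_ALLOWED). A minimal usage sketch is shown below; it is illustrative only and not part of this patch, and the model id/name are made up. It relies only on the API visible in this diff.

    import org.apache.nlpcraft.model.NCModelAdapter

    // Hypothetical model that opts out of stop word handling. With this override
    // the probe-side NCStopWordEnricher clears the "stopWord" flag on all tokens
    // instead of marking and erasing stop words (see the enrich() change above).
    class MyNoStopWordsModel extends NCModelAdapter("my.test.mdl", "My Test Model", "1.0") {
        override def isStopWordsAllowed: Boolean = false
    }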
