This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-287
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 46d7f4cb05fca36ab25bfa9ef68f65320c62c345
Author: Sergey Kamov <[email protected]>
AuthorDate: Tue Apr 6 15:30:52 2021 +0300

    WIP.
---
 .../mgrs/nlp/enrichers/model/NCModelEnricher.scala |  58 +++---
 .../mgrs/nlp/enrichers/sort/NCSortEnricher.scala   |   5 +-
 .../probe/mgrs/sentence/NCSentenceManager.scala    |   1 +
 .../nlp/enrichers/sort/NCEnricherSortSpec.scala    | 199 ++++++++++++++++++++-
 4 files changed, 229 insertions(+), 34 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 4464a1b..c766237 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -267,12 +267,11 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
     private def combos[T](toks: Seq[T]): Seq[Seq[T]] =
         (for (n ← toks.size until 0 by -1) yield toks.sliding(n)).flatten.map(p ⇒ p)
 
-    // TODO:
-//    /**
-//      *
-//      * @param toks
-//      * @param elemId
-//      */
+    /**
+      *
+      * @param toks
+      * @param elemId
+      */
     private def alreadyMarked(toks: Seq[NlpToken], elemId: String): Boolean = toks.forall(_.isTypeOf(elemId))
 
     /**
@@ -296,9 +295,17 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
             case (complex, kind) ⇒ if (complex.isLeft) Some(complex.left.get → kind) else None
         }
 
+    /**
+      *
+      */
     private def mkCache(): Cache =
         mutable.HashMap.empty[String, mutable.ArrayBuffer[Seq[Int]]].withDefault(_ ⇒ mutable.ArrayBuffer.empty[Seq[Int]])
 
+    /**
+      *
+      * @param tows
+      * @param ns
+      */
     private def toNlpTokens(tows: Seq[NCDslContent], ns: NCNlpSentence): Seq[NlpToken] =
         (
             tows.filter(_.isRight).map(_.right.get) ++
@@ -306,6 +313,23 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                 flatMap(w ⇒ ns.filter(t ⇒ t.wordIndexes.intersect(w.wordIndexes).nonEmpty))
         ).sortBy(_.startCharIndex)
 
+    /**
+      *
+      * @param toks
+      */
+    private def tokString(toks: Seq[NlpToken]): String = toks.map(t ⇒ (t.origText, t.index)).mkString(" ")
+
+    /**
+      * Gets synonyms sorted in descending order by their weight (already prepared),
+      * i.e. first synonym in the sequence is the most important one.
+      *
+      * @param fastMap {Element ID → {Synonym length → T}}
+      * @param elmId
+      * @param len
+      */
+    private def fastAccess[T](fastMap: Map[String, Map[Int, T]], elmId: String, len: Int): Option[T] =
+        fastMap.getOrElse(elmId, Map.empty[Int, T]).get(len)
+
     @throws[NCE]
     override def enrich(mdl: NCProbeModel, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = {
         require(isStarted)
@@ -325,31 +349,10 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
 
         def addMatch(elm: NCElement, toks: Seq[NlpToken], syn: Synonym, parts: Seq[TokType]): Unit = {
             val toksSet = toks.toSet
-            // TODO:
-            //require(!matches.exists(m ⇒ m.element.getId == elm.getId && toksSet.subsetOf(m.tokensSet)))
-
             if (!matches.exists(m ⇒ m.element.getId == elm.getId && toksSet.subsetOf(m.tokensSet)))
                 matches += ElementMatch(elm, toks, syn, parts)
         }
 
-        /**
-          * Gets synonyms sorted in descending order by their weight (already prepared),
-          * i.e. first synonym in the sequence is the most important one.
-          *
-          * @param fastMap {Element ID → {Synonym length → T}}
-          * @param elmId
-          * @param len
-          */
-        def fastAccess[T](fastMap: Map[String, Map[Int, T]], elmId: String, len: Int): Option[T] =
-            fastMap.getOrElse(elmId, Map.empty[Int, T]).get(len)
-
-        /**
-          *
-          * @param toks
-          * @return
-          */
-        def tokString(toks: Seq[NlpToken]): String = toks.map(t ⇒ (t.origText, t.index)).mkString(" ")
-
         lazy val complexesWords = ns.map(Complex(_))
         lazy val complexes: Seq[ComplexSeq] =
             NCProbeVariants.convert(ns.srvReqId, mdl, NCSentenceManager.collapse(mdl.model, ns.clone())).
@@ -606,7 +609,6 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                             ).getOrElse(throw new AssertionError(s"Custom model parser returned an invalid custom token: $w"))
                         )
 
-                    // TODO:
                     if (!alreadyMarked(matchedToks, elemId))
                         mark(
                             ns,
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index d3853da..95c123e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -177,7 +177,7 @@ object NCSortEnricher extends NCProbeEnricher {
             toks.flatten.
                 filter(!_.isNlp).
                 filter(n ⇒ n.tokenIndexes.head >= min && n.tokenIndexes.last <= max).
-                map(n ⇒ NoteData(n.noteType, n.tokenFrom to n.tokenTo)).
+                map(n ⇒ NoteData(n.noteType, n.tokenIndexes)).
                 sortBy(_.indexes.head).distinct
         }
 
@@ -324,8 +324,7 @@ object NCSortEnricher extends NCProbeEnricher {
                 forall(p ⇒ (p.isStopWord || p.stem == stemAnd) && !maskWords.contains(p.stem))
             ) {
                 // It removes duplicates (`SORT x x ORDER x x x` converts to `SORT x ORDER x`)
-                val mask = toks.map(getKeyWordType).
-                    foldLeft("")((x, y) ⇒ if (x.endsWith(y)) x else s"$x $y").trim
+                val mask = toks.map(getKeyWordType).foldLeft("")((x, y) ⇒ if (x.endsWith(y)) x else s"$x $y").trim
 
                 MASKS.get(mask) match {
                     case Some(typ) ⇒
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index fb676d0..7d011a0 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -23,6 +23,7 @@ import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
 import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken}
 import org.apache.nlpcraft.common.{NCE, NCService, U}
 import org.apache.nlpcraft.model.NCModel
+import org.apache.nlpcraft.model.impl.NCTokenLogger
 
 import java.io.{Serializable => JSerializable}
 import java.util
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
index cc03066..8f24288 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
@@ -17,15 +17,31 @@ package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.sort
 
-import org.apache.nlpcraft.NCTestEnvironment
+import org.apache.nlpcraft.{NCTestElement, NCTestEnvironment}
+import org.apache.nlpcraft.model.NCElement
 import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.NCTestSortTokenType._
-import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.{NCDefaultTestModel, NCEnricherBaseSpec, NCTestNlpToken ⇒ nlp, NCTestSortToken ⇒ srt, NCTestUserToken ⇒ usr}
+import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.{NCDefaultTestModel, NCEnricherBaseSpec, NCTestNlpToken => nlp, NCTestSortToken => srt, NCTestUserToken => usr}
 import org.junit.jupiter.api.Test
 
+import java.util
+import scala.collection.JavaConverters._
+
+class NCDefaultSpecTestModel extends NCDefaultTestModel {
+    override def getElements: util.Set[NCElement] = {
+        (
+            super.getElements.asScala ++
+                Set(NCTestElement("wrapperA", "^^{tok_id() == 'A'}^^ ^^{tok_id() == 'A'}^^ ^^{tok_id() == 'A'}^^"))
+        ).asJava
+    }
+
+    override def isPermutateSynonyms: Boolean = true
+    override def isSparse: Boolean = true
+}
+
 /**
   * Sort enricher test.
   */
-@NCTestEnvironment(model = classOf[NCDefaultTestModel], startClient = true)
+@NCTestEnvironment(model = classOf[NCDefaultSpecTestModel], startClient = true)
 class NCEnricherSortSpec extends NCEnricherBaseSpec {
     /**
       *
@@ -204,6 +220,183 @@ class NCEnricherSortSpec extends NCEnricherBaseSpec {
                 nlp(text = ",", isStop = true),
                 usr(text = "B", id = "B"),
                 nlp(text = ", asc", isStop = true)
+            ),
+            _ ⇒ checkExists(
+                "sort A",
+                srt(text = "sort", typ = SUBJ_ONLY, note = "A", index = 1),
+                usr("A", "A")
+            ),
+            _ ⇒ checkExists(
+                "sort A by A",
+                srt(text = "sort", subjNote = "A", subjIndex = 1, byNote = "A", byIndex = 3),
+                usr(text = "A", id = "A"),
+                nlp(text = "by", isStop = true),
+                usr(text = "A", id = "A")
+            ),
+            _ ⇒ checkExists(
+                "sort A, C by A, C",
+                srt(text = "sort", subjNotes = Seq("A", "C"), subjIndexes = Seq(1, 3), byNotes = Seq("A", "C"), byIndexes = Seq(5, 7)),
+                usr(text = "A", id = "A"),
+                nlp(text = ",", isStop = true),
+                usr(text = "C", id = "C"),
+                nlp(text = "by", isStop = true),
+                usr(text = "A", id = "A"),
+                nlp(text = ",", isStop = true),
+                usr(text = "C", id = "C")
+            ),
+            _ ⇒ checkExists(
+                "sort A C by A C",
+                srt(text = "sort", subjNotes = Seq("A", "C"), subjIndexes = Seq(1, 2), byNotes = Seq("A", "C"), byIndexes = Seq(4, 5)),
+                usr(text = "A", id = "A"),
+                usr(text = "C", id = "C"),
+                nlp(text = "by", isStop = true),
+                usr(text = "A", id = "A"),
+                usr(text = "C", id = "C")
+            ),
+            _ ⇒ checkExists(
+                "sort A B by A B",
+                srt(text = "sort", subjNotes = Seq("A", "B"), subjIndexes = Seq(1, 2), byNotes = Seq("A", "B"), byIndexes = Seq(4, 5)),
+                usr(text = "A", id = "A"),
+                usr(text = "B", id = "B"),
+                nlp(text = "by", isStop = true),
+                usr(text = "A", id = "A"),
+                usr(text = "B", id = "B")
+            ),
+            _ ⇒ checkExists(
+                "sort A B by A B",
+                srt(text = "sort", subjNote = "AB", subjIndex = 1, byNote = "AB", byIndex = 3),
+                usr(text = "A B", id = "AB"),
+                nlp(text = "by", isStop = true),
+                usr(text = "A B", id = "AB")
+            ),
+            _ ⇒ checkExists(
+                "A classify",
+                usr(text = "A", id = "A"),
+                srt(text = "classify", typ = SUBJ_ONLY, note = "A", index = 0)
+            ),
+            _ ⇒ checkExists(
+                "the A the classify",
+                nlp(text = "the", isStop = true),
+                usr(text = "A", id = "A"),
+                nlp(text = "the", isStop = true),
+                srt(text = "classify", typ = SUBJ_ONLY, note = "A", index = 1)
+            ),
+            _ ⇒ checkExists(
+                "segment A by top down",
+                srt(text = "segment", typ = SUBJ_ONLY, note = "A", index = 1, asc = false),
+                usr(text = "A", id = "A"),
+                nlp(text = "by top down", isStop = true)
+            ),
+            _ ⇒ checkExists(
+                "segment A in bottom up order",
+                srt(text = "segment", typ = SUBJ_ONLY, note = "A", index = 1, asc = true),
+                usr(text = "A", id = "A"),
+                nlp(text = "in bottom up order", isStop = true)
+            ),
+            // `by` is redundant word here
+            _ ⇒ checkExists(
+                "segment A by in bottom up order",
+                srt(text = "segment", typ = SUBJ_ONLY, note = "A", index = 1),
+                usr(text = "A", id = "A"),
+                nlp(text = "by"),
+                nlp(text = "in"),
+                nlp(text = "bottom"),
+                nlp(text = "up"),
+                nlp(text = "order")
+            ),
+            _ ⇒ checkExists(
+                "the segment the A the in bottom up the order the",
+                nlp(text = "the", isStop = true),
+                srt(text = "segment", typ = SUBJ_ONLY, note = "A", index = 3, asc = true),
+                nlp(text = "the", isStop = true),
+                usr(text = "A", id = "A"),
+                nlp(text = "the in bottom up the order the", isStop = true)
+            ),
+            _ ⇒ checkExists(
+                "the segment the A the by bottom up the order the",
+                nlp(text = "the", isStop = true),
+                srt(text = "segment", typ = SUBJ_ONLY, note = "A", index = 3, asc = true),
+                nlp(text = "the", isStop = true),
+                usr(text = "A", id = "A"),
+                nlp(text = "the by bottom up the order the", isStop = true)
+            ),
+            _ ⇒ checkExists(
+                "A classify",
+                usr(text = "A", id = "A"),
+                srt(text = "classify", typ = SUBJ_ONLY, note = "A", index = 0)
+            ),
+            _ ⇒ checkAll(
+                "A B classify",
+                Seq(
+                    usr(text = "A B", id = "AB"),
+                    srt(text = "classify", typ = SUBJ_ONLY, note = "AB", index = 0)
+                ),
+                Seq(
+                    usr(text = "A", id = "A"),
+                    usr(text = "B", id = "B"),
+                    srt(text = "classify", subjNotes = Seq("A", "B"), subjIndexes = Seq(0, 1))
+                ),
+                Seq(
+                    usr(text = "A", id = "A"),
+                    usr(text = "B", id = "B"),
+                    srt(text = "classify", subjNotes = Seq("B"), subjIndexes = Seq(1))
+                )
+            ),
+            _ ⇒ checkAll(
+                "D classify",
+                Seq(
+                    usr(text = "D", id = "D1"),
+                    srt(text = "classify", typ = SUBJ_ONLY, note = "D1", index = 0)
+                ),
+                Seq(
+                    usr(text = "D", id = "D2"),
+                    srt(text = "classify", typ = SUBJ_ONLY, note = "D2", index = 0)
+                )
+            ),
+            _ ⇒ checkAll(
+                "sort by A",
+                Seq(
+                    srt(text = "sort by", typ = BY_ONLY, note = "A", index = 1),
+                    usr(text = "A", id = "A")
+                )
+            ),
+            _ ⇒ checkExists(
+                "organize by A, B top down",
+                srt(text = "organize by", byNotes = Seq("A", "B"), byIndexes = Seq(1, 3), asc = Some(false)),
+                usr(text = "A", id = "A"),
+                nlp(text = ",", isStop = true),
+                usr(text = "B", id = "B"),
+                nlp(text = "top down", isStop = true)
+            ),
+            _ ⇒ checkExists(
+                "organize by A, B from bottom up order",
+                srt(text = "organize by", byNotes = Seq("A", "B"), byIndexes = Seq(1, 3), asc = Some(true)),
+                usr(text = "A", id = "A"),
+                nlp(text = ",", isStop = true),
+                usr(text = "B", id = "B"),
+                nlp(text = "from bottom up order", isStop = true)
+            ),
+            _ ⇒ checkExists(
+                "organize by A, B the descending",
+                srt(text = "organize by", byNotes = Seq("A", "B"), byIndexes = Seq(1, 3), asc = Some(false)),
+                usr(text = "A", id = "A"),
+                nlp(text = ",", isStop = true),
+                usr(text = "B", id = "B"),
+                nlp(text = "the descending", isStop = true)
+            ),
+            _ ⇒ checkExists(
+                "organize by A, B, asc",
+                srt(text = "organize by", byNotes = Seq("A", "B"), byIndexes = Seq(1, 3), asc = Some(true)),
+                usr(text = "A", id = "A"),
+                nlp(text = ",", isStop = true),
+                usr(text = "B", id = "B"),
+                nlp(text = ", asc", isStop = true)
+            ),
+            _ ⇒ checkExists(
+                "sort A the A the A",
+                srt(text = "sort", typ = SUBJ_ONLY, note = "wrapperA", index = 1),
+                usr("A A A", "wrapperA"),
+                nlp("the the", isStop = true)
             )
         )
 }
