This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit f471a8be8f5210d4d1c1ad9905e3021fc04673d6
Author: Sergey Kamov <[email protected]>
AuthorDate: Sat Sep 18 12:00:24 2021 +0300

    WIP.
---
 .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 135 ++++-----
 .../mgrs/nlp/enrichers/sort/NCSortEnricher.scala   | 189 ++++++++++-----------
 .../probe/mgrs/sentence/NCSentenceManager.scala    |  11 +-
 3 files changed, 130 insertions(+), 205 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index d83ab05..f8457e8 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -19,6 +19,7 @@ package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model
 
 import io.opencensus.trace.Span
 import org.apache.nlpcraft.common._
+import org.apache.nlpcraft.common.nlp.NCNlpSentence.NoteLink
 import org.apache.nlpcraft.common.nlp.{NCNlpSentence => Sentence, NCNlpSentenceNote => NlpNote, NCNlpSentenceToken => NlpToken}
 import org.apache.nlpcraft.model._
 import org.apache.nlpcraft.model.impl.NCTokenImpl
@@ -330,7 +331,6 @@ object NCModelEnricher extends NCProbeEnricher {
         val bigSlides = slides.filter(_.size > 2)
 
         var stops4Delete: Seq[Seq[NlpToken]] =
-
             if (bigSlides.nonEmpty) {
                 val allBig = bigSlides.flatMap(p => p)
                 val stops4AllCombs = stops.filter(p => !allBig.contains(p))
@@ -534,8 +534,6 @@ object NCModelEnricher extends NCProbeEnricher {
     override def enrich(mdl: NCProbeModel, ns: Sentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = {
         require(isStarted)
 
-        //logger.info("ENRICH111")
-
         startScopedSpan(
             "enrich", parent, "srvReqId" -> ns.srvReqId, "mdlId" -> mdl.model.getId, "txt" -> ns.text
         ) { span =>
@@ -547,17 +545,16 @@ object NCModelEnricher extends NCProbeEnricher {
 //
 //            logger.info("ns.flatten.flatten.size="+ns.tokens.flatten.distinct.count(!_.isNlp))
 //
-//            ns.tokens.flatten.filter(!_.isNlp).distinct.foreach(n => {
+//            ns.tokens.flatten.filter(!_.isNlp).distinct.sortBy(p => (p.noteType, -p.tokenIndexes.size)).foreach(n => {
 //                val parts =
 //                    n.get("parts") match {
 //                        case Some(v) =>
 //                            val parts = v.asInstanceOf[java.util.List[NCTokenPartKey]].asScala
 //
-//                            "all parts=" + parts.size + " " +
 //                            parts.map(p => {
 //                                val ref = ns.tokens.find(t => t.startCharIndex == p.from && t.endCharIndex == p.to).get
 //
-//                                "part=" + p.id + " (" + ref.index + "), text=" + ref.origText
+//                                "part=" + p.id + " (idx=" + ref.index + "), text=" + ref.origText
 //                            }).mkString(" | ")
 //                        case None => "NO"
 //                    }
@@ -588,11 +585,6 @@ object NCModelEnricher extends NCProbeEnricher {
                             !greedy ||
                             !contCache(eId).exists(_.containsSlice(idxs)) && !alreadyMarked(ns, eId, toks, idxs)
                         ) {
-//                            println("!!!toks="+toks.map(_.origText).mkString(" "))
-//                            println("!!!toksExt="+toksExt.map(_.origText).mkString(" "))
-//                            println()
-
-
                             // 1. SIMPLE.
                            if (simpleEnabled && (if (idlEnabled) mdl.hasIdlSynonyms(eId) else !mdl.hasIdlSynonyms(eId))) {
                                lazy val tokStems = toks.map(_.stem).mkString(" ")
@@ -699,7 +691,38 @@ object NCModelEnricher extends NCProbeEnricher {
 
             processParsers(mdl, ns, span, req)
         }
 
-        //logger.info("ENRICH222")
+        normalize(ns)
+    }
+
+    /**
+      *
+      * @param ns
+      */
+    private def normalize(ns: Sentence): Unit = {
+        val usrNotes = ns.flatten.filter(_.isUser).distinct
+        val links = NCSentenceManager.getLinks(usrNotes)
+        val parts = NCSentenceManager.getPartKeys(usrNotes: _*)
+
+        usrNotes.
+            filter(n => !links.contains(NoteLink(n.noteType, n.tokenIndexes.sorted))).
+            filter(n => !parts.contains(NCTokenPartKey(n, ns))).
+            foreach(n => {
+                val hasBetter =
+                    usrNotes.exists(candidate =>
+                        candidate != n &&
+                        candidate.noteType == n.noteType &&
+                        candidate.dataOpt("parts") == n.dataOpt("parts") &&
+                        candidate.wordIndexes.toSet.subsetOf(n.wordIndexes.toSet) &&
+                        n.wordIndexes.filter(n => !candidate.wordIndexes.contains(n)).
+                            forall(wordIdx => ns.tokens.exists(t => t.wordIndexes.contains(wordIdx) && t.isStopWord)))
+
+                if (hasBetter) {
+                    ns.removeNote(n)
+
+                    // TODO: trace.
+                    logger.info(s"Element removed: ${n}")
+                }
+            })
     }
 
     // TODO: simplify, add tests, check model properties (sparse etc) for optimization.
@@ -730,91 +753,3 @@ object NCModelEnricher extends NCProbeEnricher {
         ))
     }
 }
-
-object x extends App {
-    case class T(index: Int, isStopWord: Boolean = false) {
-        override def toString: String = index.toString
-    }
-
-    private def combosTokens1(toks: Seq[T]): Seq[(Seq[T], Seq[T])] =
-        combos(toks).flatMap(combo => {
-            val stops = combo.filter(_.isStopWord)
-
-            val stops4Delete = Range.inclusive(1, stops.size).flatMap(stops.combinations)
-
-            (Seq(combo) ++ stops4Delete.map(del => combo.filter(t => !del.contains(t)))).map(_ -> combo)
-        }).
-            toMap.
-            filter(_._1.nonEmpty).
-            groupBy(_._1).
-            map(p => p._1 -> p._2.values.minBy(p => (-p.size, p.head.index))).
-            sortBy(p => (-p._2.size, -p._1.size, -p._2.head.index, -p._1.head.index))
-
-    private def combos[T](toks: Seq[T]): Seq[Seq[T]] = {
-        val x = (for (n <- toks.size until 0 by -1) yield toks.sliding(n)).flatten.map(p => p)
-
-        println("size=" + x.size)
-
-        x
-    }
-
-    private def combosTokens(toks: Seq[T]): Seq[(Seq[T], Seq[T])] = {
-        val value = combos(toks)
-
-        value.flatMap(combo => {
-            val stops = combo.filter(_.isStopWord)
-
-            val slides = mutable.ArrayBuffer.empty[mutable.ArrayBuffer[T]]
-
-            for (stop <- stops)
-                if (slides.nonEmpty && slides.last.last.index + 1 == stop.index)
-                    slides.last += stop
-                else
-                    slides += mutable.ArrayBuffer.empty :+ stop
-
-            val bigSlides = slides.filter(_.size >= 3)
-
-            var stops4Delete: Seq[Seq[T]] =
-                if (bigSlides.nonEmpty) {
-                    val allBig = bigSlides.flatten
-                    val stops4AllCombs = stops.filter(p => !allBig.contains(p))
-
-                    if (stops4AllCombs.nonEmpty)
-                        for (
-                            seq1 <- Range.inclusive(0, stops4AllCombs.size).flatMap(stops4AllCombs.combinations);
-                            seq2 <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations)
-                        )
-                            yield seq1 ++ seq2.flatMap(p => p)
-                    else
-                        for (seq <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations))
-                            yield seq.flatMap(p => p)
-                }
-                else
-                    Range.inclusive(1, stops.size).flatMap(stops.combinations)
-
-            stops4Delete = stops4Delete.filter(seq => !seq.contains(combo.head) && !seq.contains(combo.last))
-
-            (Seq(combo) ++ stops4Delete.map(del => combo.filter(t => !del.contains(t)))).map(_ -> combo).distinct
-        }).
-            filter(_._1.nonEmpty).
-            groupBy(_._1).
-            map(p => p._1 -> p._2.map(_._2).minBy(p => (-p.size, p.head.index))).
-            sortBy { case (data, combo) => (-combo.size, -data.size, combo.head.index, data.head.index) }
-    }
-
-    def go(): Unit = {
-        val combs = combosTokens(
-//            Seq(
-//                T(0), T(2, true), T(3, true), T(4, true), T(5), T(6), T(7, true), T(8, true), T(9), T(10, true), T(11, true), T(12)
-//            )
-
-            Range.inclusive(0, 12).map(T(_, true))
-        )
-
-        println("All=" + combs.size)
-
-        combs.foreach { case (p1, p2) => println(p1.mkString("|") + " : " + p2.mkString("|")) }
-    }
-
-    go()
-}
\ No newline at end of file
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index 286c8b4..1e31ab0 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -17,7 +17,6 @@
 
 package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.sort
 
-import java.io.Serializable
 import io.opencensus.trace.Span
 import org.apache.nlpcraft.common.NCService
 import org.apache.nlpcraft.common.makro.NCMacroParser
@@ -26,6 +25,7 @@ import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSe
 import org.apache.nlpcraft.probe.mgrs.NCProbeModel
 import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
 
+import java.io.Serializable
 import java.util.{List => JList}
 import scala.collection.mutable
 import scala.jdk.CollectionConverters._
@@ -187,59 +187,50 @@ object NCSortEnricher extends NCProbeEnricher {
      *
      * @param toksNoteData
      */
-    private def split(toks: Seq[NCNlpSentenceToken], othersRefs: Seq[NCNlpSentenceToken], toksNoteData: Seq[NoteData], nullable: Boolean): Seq[Seq[NoteData]] = {
-        val res =
-            if (toksNoteData.nonEmpty) {
-                val res = mutable.ArrayBuffer.empty[Seq[NoteData]]
-
-                /**
-                  * Returns flag which indicates are token contiguous or not.
-                  *
-                  * @param tok1Idx First token index.
-                  * @param tok2Idx Second token index.
-                  */
-                def contiguous(tok1Idx: Int, tok2Idx: Int): Boolean = {
-                    val between = toks.filter(t => t.index > tok1Idx && t.index < tok2Idx)
-
-                    between.isEmpty || between.forall(p => p.isStopWord || p.stem == stemAnd)
-                }
+    private def split(
+        toks: Seq[NCNlpSentenceToken],
+        othersRefs: Seq[NCNlpSentenceToken],
+        toksNoteData: Seq[NoteData]
+    ): Seq[Seq[NoteData]] =
+        if (toksNoteData.nonEmpty) {
+            val res = mutable.ArrayBuffer.empty[Seq[NoteData]]
+
+            /**
+              * Returns flag which indicates are token contiguous or not.
+              *
+              * @param tok1Idx First token index.
+              * @param tok2Idx Second token index.
+              */
+            def contiguous(tok1Idx: Int, tok2Idx: Int): Boolean = {
+                val between = toks.filter(t => t.index > tok1Idx && t.index < tok2Idx)
+
+                between.isEmpty || between.forall(p => p.isStopWord || p.stem == stemAnd)
+            }
 
-                val toks2 = toks.filter(othersRefs.contains)
+            val toks2 = toks.filter(othersRefs.contains)
 
-                val minIdx = toks2.dropWhile(t => !isUserNotValue(t)).head.index
-                val maxIdx = toks2.reverse.dropWhile(t => !isUserNotValue(t)).head.index
+            val minIdx = toks2.dropWhile(t => !isUserNotValue(t)).head.index
+            val maxIdx = toks2.reverse.dropWhile(t => !isUserNotValue(t)).head.index
 
-                require(minIdx <= maxIdx)
+            require(minIdx <= maxIdx)
 
-                def fill(nd: NoteData, seq: mutable.ArrayBuffer[NoteData] = mutable.ArrayBuffer.empty[NoteData]): Unit = {
-                    seq += nd
+            def fill(nd: NoteData, seq: mutable.ArrayBuffer[NoteData] = mutable.ArrayBuffer.empty[NoteData]): Unit = {
+                seq += nd
 
-                    toksNoteData.
-                        filter(p => nd.indexes.last < p.indexes.head && contiguous(nd.indexes.last, p.indexes.head)).
-                        foreach(fill(_, mutable.ArrayBuffer.empty[NoteData] ++ seq.clone()))
+                toksNoteData.
+                    filter(p => nd.indexes.last < p.indexes.head && contiguous(nd.indexes.last, p.indexes.head)).
+                    foreach(fill(_, mutable.ArrayBuffer.empty[NoteData] ++ seq.clone()))
 
-                    if (seq.nonEmpty && seq.head.indexes.head == minIdx && seq.last.indexes.last == maxIdx)
-                        res += seq
-                }
+                if (seq.nonEmpty && seq.head.indexes.head == minIdx && seq.last.indexes.last == maxIdx)
+                    res += seq
+            }
 
-                toksNoteData.filter(_.indexes.head == minIdx).foreach(p => fill(p))
+            toksNoteData.filter(_.indexes.head == minIdx).foreach(p => fill(p))
 
-                res
-            }
-            else
-                Seq.empty
-
-        if (res.isEmpty && !nullable)
-            throw new AssertionError(s"Invalid empty result " +
-                s"[tokensTexts=[${toks.map(_.origText).mkString("|")}]" +
-                s", notes=[${toks.flatten.map(n => s"${n.noteType}:[${n.tokenIndexes.mkString(",")}]").mkString("|")}]" +
-                s", tokensIndexes=[${toks.map(_.index).mkString("|")}]" +
-                s", allData=[${toksNoteData.mkString("|")}]" +
-                s"]"
-            )
-
-        res.toSeq
-    }
+            res
+        }
+        else
+            Seq.empty
 
     /**
      *
@@ -346,72 +337,78 @@ object NCSortEnricher extends NCProbeEnricher {
 
             if (data1.nonEmpty || data2.nonEmpty) {
                 val seq1 =
                     if (data1.nonEmpty)
-                        split(part1, othersRefs, data1, nullable = false)
-                    else
-                        split(part2, othersRefs, data2, nullable = false)
-                val seq2 =
-                    if (data1.nonEmpty && data2.nonEmpty)
-                        split(part2, othersRefs, data2, nullable = true)
+                        split(part1, othersRefs, data1)
                     else
-                        Seq.empty
-                val asc = orderOpt.flatMap(o => Some(order(o.synonymIndex)._2))
-
-                typ match {
-                    case TYPE_SUBJ =>
-                        require(seq1.nonEmpty)
-                        require(seq2.isEmpty)
-                        require(sortToks.nonEmpty)
-
-                        // Ignores invalid cases.
-                        if (byToks.isEmpty)
-                            res =
-                                Some(
+                        split(part2, othersRefs, data2)
+
+                if (seq1.nonEmpty) {
+                    val seq2 =
+                        if (data1.nonEmpty && data2.nonEmpty)
+                            split(part2, othersRefs, data2)
+                        else
+                            Seq.empty
+
+                    val asc = orderOpt.flatMap(o => Some(order(o.synonymIndex)._2))
+
+                    typ match {
+                        case TYPE_SUBJ =>
+                            require(seq1.nonEmpty)
+                            require(seq2.isEmpty)
+                            require(sortToks.nonEmpty)
+
+                            // Ignores invalid cases.
+                            if (byToks.isEmpty)
+                                res =
+                                    Some(
+                                        Match(
+                                            asc = asc,
+                                            main = sortToks,
+                                            stop = orderToks,
+                                            subjSeq = seq1,
+                                            bySeq = Seq.empty
+                                        )
+                                    )
+
+                        case TYPE_SUBJ_BY =>
+                            require(seq1.nonEmpty)
+                            require(sortToks.nonEmpty)
+                            require(byToks.nonEmpty)
+
+                            if (seq2.isEmpty)
+                                res = None
+                            else
                                 res = Some(
                                     Match(
                                         asc = asc,
                                         main = sortToks,
-                                        stop = orderToks,
+                                        stop = byToks ++ orderToks,
                                         subjSeq = seq1,
-                                        bySeq = Seq.empty
+                                        bySeq = seq2
                                     )
                                 )
 
-                    case TYPE_SUBJ_BY =>
-                        require(seq1.nonEmpty)
-                        require(sortToks.nonEmpty)
-                        require(byToks.nonEmpty)
+                        case TYPE_BY =>
+                            require(seq1.nonEmpty)
+                            require(seq2.isEmpty)
+                            require(sortToks.nonEmpty)
+                            require(byToks.nonEmpty)
 
-                        if (seq2.isEmpty)
-                            res = None
-                        else
+                            // `Sort by` as one element, see validation.
                             res = Some(
                                 Match(
                                     asc = asc,
-                                    main = sortToks,
-                                    stop = byToks ++ orderToks,
-                                    subjSeq = seq1,
-                                    bySeq = seq2
+                                    main = sortToks ++ byToks,
+                                    stop = orderToks,
+                                    subjSeq = Seq.empty,
+                                    bySeq = seq1
                                 )
                             )
 
-                    case TYPE_BY =>
-                        require(seq1.nonEmpty)
-                        require(seq2.isEmpty)
-                        require(sortToks.nonEmpty)
-                        require(byToks.nonEmpty)
-
-                        // `Sort by` as one element, see validation.
-                        res = Some(
-                            Match(
-                                asc = asc,
-                                main = sortToks ++ byToks,
-                                stop = orderToks,
-                                subjSeq = Seq.empty,
-                                bySeq = seq1
-                            )
-                        )
-
-                    case _ => throw new AssertionError(s"Unexpected type: $typ")
+                        case _ => throw new AssertionError(s"Unexpected type: $typ")
+                    }
                 }
+                else
+                    None
             }
         case None => // No-op.
     }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index 41fc484..d85c9d6 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -48,7 +48,7 @@ object NCSentenceManager extends NCService {
      *
      * @param notes
      */
-    private def getLinks(notes: Seq[NCNlpSentenceNote]): Seq[NoteLink] = {
+    def getLinks(notes: Seq[NCNlpSentenceNote]): Seq[NoteLink] = {
         val noteLinks = mutable.ArrayBuffer.empty[NoteLink]
 
         for (n <- notes.filter(n => n.noteType == "nlpcraft:limit" || n.noteType == "nlpcraft:references"))
@@ -79,7 +79,7 @@ object NCSentenceManager extends NCService {
      *
      * @param notes
      */
-    private def getPartKeys(notes: NCNlpSentenceNote*): Seq[NCTokenPartKey] =
+    def getPartKeys(notes: NCNlpSentenceNote*): Seq[NCTokenPartKey] =
         notes.
             filter(_.isUser).
             flatMap(n => {
@@ -677,15 +677,9 @@ object NCSentenceManager extends NCService {
             map { case (_, seq) => seq.map { case (_, note) => note }.toSet }.
             toSeq.sortBy(-_.size)
 
-//        println("!!!!!toksByIdx.size="+toksByIdx.size)
-//        println("!!!!!toksByIdx.ALL-sizes-sum="+toksByIdx.map(_.size).sum)
-//        println("!!!!!toksByIdx.all-sized="+toksByIdx.map(_.size))
-
         def findCombinations(): Seq[Seq[NCNlpSentenceNote]] = {
             val res = NCSentenceHelper.findCombinations(toksByIdx.map(_.asJava).asJava, pool).asScala.map(_.asScala.toSeq)
 
-//            println("!!! combinations=" + res.size)
-
             res
         }
@@ -742,7 +736,6 @@ object NCSentenceManager extends NCService {
                 )
             )
 
-
         def notNlpNotes(s: NCNlpSentence): Seq[NCNlpSentenceNote] = s.flatten.filter(!_.isNlp)
 
         // Drops similar sentences (with same notes structure). Keeps with more found.
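Note on the new normalize(ns) pass in NCModelEnricher: it drops a user-defined note when another note of the same type (and with the same "parts" data) covers a subset of its word indexes and every word it gives up is a stopword, skipping notes referenced by links or part keys. A standalone sketch of just that subsumption rule; `Note` and `isStop` are hypothetical stand-ins for NCNlpSentenceNote and the sentence's stopword lookup, and the links/parts guards are omitted:

    // Sketch of the subsumption predicate behind normalize(ns).
    object NormalizeSketch extends App {
        case class Note(noteType: String, wordIndexes: Set[Int])

        val isStop: Int => Boolean = Set(2, 3) // Assume words 2 and 3 are stopwords.

        // A note is redundant if a same-type candidate marks a subset of its
        // words and every word the candidate drops is a stopword.
        def hasBetter(n: Note, all: Seq[Note]): Boolean =
            all.exists(c =>
                c != n &&
                c.noteType == n.noteType &&
                c.wordIndexes.subsetOf(n.wordIndexes) &&
                (n.wordIndexes -- c.wordIndexes).forall(isStop))

        val notes = Seq(
            Note("myElem", Set(1, 2, 3, 4)), // Spans the stopwords 2 and 3.
            Note("myElem", Set(1, 4))        // Same element without them.
        )

        // Keeps only Note(myElem,Set(1, 4)); the wider note is subsumed.
        notes.filter(n => !hasBetter(n, notes)).foreach(println)
    }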
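The deleted `object x` prototype exercised the enumeration that combosTokens performs: every contiguous token window plus variants of each window with inner stopwords removed, with contiguous stopword runs ("slides") handled as units. A compact sketch of the basic idea under the same T token type; it deletes plain stopword combinations and leaves out the slide optimization for runs of 3+ stopwords:

    // Sketch of the window-plus-stopword-deletion enumeration from combosTokens.
    object CombosSketch extends App {
        case class T(index: Int, isStopWord: Boolean = false) {
            override def toString: String = index.toString
        }

        // All contiguous windows, longest first (as in combos()).
        def windows(toks: Seq[T]): Seq[Seq[T]] =
            (toks.size until 0 by -1).flatMap(n => toks.sliding(n).toSeq)

        // A window plus every variant obtained by deleting a non-empty
        // combination of its stopwords, keeping the edge tokens intact.
        def variants(combo: Seq[T]): Seq[Seq[T]] = {
            val stops = combo.filter(_.isStopWord)
            val dels = (1 to stops.size).flatMap(stops.combinations).
                filter(del => !del.contains(combo.head) && !del.contains(combo.last))

            combo +: dels.map(del => combo.filterNot(del.contains))
        }

        val toks = Seq(T(0), T(1, isStopWord = true), T(2, isStopWord = true), T(3))

        windows(toks).flatMap(variants).distinct.foreach(v => println(v.mkString("|")))
    }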
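The split() refactor in NCSortEnricher also changes its error contract: the nullable flag is gone and an empty result no longer throws an AssertionError; the caller now guards with seq1.nonEmpty and simply builds no Match. A toy sketch of that contract change, where splitOld/splitNew are illustrative stand-ins rather than the real signatures:

    // Before: split(..., nullable = false) threw on an empty result.
    // After: split(...) may return Seq.empty and the caller bails out.
    object SplitContractSketch extends App {
        def splitOld[A](data: Seq[A], nullable: Boolean): Seq[A] =
            if (data.nonEmpty) data
            else if (nullable) Seq.empty
            else throw new AssertionError("Invalid empty result")

        def splitNew[A](data: Seq[A]): Seq[A] = data // May legitimately be empty.

        val seq1 = splitNew(Seq.empty[Int])

        // Caller-side guard replaces the assertion (mirrors `if (seq1.nonEmpty)`).
        val res = if (seq1.nonEmpty) Some(seq1) else None

        println(res) // None: no Match is built instead of an AssertionError.
    }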
