[incubator-nlpcraft] branch NLPCRAFT-443 updated: WIP.

sergeykamov Sun, 19 Sep 2021 02:26:29 -0700

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git



The following commit(s) were added to refs/heads/NLPCRAFT-443 by this push:
     new 308b097  WIP.
308b097 is described below

commit 308b09761856852938c28f3de3f332b5df4a7e2c
Author: Sergey Kamov <[email protected]>
AuthorDate: Sun Sep 19 12:26:18 2021 +0300

    WIP.
---
 .../apache/nlpcraft/common/nlp/NCNlpSentence.scala |  7 ++--
 .../probe/mgrs/sentence/NCSentenceManager.scala    | 38 ++++++++++++----------
 2 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
index f508745..478d930 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
@@ -101,10 +101,11 @@ class NCNlpSentence(
 
     override def equals(obj: Any): Boolean = obj match {
         case x: NCNlpSentence =>
+            tokens.size == x.tokens.size &&
             tokens == x.tokens &&
-                srvReqId == x.srvReqId &&
-                text == x.text &&
-                enabledBuiltInToks == x.enabledBuiltInToks
+            srvReqId == x.srvReqId &&
+            text == x.text &&
+            enabledBuiltInToks == x.enabledBuiltInToks
 
         case _ => false
     }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index c842825..0c0288d 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -487,8 +487,9 @@ object NCSentenceManager extends NCService {
       *
       * @param ns Sentence.
       * @param notNlpTypes Token types.
+      * @param lastPhase Phase.
       */
-    private def collapseSentence(ns: NCNlpSentence, notNlpTypes: Seq[String]): Boolean = {
+    private def collapseSentence(ns: NCNlpSentence, notNlpTypes: Seq[String], lastPhase: Boolean): Boolean = {
         ns.
             filter(!_.isNlp).
             filter(_.isStopWord).
@@ -523,7 +524,8 @@ object NCSentenceManager extends NCService {
             fixIndexesReferencesList("nlpcraft:sort", "subjindexes", "subjnotes", ns, histSeq) &&
             fixIndexesReferencesList("nlpcraft:sort", "byindexes", "bynotes", ns, histSeq)
 
-        if (res) {
+        // On last phase  - just for performance reasons.
+        if (res && lastPhase) {
             // Validation (all indexes calculated well)
             require(
                 !res ||
@@ -603,7 +605,7 @@ object NCSentenceManager extends NCService {
             if (lastPhase)
                 dropAbstract(mdl, ns)
 
-            if (collapseSentence(ns, getNotNlpNotes(ns.toSeq).map(_.noteType).distinct)) Some(ns) else None
+            if (collapseSentence(ns, getNotNlpNotes(ns.toSeq).map(_.noteType).distinct, lastPhase)) Some(ns) else None
         }
 
         // Always deletes `similar` notes.
@@ -752,14 +754,16 @@ object NCSentenceManager extends NCService {
                 }
             }.toSeq
 
-        sens =
-            sens.filter(s => {
-                def mkNotNlp(s: NCNlpSentence): Set[NCNlpSentenceNote] = s.flatten.filter(!_.isNlp).toSet
+        var sensWithNotes = sens.map(s => s -> s.flatten.filter(!_.isNlp).toSet)
 
-                val notNlpNotes = mkNotNlp(s)
+        var sensWithNotesIdxs = sensWithNotes.zipWithIndex
 
-                !sens.filter(_ != s).map(mkNotNlp).exists(notNlpNotes.subsetOf)
-            })
+        sens =
+            sensWithNotesIdxs.filter { case ((_, notNlpNotes1), idx1) =>
+                !sensWithNotesIdxs.
+                    filter { case (_, idx2) => idx2 != idx1 }.
+                    exists { case((_, notNlpNotes2), _) => notNlpNotes1.subsetOf(notNlpNotes2) }
+            }.map { case ((sen, _), _) => sen }
 
         // Drops similar sentences (with same tokens structure).
         // Among similar sentences we prefer one with minimal free words count.
@@ -769,17 +773,17 @@ object NCSentenceManager extends NCService {
 
         // Drops sentences if they are just subset of another.
         // (Maybe better for lastPhase?)
-        sens = sens.filter(s1 => {
-            val notes1 = s1.tokens.flatten.distinct.filter(!_.isNlp)
+        sensWithNotes = sensWithNotes.filter { case (sen, _) => sens.contains(sen) }
 
-            !sens.exists(s2 =>
-                s1 != s2 && {
-                    val notes2 = s2.tokens.flatten.distinct.filter(!_.isNlp)
+        sensWithNotesIdxs = sensWithNotes.zipWithIndex
 
-                    notes2.size > notes1.size && notes1.forall(t1 => notes2.exists(_.equalsWithoutIndexes(t1)))
+        sens = sensWithNotesIdxs.filter { case ((s1, notNlpNotes1), idx1) =>
+            !sensWithNotesIdxs.exists { case ((s2, notNlpNotes2), idx2) =>
+                idx1 != idx2 && {
+                    notNlpNotes2.size > notNlpNotes1.size && notNlpNotes1.forall(t1 => notNlpNotes2.exists(_.equalsWithoutIndexes(t1)))
                 }
-            )
-        })
+            }
+        }.map { case ((sen, _), _) => sen }
 
         sens
     }

[incubator-nlpcraft] branch NLPCRAFT-443 updated: WIP.

Reply via email to