This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git


The following commit(s) were added to refs/heads/NLPCRAFT-443 by this push:
     new 68f9e40  WIP.
68f9e40 is described below

commit 68f9e4022a1ac39f9b17b4cd1c8ecc5748cef787
Author: Sergey Kamov <[email protected]>
AuthorDate: Sat Sep 18 15:18:05 2021 +0300

    WIP.
---
 .../apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala | 10 ++++++++++
 .../probe/mgrs/sentence/NCSentenceManager.scala        | 18 +++++++++++++++++-
 .../mgrs/nlp/enrichers/limit/NCEnricherLimitSpec.scala |  6 ------
 3 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala
index 7e306f4..45fc3a8 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala
@@ -83,6 +83,16 @@ class NCNlpSentenceNote(private val values: Map[String, JSerializable]) extends
 
     /**
       *
+      * @param n
+      */
+    def equalsWithoutIndexes(n: NCNlpSentenceNote): Boolean =
+        this.noteType == n.noteType &&
+        this.wordIndexes.size == n.wordIndexes.size &&
+        this.wordIndexes.zip(n.wordIndexes).map(p => p._1 - p._2).distinct.size == 1 &&
+        this.clone(Seq(0), Seq(0)) == n.clone(Seq(0), Seq(0))
+
+    /**
+      *
       * @return
       */
     override def toAscii: String =
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index 415e6ae..c842825 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -763,9 +763,25 @@ object NCSentenceManager extends NCService {
 
         // Drops similar sentences (with same tokens structure).
         // Among similar sentences we prefer one with minimal free words count.
-        sens.groupBy(notNlpNotes(_).map(_.getKey(withIndexes = false))).
+        sens = sens.groupBy(notNlpNotes(_).map(_.getKey(withIndexes = false))).
             map { case (_, seq) => seq.minBy(_.filter(p => p.isNlp && !p.isStopWord).map(_.wordIndexes.length).sum) }.
             toSeq
+
+        // Drops sentences if they are just subset of another.
+        // (Maybe better for lastPhase?)
+        sens = sens.filter(s1 => {
+            val notes1 = s1.tokens.flatten.distinct.filter(!_.isNlp)
+
+            !sens.exists(s2 =>
+                s1 != s2 && {
+                    val notes2 = s2.tokens.flatten.distinct.filter(!_.isNlp)
+
+                    notes2.size > notes1.size && notes1.forall(t1 => notes2.exists(_.equalsWithoutIndexes(t1)))
+                }
+            )
+        })
+
+        sens
     }
 
     override def start(parent: Span): NCService = {
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCEnricherLimitSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCEnricherLimitSpec.scala
index 503e093..3aee776 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCEnricherLimitSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCEnricherLimitSpec.scala
@@ -68,12 +68,6 @@ class NCEnricherLimitSpec extends NCEnricherBaseSpec {
                     lim(text = "handful of", limit = 5, index = 1, note = "A", asc = false),
                     usr(text = "A", id = "A"),
                     usr(text = "B", id = "B")
-                ),
-                Seq(
-                    nlp("handful"),
-                    nlp("of"),
-                    usr(text = "A", id = "A"),
-                    usr(text = "B", id = "B")
                 )
             )
         )

Reply via email to