This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-443 by this push:
new 68f9e40 WIP.
68f9e40 is described below
commit 68f9e4022a1ac39f9b17b4cd1c8ecc5748cef787
Author: Sergey Kamov <[email protected]>
AuthorDate: Sat Sep 18 15:18:05 2021 +0300
WIP.
---
.../apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala | 10 ++++++++++
.../probe/mgrs/sentence/NCSentenceManager.scala | 18 +++++++++++++++++-
.../mgrs/nlp/enrichers/limit/NCEnricherLimitSpec.scala | 6 ------
3 files changed, 27 insertions(+), 7 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala
index 7e306f4..45fc3a8 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala
@@ -83,6 +83,16 @@ class NCNlpSentenceNote(private val values: Map[String, JSerializable]) extends
/**
*
+ * @param n
+ */
+ def equalsWithoutIndexes(n: NCNlpSentenceNote): Boolean =
+ this.noteType == n.noteType &&
+ this.wordIndexes.size == n.wordIndexes.size &&
+ this.wordIndexes.zip(n.wordIndexes).map(p => p._1 - p._2).distinct.size == 1 &&
+ this.clone(Seq(0), Seq(0)) == n.clone(Seq(0), Seq(0))
+
+ /**
+ *
* @return
*/
override def toAscii: String =
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index 415e6ae..c842825 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -763,9 +763,25 @@ object NCSentenceManager extends NCService {
// Drops similar sentences (with same tokens structure).
// Among similar sentences we prefer one with minimal free words count.
- sens.groupBy(notNlpNotes(_).map(_.getKey(withIndexes = false))).
+ sens = sens.groupBy(notNlpNotes(_).map(_.getKey(withIndexes = false))).
map { case (_, seq) => seq.minBy(_.filter(p => p.isNlp && !p.isStopWord).map(_.wordIndexes.length).sum) }.
toSeq
+
+ // Drops sentences if they are just subset of another.
+ // (Maybe better for lastPhase?)
+ sens = sens.filter(s1 => {
+ val notes1 = s1.tokens.flatten.distinct.filter(!_.isNlp)
+
+ !sens.exists(s2 =>
+ s1 != s2 && {
+ val notes2 = s2.tokens.flatten.distinct.filter(!_.isNlp)
+
+ notes2.size > notes1.size && notes1.forall(t1 => notes2.exists(_.equalsWithoutIndexes(t1)))
+ }
+ )
+ })
+
+ sens
}
override def start(parent: Span): NCService = {
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCEnricherLimitSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCEnricherLimitSpec.scala
index 503e093..3aee776 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCEnricherLimitSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCEnricherLimitSpec.scala
@@ -68,12 +68,6 @@ class NCEnricherLimitSpec extends NCEnricherBaseSpec {
lim(text = "handful of", limit = 5, index = 1, note = "A", asc = false),
usr(text = "A", id = "A"),
usr(text = "B", id = "B")
- ),
- Seq(
- nlp("handful"),
- nlp("of"),
- usr(text = "A", id = "A"),
- usr(text = "B", id = "B")
)
)
)