This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-443 by this push:
new 0d2c798 WIP.
0d2c798 is described below
commit 0d2c7985d508e674ecbe6db6ebc312b9263aa2ce
Author: Sergey Kamov <[email protected]>
AuthorDate: Wed Sep 15 13:49:23 2021 +0300
WIP.
---
.../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 55 ++++++++++++----------
.../model/stop/NCStopWordsInsideSpec.scala | 23 ++++++---
2 files changed, 47 insertions(+), 31 deletions(-)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 6e6f7d1..4823a68 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -194,7 +194,10 @@ object NCModelEnricher extends NCProbeEnricher {
toks.foreach(_.add(note))
// For NLP elements.
- toks.foreach(t => ns.fixNote(t.getNlpNote, "direct" -> direct))
+ toks.foreach(t => {
+ ns.fixNote(t.getNlpNote, "direct" -> direct)
+ ns.fixNote(t.getNlpNote, "stopWord" -> false)
+ })
}
/**
@@ -282,23 +285,26 @@ object NCModelEnricher extends NCProbeEnricher {
}
/**
- * Gets all sequential permutations of given tokens.
*
- * For example, if buffer contains "a b c d" tokens, then this function
will return the
- * sequence of following token sequences in this order:
- * "a b c d"
- * "a b c"
- * "b c d"
- * "a b"
- * "b c"
- * "c d"
- * "a"
- * "b"
- * "c"
- * "d"
+ * @param toks
+ */
+ private def combosNlpTokens(toks: Seq[NlpToken]): Seq[(Seq[NlpToken],
Seq[NlpToken])] =
+ combos(toks).flatMap(combo => {
+ val stops = combo.filter(_.isStopWord)
+
+ val stops4Delete = Range.inclusive(1,
stops.size).flatMap(stops.combinations)
+
+ (Seq(combo) ++ stops4Delete.map(del => combo.filter(t =>
!del.contains(t)))).map(_ -> combo)
+ }).
+ toMap.
+ filter(_._1.nonEmpty).
+ groupBy(_._1).
+ map(p => p._1 -> p._2.values.minBy(p => (-p.size, p.head.index))).
+ sortBy(p => (-p._2.size, -p._1.size, -p._2.head.index,
-p._1.head.index))
+
+ /**
*
* @param toks
- * @return
*/
private def combos[T](toks: Seq[T]): Seq[Seq[T]] =
(for (n <- toks.size until 0 by -1) yield
toks.sliding(n)).flatten.map(p => p)
@@ -451,7 +457,7 @@ object NCModelEnricher extends NCProbeEnricher {
"enrich", parent, "srvReqId" -> ns.srvReqId, "mdlId" ->
mdl.model.getId, "txt" -> ns.text
) { span =>
val req = NCRequestImpl(senMeta, ns.srvReqId)
- val combToks = combos(ns.toSeq)
+ val combToks = combosNlpTokens(ns.toSeq)
lazy val ch = mkComplexes(mdl, ns)
def execute(simpleEnabled: Boolean, idlEnabled: Boolean): Unit =
@@ -466,7 +472,9 @@ object NCModelEnricher extends NCProbeEnricher {
lazy val idlCache = mutable.HashSet.empty[Seq[Complex]]
for (
- toks <- combToks;
+ // toksExt is a part of the sentence.
+ // toks is either toksExt itself or toksExt with some of its stopwords
removed. All stopword combinations are taken into account.
+ (toks, toksExt) <- combToks;
idxs = toks.map(_.index);
e <- mdl.elements.values;
eId = e.getId;
@@ -489,7 +497,7 @@ object NCModelEnricher extends NCProbeEnricher {
syns.get(tokStems) match {
case Some(s) =>
found = true
- add("simple continuous",
ns, contCache, eId, greedy, toks, idxs, s)
+ add("simple continuous",
ns, contCache, eId, greedy, toksExt, idxs, s)
case None => notFound()
}
@@ -497,7 +505,7 @@ object NCModelEnricher extends NCProbeEnricher {
for (s <- syns if !found)
if (s.isMatch(toks)) {
found = true
- add("simple continuous
scan", ns, contCache, eId, greedy, toks, idxs, s)
+ add("simple continuous
scan", ns, contCache, eId, greedy, toksExt, idxs, s)
}
tryMap(
@@ -527,7 +535,6 @@ object NCModelEnricher extends NCProbeEnricher {
lazy val allCombs = mkCombinations(ch, toks,
idlCache.toSet)
// 2.1 Continuous.
-
if (!mdl.hasSparseSynonyms) {
var found = false
@@ -538,7 +545,7 @@ object NCModelEnricher extends NCProbeEnricher {
data = comb.map(_.data)
)
if (s.isMatch(data, req)) {
- add("IDL continuous", ns, contCache,
eId, greedy, toks, idxs, s, toParts(data, s))
+ add("IDL continuous", ns, contCache,
eId, greedy, toksExt, idxs, s, toParts(data, s))
idlCache += comb
@@ -598,9 +605,9 @@ object NCModelEnricher extends NCProbeEnricher {
||
(
n.tokenIndexes == toksIdxsSorted ||
- n.tokenIndexes.containsSlice(toksIdxsSorted) &&
- U.isContinuous(toksIdxsSorted) &&
- U.isContinuous(n.tokenIndexes)
+ n.tokenIndexes.containsSlice(toksIdxsSorted) &&
+ U.isContinuous(toksIdxsSorted) &&
+ U.isContinuous(n.tokenIndexes)
)
)
))
diff --git
a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala
b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala
index 64ddf02..cac6983 100644
---
a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala
+++
b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala
@@ -31,7 +31,7 @@ class NCStopWordsInsideModel extends
NCModelAdapter("nlpcraft.test", "Test Model
override def getElements: util.Set[NCElement] =
Set(NCTestElement("complex", "a b"))
@NCIntent("intent=i term={# == 'complex'}")
- private def onI(): NCResult = NCResult.text("OK")
+ def onI(): NCResult = NCResult.text("OK")
}
/**
@@ -42,10 +42,19 @@ class NCStopWordsInsideSpec extends NCTestContext {
@Test
def test(): Unit = {
checkIntent("a b", "i")
- checkResult("a the b", "i")
- checkResult("a , b", "i")
- checkResult("a,,b", "i")
- checkResult("a, ,b", "i")
- checkResult("a, the,b", "i")
+ checkIntent("a the b", "i")
+ checkIntent("a , b", "i")
+ checkIntent("a, b", "i")
+ checkIntent("a, the b", "i")
+ checkIntent("a, the, b", "i")
}
-}
\ No newline at end of file
+}
+
+class NCStopWordsInsideSparseModel extends NCStopWordsInsideModel {
+ override def isPermutateSynonyms: Boolean = true
+ override def isSparse: Boolean = true
+}
+
+@NCTestEnvironment(model = classOf[NCStopWordsInsideSparseModel], startClient
= true)
+class NCStopWordsInsideSparseSpec extends NCStopWordsInsideSpec
+