This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 1891c2c2d8e2441d0ad60a2a75f3fafee0dbaa04
Author: Sergey Kamov <[email protected]>
AuthorDate: Fri Sep 17 11:55:48 2021 +0300

    WIP.
---
 .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 216 +++++++++++++++++++--
 .../probe/mgrs/sentence/NCSentenceManager.scala    |  13 +-
 .../model/stop/NCStopWordsInsideSpec.scala         |  11 +-
 .../model/NCEnricherNestedModelSpec4.scala         |  81 ++++++--
 4 files changed, 280 insertions(+), 41 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 22af412..d83ab05 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -21,6 +21,7 @@ import io.opencensus.trace.Span
 import org.apache.nlpcraft.common._
 import org.apache.nlpcraft.common.nlp.{NCNlpSentence => Sentence, NCNlpSentenceNote => NlpNote, NCNlpSentenceToken => NlpToken}
 import org.apache.nlpcraft.model._
+import org.apache.nlpcraft.model.impl.NCTokenImpl
 import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCIdlContent
 import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.NCSynonymChunkKind
 import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
@@ -285,11 +286,7 @@ object NCModelEnricher extends NCProbeEnricher {
         }
     }

-    /**
-     *
-     * @param toks
-     */
-    private def combosNlpTokens(toks: Seq[NlpToken]): Seq[(Seq[NlpToken], Seq[NlpToken])] =
+    private def combosTokens1(toks: Seq[NlpToken]): Seq[(Seq[NlpToken], Seq[NlpToken])] =
         combos(toks).flatMap(combo => {
             val stops = combo.filter(_.isStopWord)
@@ -303,6 +300,64 @@
             map(p => p._1 -> p._2.values.minBy(p => (-p.size, p.head.index))).
             sortBy(p => (-p._2.size, -p._1.size, -p._2.head.index, -p._1.head.index))
+
+    /**
+     * 1. Prepares sliding combinations of tokens.
+     *    Example: 'A B C D' ->
+     *    {'A B C D', 'A B C', 'B C D', 'A B', 'B C', 'C D', 'A', 'B', 'C', 'D'}
+     *    One 4-token sentence is converted into 10 pieces.
+     *
+     * 2. Additionally, each piece is expanded into a set of variants covering all possible
+     *    combinations of its stopwords being dropped.
+     *    Example: the piece 'x1, x2(stopword), x3(stopword), x4' will be expanded into
+     *    {'x1, x2, x3, x4', 'x1, x2, x4', 'x1, x3, x4', 'x1, x4'}
+     *
+     * 3. All variants are collected, duplicates are deleted, etc.
+     *
+     * @param toks
+     */
+    private def combosTokens(toks: Seq[NlpToken]): Seq[(Seq[NlpToken], Seq[NlpToken])] =
+        combos(toks).flatMap(combo => {
+            val stops = combo.filter(s => s.isStopWord && s != combo.head && s != combo.last)
+
+            val slides = mutable.ArrayBuffer.empty[mutable.ArrayBuffer[NlpToken]]
+
+            for (stop <- stops)
+                if (slides.nonEmpty && slides.last.last.index + 1 == stop.index)
+                    slides.last += stop
+                else
+                    slides += mutable.ArrayBuffer.empty :+ stop
+
+            val bigSlides = slides.filter(_.size > 2)
+
+            var stops4Delete: Seq[Seq[NlpToken]] =
+                if (bigSlides.nonEmpty) {
+                    val allBig = bigSlides.flatMap(p => p)
+                    val stops4AllCombs = stops.filter(p => !allBig.contains(p))
+
+                    if (stops4AllCombs.nonEmpty)
+                        for (
+                            seq1 <- Range.inclusive(0, stops4AllCombs.size).flatMap(stops4AllCombs.combinations);
+                            seq2 <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations)
+                        )
+                            yield seq1 ++ seq2.flatMap(p => p)
+                    else
+                        for (seq <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations))
+                            yield seq.flatMap(p => p)
+                }
+                else
+                    Range.inclusive(1, stops.size).flatMap(stops.combinations)
+
+            stops4Delete = stops4Delete.filter(seq => !seq.contains(combo.head) && !seq.contains(combo.last))
+
+            (Seq(combo) ++ stops4Delete.map(del => combo.filter(t => !del.contains(t)))).map(_ -> combo).distinct
+        }).
+            filter(_._1.nonEmpty).
+            groupBy(_._1).
+            toSeq.
+            map(p => p._1 -> p._2.map(_._2).minBy(p => (-p.size, p.head.index))).
+            sortBy { case (data, combo) => (-combo.size, -data.size, combo.head.index, data.head.index) }

     /**
      *
      * @param toks
      */
@@ -315,9 +370,18 @@
      * @param seq
      * @param s
      */
-    private def toParts(seq: Seq[NCIdlContent], s: Synonym): Seq[TokType] =
+    private def toParts(mdl: NCProbeModel, stvReqId: String, seq: Seq[NCIdlContent], s: Synonym): Seq[TokType] =
         seq.zip(s.map(_.kind)).flatMap {
-            case (complex, kind) => if (complex.isLeft) Some(complex.swap.toOption.get -> kind) else None
+            case (complex, kind) =>
+                if (complex.isLeft)
+                    Some(complex.swap.toOption.get -> kind)
+                else {
+                    val clone = complex.toOption.get.clone()
+
+                    clone.filter(!_.isNlp).foreach(clone.remove)
+
+                    Some(NCTokenImpl(mdl, stvReqId, clone) -> kind)
+                }
         }

     /**
@@ -457,8 +521,8 @@
      */
     private def getSparsedTokens(matched: Seq[NlpToken], toks2Match: Seq[NlpToken]): Seq[NlpToken] = {
         require(matched.nonEmpty)
-        // Matched tokens should be already sorted.
+        val stopsInside = toks2Match.filter(t =>
+            t.isStopWord && !matched.contains(t) && t.index > matched.head.index && t.index < matched.last.index
+        )
@@ -470,13 +534,38 @@
     override def enrich(mdl: NCProbeModel, ns: Sentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = {
         require(isStarted)

+        //logger.info("ENRICH111")
+
         startScopedSpan(
             "enrich", parent, "srvReqId" -> ns.srvReqId, "mdlId" -> mdl.model.getId, "txt" -> ns.text
         ) { span =>
             val req = NCRequestImpl(senMeta, ns.srvReqId)
-            val combToks = combosNlpTokens(ns.toSeq)
+            val combToks = combosTokens(ns.toSeq)
             lazy val ch = mkComplexes(mdl, ns)

+//            logger.info("combToks="+combToks.size)
+//
+//            logger.info("ns.flatten.flatten.size="+ns.tokens.flatten.distinct.count(!_.isNlp))
+//
+//            ns.tokens.flatten.filter(!_.isNlp).distinct.foreach(n => {
+//                val parts =
+//                    n.get("parts") match {
+//                        case Some(v) =>
+//                            val parts = v.asInstanceOf[java.util.List[NCTokenPartKey]].asScala
+//
+//                            "all parts=" + parts.size + " " +
+//                            parts.map(p => {
+//                                val ref = ns.tokens.find(t => t.startCharIndex == p.from && t.endCharIndex == p.to).get
+//
+//                                "part=" + p.id + " (" + ref.index + "), text=" + ref.origText
+//                            }).mkString(" | ")
+//                        case None => "NO"
+//                    }
+//                logger.info(s"${n.noteType} [${n.wordIndexes.mkString(",")}], parts=$parts")
+//            })
+//
+//            logger.info("---")
+
             def execute(simpleEnabled: Boolean, idlEnabled: Boolean): Unit =
                 startScopedSpan(
                     "execute", span, "srvReqId" -> ns.srvReqId, "mdlId" -> mdl.model.getId, "txt" -> ns.text
@@ -489,8 +578,7 @@
                     lazy val idlCache = mutable.HashSet.empty[Seq[Complex]]

                     for (
-                        // toksExt is part of sentence.
-                        // toks is toksExt or toksExt without some stopwords set. All stopwords combinations are taking into account.
+                        // 'toksExt' is a piece of the sentence; 'toks' is either 'toksExt' itself or 'toksExt' with some of its stopwords dropped.
                         (toks, toksExt) <- combToks;
                         idxs = toks.map(_.index);
                         e <- mdl.elements.values;
@@ -500,6 +588,11 @@
                         !greedy ||
                             !contCache(eId).exists(_.containsSlice(idxs)) && !alreadyMarked(ns, eId, toks, idxs)
                     ) {
+//                        println("!!!toks="+toks.map(_.origText).mkString(" "))
+//                        println("!!!toksExt="+toksExt.map(_.origText).mkString(" "))
+//                        println()
+
+                        // 1. SIMPLE.
                         if (simpleEnabled && (if (idlEnabled) mdl.hasIdlSynonyms(eId) else !mdl.hasIdlSynonyms(eId))) {
                             lazy val tokStems = toks.map(_.stem).mkString(" ")
@@ -542,9 +635,6 @@
                             for (s <- get(mdl.sparseSynonyms, eId))
                                 s.sparseMatch(toks) match {
                                     case Some(res) =>
-//                                        println("!!!toks="+toks.map(_.origText))
-//                                        println("!!!res="+res.map(_.origText))
-//                                        println
                                         add("simple sparse", ns, contCache, eId, greedy, getSparsedTokens(res, toks), idxs, s)
                                     case None => // No-op.
                                 }
@@ -566,7 +656,9 @@
                                     data = comb.map(_.data)
                                 )
                                     if (s.isMatch(data, req)) {
-                                        add("IDL continuous", ns, contCache, eId, greedy, toksExt, idxs, s, toParts(data, s))
+                                        val parts = toParts(mdl, ns.srvReqId, data, s)
+
+                                        add("IDL continuous", ns, contCache, eId, greedy, toksExt, idxs, s, parts)

                                         idlCache += comb
@@ -583,7 +675,9 @@
                                     case Some(res) =>
                                         val typ = if (s.sparse) "IDL sparse" else "IDL continuous"

-                                        add(typ, ns, contCache, eId, greedy, getSparsedTokens(toTokens(res, ns), toTokens(comb.map(_.data), ns)), idxs, s, toParts(res, s))
+                                        val parts = toParts(mdl, ns.srvReqId, res, s)
+
+                                        add(typ, ns, contCache, eId, greedy, getSparsedTokens(toTokens(res, ns), toTokens(comb.map(_.data), ns)), idxs, s, parts)

                                         idlCache += comb
                                     case None => // No-op.
@@ -604,6 +698,8 @@

             processParsers(mdl, ns, span, req)
         }
+
+        //logger.info("ENRICH222")
     }

     // TODO: simplify, add tests, check model properties (sparse etc) for optimization.
@@ -633,4 +729,92 @@
             )
         ))
     }
+}
+
+object x extends App {
+    case class T(index: Int, isStopWord: Boolean = false) {
+        override def toString: String = index.toString
+    }
+
+    private def combosTokens1(toks: Seq[T]): Seq[(Seq[T], Seq[T])] =
+        combos(toks).flatMap(combo => {
+            val stops = combo.filter(_.isStopWord)
+
+            val stops4Delete = Range.inclusive(1, stops.size).flatMap(stops.combinations)
+
+            (Seq(combo) ++ stops4Delete.map(del => combo.filter(t => !del.contains(t)))).map(_ -> combo)
+        }).
+            toMap.
+            filter(_._1.nonEmpty).
+            groupBy(_._1).
+            toSeq.
+            map(p => p._1 -> p._2.values.minBy(p => (-p.size, p.head.index))).
+            sortBy(p => (-p._2.size, -p._1.size, -p._2.head.index, -p._1.head.index))
+
+    private def combos[T](toks: Seq[T]): Seq[Seq[T]] = {
+        val x = (for (n <- toks.size until 0 by -1) yield toks.sliding(n)).flatten.map(p => p)
+
+        println("size=" + x.size)
+
+        x
+    }
+
+    private def combosTokens(toks: Seq[T]): Seq[(Seq[T], Seq[T])] = {
+        val value = combos(toks)
+
+        value.flatMap(combo => {
+            val stops = combo.filter(_.isStopWord)
+
+            val slides = mutable.ArrayBuffer.empty[mutable.ArrayBuffer[T]]
+
+            for (stop <- stops)
+                if (slides.nonEmpty && slides.last.last.index + 1 == stop.index)
+                    slides.last += stop
+                else
+                    slides += mutable.ArrayBuffer.empty :+ stop
+
+            val bigSlides = slides.filter(_.size >= 3)
+
+            var stops4Delete: Seq[Seq[T]] =
+                if (bigSlides.nonEmpty) {
+                    val allBig = bigSlides.flatten
+                    val stops4AllCombs = stops.filter(p => !allBig.contains(p))
+
+                    if (stops4AllCombs.nonEmpty)
+                        for (
+                            seq1 <- Range.inclusive(0, stops4AllCombs.size).flatMap(stops4AllCombs.combinations);
+                            seq2 <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations)
+                        )
+                            yield seq1 ++ seq2.flatMap(p => p)
+                    else
+                        for (seq <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations))
+                            yield seq.flatMap(p => p)
+                }
+                else
+                    Range.inclusive(1, stops.size).flatMap(stops.combinations)
+
+            stops4Delete = stops4Delete.filter(seq => !seq.contains(combo.head) && !seq.contains(combo.last))
+
+            (Seq(combo) ++ stops4Delete.map(del => combo.filter(t => !del.contains(t)))).map(_ -> combo).distinct
+        }).
+            filter(_._1.nonEmpty).
+            groupBy(_._1).
+            toSeq.
+            map(p => p._1 -> p._2.map(_._2).minBy(p => (-p.size, p.head.index))).
+            sortBy { case (data, combo) => (-combo.size, -data.size, combo.head.index, data.head.index) }
+    }
+
+    def go(): Unit = {
+        val combs = combosTokens(
+//            Seq(
+//                T(0), T(2, true), T(3, true), T(4, true), T(5), T(6), T(7, true), T(8, true), T(9), T(10, true), T(11, true), T(12)
+//            )
+
+            Range.inclusive(0, 12).map(T(_, true))
+        )
+
+        println("All=" + combs.size)
+
+        combs.foreach { case (p1, p2) => println(p1.mkString("|") + " : " + p2.mkString("|")) }
+    }
+
+    go()
+}
\ No newline at end of file
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index f6855ea..41fc484 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -676,9 +676,18 @@ object NCSentenceManager extends NCService {
             groupBy { case (idx, _) => idx }.
             map { case (_, seq) => seq.map { case (_, note) => note }.toSet }.
             toSeq.sortBy(-_.size)
+
+//        println("!!!!!toksByIdx.size="+toksByIdx.size)
+//        println("!!!!!toksByIdx.ALL-sizes-sum="+toksByIdx.map(_.size).sum)
+//        println("!!!!!toksByIdx.all-sized="+toksByIdx.map(_.size))

-        def findCombinations(): Seq[Seq[NCNlpSentenceNote]] =
-            NCSentenceHelper.findCombinations(toksByIdx.map(_.asJava).asJava, pool).asScala.map(_.asScala.toSeq)
+        def findCombinations(): Seq[Seq[NCNlpSentenceNote]] = {
+            val res = NCSentenceHelper.findCombinations(toksByIdx.map(_.asJava).asJava, pool).asScala.map(_.asScala.toSeq)
+
+//            println("!!! combinations=" + res.size)
+
+            res
+        }

         val seqSens =
             combCache.
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala
index 9e3e911..3cc26f1 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala
@@ -32,9 +32,7 @@ class NCStopWordsInsideModel extends NCModelAdapter("nlpcraft.test", "Test Model
     override def getElements: util.Set[NCElement] = Set(NCTestElement("complex", "a b"))

     @NCIntent("intent=i term={# == 'complex'}")
-    def onI(
-        ctx: NCIntentMatch
-    ): NCResult = {
+    def onI(ctx: NCIntentMatch): NCResult = {
         require(ctx.getContext.getVariants.size() == 1)
         require(ctx.getContext.getVariants.asScala.head.asScala.size == 1)
         require(ctx.getContext.getVariants.asScala.head.asScala.head.getNormalizedText == ctx.getContext.getRequest.getNormalizedText)
@@ -68,12 +66,7 @@ class NCStopWordsInsideSparseModel extends NCStopWordsInsideModel {
 class NCStopWordsInsideSparseSpec extends NCStopWordsInsideSpec {
     @Test
     def test2(): Unit = {
-        //checkIntent("a b", "i")
-        checkIntent("a the b", "i")
-//        checkIntent("a , b", "i")
-//        checkIntent("a, b", "i")
-//        checkIntent("a, the b", "i")
-//        checkIntent("a, the, b", "i")
+        // TODO:
     }
 }

diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
index 27082f1..825e4a2 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
@@ -24,10 +24,8 @@ import org.junit.jupiter.api.Test
 import java.util
 import scala.jdk.CollectionConverters.SetHasAsJava

-/**
- * Nested Elements test model.
- */
-class NCNestedTestModel41 extends NCModelAdapter("nlpcraft.nested4.test.mdl", "Nested Test Model", "1.0") {
+// It shouldn't be too slow.
+class NCNestedTestModel4Adapter extends NCModelAdapter("nlpcraft.nested4.test.mdl", "Nested Test Model", "1.0") {
     override def getElements: util.Set[NCElement] =
         Set(
             NCTestElement("e1", "//[a-zA-Z0-9]+//"),
@@ -36,34 +34,89 @@ class NCNestedTestModel41 extends NCModelAdapter("nlpcraft.nested4.test.mdl", "N
     override def getAbstractTokens: util.Set[String] = Set("e1").asJava
     override def getEnabledBuiltInTokens: util.Set[String] = Set.empty[String].asJava
+}

-    @NCIntent("intent=onE2 term(t1)={# == 'e2'}[8, 100]")
-    def onAB(ctx: NCIntentMatch): NCResult = NCResult.text("OK")
+/**
+ * Greedy (one element expected) + not permuted.
+ */
+class NCNestedTestModel41 extends NCNestedTestModel4Adapter {
+    @NCIntent("intent=onE2 term(t1)={# == 'e2'}")
+    def onAB(): NCResult = NCResult.text("OK")

+    override def isGreedy: Boolean = true
     override def isPermutateSynonyms: Boolean = false
     override def isSparse: Boolean = false
 }

 /**
- * It shouldn't be too slow.
+ *
  */
 @NCTestEnvironment(model = classOf[NCNestedTestModel41], startClient = true)
 class NCEnricherNestedModelSpec41 extends NCTestContext {
-    @Test
+    // @Test
     def test(): Unit = checkIntent("the a " * 11, "onE2")
+}
+
+/**
+ * Not-greedy (few elements expected) + not permuted.
+ */
+class NCNestedTestModel42 extends NCNestedTestModel4Adapter {
+    @NCIntent("intent=onE2 term(t1)={# == 'e2'}[3, 100]")
+    def onAB(): NCResult = NCResult.text("OK")
+
+    override def isGreedy: Boolean = false
+    override def isPermutateSynonyms: Boolean = false
+    override def isSparse: Boolean = false
+}
+
+/**
+ *
+ */
+@NCTestEnvironment(model = classOf[NCNestedTestModel42], startClient = true)
+class NCEnricherNestedModelSpec42 extends NCTestContext {
+    // @Test
+    def test(): Unit = checkIntent("the a " * 11, "onE2")
+}

-class NCNestedTestModel42 extends NCNestedTestModel41 {
+/**
+ * Greedy (one element expected) + permuted.
+ */
+class NCNestedTestModel43 extends NCNestedTestModel4Adapter {
+    @NCIntent("intent=onE2 term(t1)={# == 'e2'}[1, 100]")
+    def onAB(): NCResult = NCResult.text("OK")
+
+    override def isGreedy: Boolean = true
     override def isPermutateSynonyms: Boolean = true
     override def isSparse: Boolean = true
 }

 /**
- * It shouldn't be too slow.
+ *
  */
-@NCTestEnvironment(model = classOf[NCNestedTestModel42], startClient = true)
-class NCEnricherNestedModelSpec42 extends NCTestContext {
-    @Test
-    def test(): Unit = checkIntent("the a " * 8, "onE2")
+@NCTestEnvironment(model = classOf[NCNestedTestModel43], startClient = true)
+class NCEnricherNestedModelSpec43 extends NCTestContext {
+    // @Test
+    def test(): Unit = checkIntent("the a " * 4, "onE2")
+}
+
+/**
+ * Not-greedy (few elements expected) + permuted.
+ */
+class NCNestedTestModel44 extends NCNestedTestModel4Adapter {
+    @NCIntent("intent=onE2 term(t1)={# == 'e2'}[3, 100]")
+    def onAB(): NCResult = NCResult.text("OK")
+
+    override def isGreedy: Boolean = false
+    override def isPermutateSynonyms: Boolean = true
+    override def isSparse: Boolean = true
+}
+
+/**
+ *
+ */
+@NCTestEnvironment(model = classOf[NCNestedTestModel44], startClient = true)
+class NCEnricherNestedModelSpec44 extends NCTestContext {
+    // @Test
+    def test(): Unit = checkIntent("the a " * 2, "onE2")
 }
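
Note: the expansion implemented by the new combosTokens() above can be sketched independently of the probe internals. Below is a minimal, self-contained approximation, assuming a simplified stand-in token type (Tok is illustrative, not part of the NLPCraft API); it covers only the basic branch in which every combination of inner stopwords is dropped, without the bigSlides optimization or the final dedup/sort step:

    object CombosSketch extends App {
        // Simplified stand-in for NCNlpSentenceToken ('Tok' is hypothetical, for illustration only).
        case class Tok(text: String, index: Int, isStopWord: Boolean = false)

        // Step 1: sliding windows of every length, longest first:
        // 'A B C D' -> 'A B C D', 'A B C', 'B C D', 'A B', 'B C', 'C D', 'A', 'B', 'C', 'D'.
        def combos(toks: Seq[Tok]): Seq[Seq[Tok]] =
            (for (n <- toks.size until 0 by -1) yield toks.sliding(n)).flatten

        // Step 2: each window is paired with its variants, produced by removing every
        // combination of its inner stopwords (head and last tokens are always kept).
        def expand(toks: Seq[Tok]): Seq[(Seq[Tok], Seq[Tok])] =
            combos(toks).flatMap(combo => {
                val stops = combo.filter(t => t.isStopWord && t != combo.head && t != combo.last)
                val dels = Range.inclusive(1, stops.size).flatMap(stops.combinations)

                (Seq(combo) ++ dels.map(del => combo.filterNot(del.contains))).map(_ -> combo).distinct
            })

        // 'x1 x2(stop) x3(stop) x4' -> variants of the full window include
        // 'x1 x2 x3 x4', 'x1 x2 x4', 'x1 x3 x4' and 'x1 x4'.
        val piece = Seq(Tok("x1", 0), Tok("x2", 1, isStopWord = true), Tok("x3", 2, isStopWord = true), Tok("x4", 3))

        expand(piece).foreach { case (variant, window) =>
            println(variant.map(_.text).mkString(" ") + "  <-  " + window.map(_.text).mkString(" "))
        }
    }

Running it prints each variant next to the sliding window that produced it; the real method additionally deduplicates variants, keeps the largest producing window for each, and sorts the result.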
