This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit f471a8be8f5210d4d1c1ad9905e3021fc04673d6
Author: Sergey Kamov <[email protected]>
AuthorDate: Sat Sep 18 12:00:24 2021 +0300

    WIP.
---
 .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 135 ++++-----
 .../mgrs/nlp/enrichers/sort/NCSortEnricher.scala   | 189 ++++++++++-----------
 .../probe/mgrs/sentence/NCSentenceManager.scala    |  11 +-
 3 files changed, 130 insertions(+), 205 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index d83ab05..f8457e8 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -19,6 +19,7 @@ package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model
 
 import io.opencensus.trace.Span
 import org.apache.nlpcraft.common._
+import org.apache.nlpcraft.common.nlp.NCNlpSentence.NoteLink
 import org.apache.nlpcraft.common.nlp.{NCNlpSentence => Sentence, NCNlpSentenceNote => NlpNote, NCNlpSentenceToken => NlpToken}
 import org.apache.nlpcraft.model._
 import org.apache.nlpcraft.model.impl.NCTokenImpl
@@ -330,7 +331,6 @@ object NCModelEnricher extends NCProbeEnricher {
         val bigSlides = slides.filter(_.size > 2)
 
         var stops4Delete: Seq[Seq[NlpToken]] =
-
             if (bigSlides.nonEmpty) {
                 val allBig = bigSlides.flatMap(p => p)
                 val stops4AllCombs = stops.filter(p => !allBig.contains(p))
@@ -534,8 +534,6 @@ object NCModelEnricher extends NCProbeEnricher {
     override def enrich(mdl: NCProbeModel, ns: Sentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = {
         require(isStarted)
 
-        //logger.info("ENRICH111")
-
         startScopedSpan(
             "enrich", parent, "srvReqId" -> ns.srvReqId, "mdlId" -> mdl.model.getId, "txt" -> ns.text
         ) { span =>
@@ -547,17 +545,16 @@ object NCModelEnricher extends NCProbeEnricher {
 //
 //            logger.info("ns.flatten.flatten.size="+ns.tokens.flatten.distinct.count(!_.isNlp))
 //
-//            ns.tokens.flatten.filter(!_.isNlp).distinct.foreach(n => {
+//            ns.tokens.flatten.filter(!_.isNlp).distinct.sortBy(p => (p.noteType, -p.tokenIndexes.size)).foreach(n => {
 //                val parts =
 //                    n.get("parts") match {
 //                        case Some(v) =>
 //                            val parts = v.asInstanceOf[java.util.List[NCTokenPartKey]].asScala
 //
-//                            "all parts=" + parts.size + " " +
 //                            parts.map(p => {
 //                                val ref = ns.tokens.find(t => t.startCharIndex == p.from && t.endCharIndex == p.to).get
 //
-//                                "part=" + p.id + " (" + ref.index + "), text=" + ref.origText
+//                                "part=" + p.id + " (idx=" + ref.index + "), text=" + ref.origText
 //                            }).mkString(" | ")
 //                        case None => "NO"
 //                    }
@@ -588,11 +585,6 @@ object NCModelEnricher extends NCProbeEnricher {
                             !greedy ||
                             !contCache(eId).exists(_.containsSlice(idxs)) && !alreadyMarked(ns, eId, toks, idxs)
                         ) {
-//                            println("!!!toks="+toks.map(_.origText).mkString(" "))
-//                            println("!!!toksExt="+toksExt.map(_.origText).mkString(" "))
-//                            println()
-
-
                             // 1. SIMPLE.
                            if (simpleEnabled && (if (idlEnabled) mdl.hasIdlSynonyms(eId) else !mdl.hasIdlSynonyms(eId))) {
                                lazy val tokStems = toks.map(_.stem).mkString(" ")
@@ -699,7 +691,38 @@ object NCModelEnricher extends NCProbeEnricher {
 
             processParsers(mdl, ns, span, req)
         }
 
-        //logger.info("ENRICH222")
+        normalize(ns)
+    }
+
+    /**
+      *
+      * @param ns
+      */
+    private def normalize(ns: Sentence): Unit = {
+        val usrNotes = ns.flatten.filter(_.isUser).distinct
+        val links = NCSentenceManager.getLinks(usrNotes)
+        val parts = NCSentenceManager.getPartKeys(usrNotes: _*)
+
+        usrNotes.
+            filter(n => !links.contains(NoteLink(n.noteType, n.tokenIndexes.sorted))).
+            filter(n => !parts.contains(NCTokenPartKey(n, ns))).
+            foreach(n => {
+                val hasBetter =
+                    usrNotes.exists(candidate =>
+                        candidate != n &&
+                        candidate.noteType == n.noteType &&
+                        candidate.dataOpt("parts") == n.dataOpt("parts") &&
+                        candidate.wordIndexes.toSet.subsetOf(n.wordIndexes.toSet) &&
+                        n.wordIndexes.filter(n => !candidate.wordIndexes.contains(n)).
+                            forall(wordIdx => ns.tokens.exists(t => t.wordIndexes.contains(wordIdx) && t.isStopWord)))
+
+                if (hasBetter) {
+                    ns.removeNote(n)
+
+                    // TODO: trace.
+                    logger.info(s"Element removed: ${n}")
+                }
+            })
     }
 
     // TODO: simplify, add tests, check model properties (sparse etc) for optimization.
@@ -730,91 +753,3 @@ object NCModelEnricher extends NCProbeEnricher {
         ))
     }
 }
-
-object x extends App {
-    case class T(index: Int, isStopWord: Boolean = false) {
-        override def toString: String = index.toString
-    }
-
-    private def combosTokens1(toks: Seq[T]): Seq[(Seq[T], Seq[T])] =
-        combos(toks).flatMap(combo => {
-            val stops = combo.filter(_.isStopWord)
-
-            val stops4Delete = Range.inclusive(1, stops.size).flatMap(stops.combinations)
-
-            (Seq(combo) ++ stops4Delete.map(del => combo.filter(t => !del.contains(t)))).map(_ -> combo)
-        }).
-            toMap.
-            filter(_._1.nonEmpty).
-            groupBy(_._1).
-            map(p => p._1 -> p._2.values.minBy(p => (-p.size, p.head.index))).
-            sortBy(p => (-p._2.size, -p._1.size, -p._2.head.index, -p._1.head.index))
-
-    private def combos[T](toks: Seq[T]): Seq[Seq[T]] = {
-        val x = (for (n <- toks.size until 0 by -1) yield toks.sliding(n)).flatten.map(p => p)
-
-        println("size=" + x.size)
-
-        x
-    }
-
-    private def combosTokens(toks: Seq[T]): Seq[(Seq[T], Seq[T])] = {
-        val value = combos(toks)
-
-        value.flatMap(combo => {
-            val stops = combo.filter(_.isStopWord)
-
-            val slides = mutable.ArrayBuffer.empty[mutable.ArrayBuffer[T]]
-
-            for (stop <- stops)
-                if (slides.nonEmpty && slides.last.last.index + 1 == stop.index)
-                    slides.last += stop
-                else
-                    slides += mutable.ArrayBuffer.empty :+ stop
-
-            val bigSlides = slides.filter(_.size >= 3)
-
-            var stops4Delete: Seq[Seq[T]] =
-                if (bigSlides.nonEmpty) {
-                    val allBig = bigSlides.flatten
-                    val stops4AllCombs = stops.filter(p => !allBig.contains(p))
-
-                    if (stops4AllCombs.nonEmpty)
-                        for (
-                            seq1 <- Range.inclusive(0, stops4AllCombs.size).flatMap(stops4AllCombs.combinations);
-                            seq2 <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations)
-                        )
-                            yield seq1 ++ seq2.flatMap(p => p)
-                    else
-                        for (seq <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations))
-                            yield seq.flatMap(p => p)
-                }
-                else
-                    Range.inclusive(1, stops.size).flatMap(stops.combinations)
-
-            stops4Delete = stops4Delete.filter(seq => !seq.contains(combo.head) && !seq.contains(combo.last))
-
-            (Seq(combo) ++ stops4Delete.map(del => combo.filter(t => !del.contains(t)))).map(_ -> combo).distinct
-        }).
-            filter(_._1.nonEmpty).
-            groupBy(_._1).
-            map(p => p._1 -> p._2.map(_._2).minBy(p => (-p.size, p.head.index))).
-            sortBy { case (data, combo) => (-combo.size, -data.size, combo.head.index, data.head.index) }
-    }
-
-    def go(): Unit = {
-        val combs = combosTokens(
-//            Seq(
-//                T(0), T(2, true), T(3, true), T(4, true), T(5), T(6), T(7, true), T(8, true), T(9), T(10, true), T(11, true), T(12)
-//            )
-
-            Range.inclusive(0, 12).map(T(_, true))
-        )
-
-        println("All=" + combs.size)
-
-        combs.foreach { case (p1, p2) => println(p1.mkString("|") + " : " + p2.mkString("|")) }
-    }
-
-    go()
-}
\ No newline at end of file
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index 286c8b4..1e31ab0 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -17,7 +17,6 @@
 
 package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.sort
 
-import java.io.Serializable
 import io.opencensus.trace.Span
 import org.apache.nlpcraft.common.NCService
 import org.apache.nlpcraft.common.makro.NCMacroParser
@@ -26,6 +25,7 @@ import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSe
 import org.apache.nlpcraft.probe.mgrs.NCProbeModel
 import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
 
+import java.io.Serializable
 import java.util.{List => JList}
 import scala.collection.mutable
 import scala.jdk.CollectionConverters._
@@ -187,59 +187,50 @@ object NCSortEnricher extends NCProbeEnricher {
      *
      * @param toksNoteData
      */
-    private def split(toks: Seq[NCNlpSentenceToken], othersRefs: Seq[NCNlpSentenceToken], toksNoteData: Seq[NoteData], nullable: Boolean): Seq[Seq[NoteData]] = {
-        val res =
-            if (toksNoteData.nonEmpty) {
-                val res = mutable.ArrayBuffer.empty[Seq[NoteData]]
-
-                /**
-                  * Returns flag which indicates are token contiguous or not.
-                  *
-                  * @param tok1Idx First token index.
-                  * @param tok2Idx Second token index.
-                  */
-                def contiguous(tok1Idx: Int, tok2Idx: Int): Boolean = {
-                    val between = toks.filter(t => t.index > tok1Idx && t.index < tok2Idx)
-
-                    between.isEmpty || between.forall(p => p.isStopWord || p.stem == stemAnd)
-                }
+    private def split(
+        toks: Seq[NCNlpSentenceToken],
+        othersRefs: Seq[NCNlpSentenceToken],
+        toksNoteData: Seq[NoteData]
+    ): Seq[Seq[NoteData]] =
+        if (toksNoteData.nonEmpty) {
+            val res = mutable.ArrayBuffer.empty[Seq[NoteData]]
+
+            /**
+              * Returns flag which indicates are token contiguous or not.
+              *
+              * @param tok1Idx First token index.
+              * @param tok2Idx Second token index.
+              */
+            def contiguous(tok1Idx: Int, tok2Idx: Int): Boolean = {
+                val between = toks.filter(t => t.index > tok1Idx && t.index < tok2Idx)
+
+                between.isEmpty || between.forall(p => p.isStopWord || p.stem == stemAnd)
+            }
 
-                val toks2 = toks.filter(othersRefs.contains)
+            val toks2 = toks.filter(othersRefs.contains)
 
-                val minIdx = toks2.dropWhile(t => !isUserNotValue(t)).head.index
-                val maxIdx = toks2.reverse.dropWhile(t => !isUserNotValue(t)).head.index
+            val minIdx = toks2.dropWhile(t => !isUserNotValue(t)).head.index
+            val maxIdx = toks2.reverse.dropWhile(t => !isUserNotValue(t)).head.index
 
-                require(minIdx <= maxIdx)
+            require(minIdx <= maxIdx)
 
-                def fill(nd: NoteData, seq: mutable.ArrayBuffer[NoteData] = mutable.ArrayBuffer.empty[NoteData]): Unit = {
-                    seq += nd
+            def fill(nd: NoteData, seq: mutable.ArrayBuffer[NoteData] = mutable.ArrayBuffer.empty[NoteData]): Unit = {
+                seq += nd
 
-                    toksNoteData.
-                        filter(p => nd.indexes.last < p.indexes.head && contiguous(nd.indexes.last, p.indexes.head)).
-                        foreach(fill(_, mutable.ArrayBuffer.empty[NoteData] ++ seq.clone()))
+                toksNoteData.
+                    filter(p => nd.indexes.last < p.indexes.head && contiguous(nd.indexes.last, p.indexes.head)).
+                    foreach(fill(_, mutable.ArrayBuffer.empty[NoteData] ++ seq.clone()))
 
-                    if (seq.nonEmpty && seq.head.indexes.head == minIdx && seq.last.indexes.last == maxIdx)
-                        res += seq
-                }
+                if (seq.nonEmpty && seq.head.indexes.head == minIdx && seq.last.indexes.last == maxIdx)
+                    res += seq
+            }
 
-                toksNoteData.filter(_.indexes.head == minIdx).foreach(p => fill(p))
+            toksNoteData.filter(_.indexes.head == minIdx).foreach(p => fill(p))
 
-                res
-            }
-            else
-                Seq.empty
-
-        if (res.isEmpty && !nullable)
-            throw new AssertionError(s"Invalid empty result " +
-                s"[tokensTexts=[${toks.map(_.origText).mkString("|")}]" +
-                s", notes=[${toks.flatten.map(n => s"${n.noteType}:[${n.tokenIndexes.mkString(",")}]").mkString("|")}]" +
-                s", tokensIndexes=[${toks.map(_.index).mkString("|")}]" +
-                s", allData=[${toksNoteData.mkString("|")}]" +
-                s"]"
-            )
-
-        res.toSeq
-    }
+            res
+        }
+        else
+            Seq.empty
 
     /**
      *
@@ -346,72 +337,78 @@ object NCSortEnricher extends NCProbeEnricher {
 
             if (data1.nonEmpty || data2.nonEmpty) {
                 val seq1 =
                     if (data1.nonEmpty)
-                        split(part1, othersRefs, data1, nullable = false)
-                    else
-                        split(part2, othersRefs, data2, nullable = false)
-                val seq2 =
-                    if (data1.nonEmpty && data2.nonEmpty)
-                        split(part2, othersRefs, data2, nullable = true)
+                        split(part1, othersRefs, data1)
                     else
-                        Seq.empty
-                val asc = orderOpt.flatMap(o => Some(order(o.synonymIndex)._2))
-
-                typ match {
-                    case TYPE_SUBJ =>
-                        require(seq1.nonEmpty)
-                        require(seq2.isEmpty)
-                        require(sortToks.nonEmpty)
-
-                        // Ignores invalid cases.
-                        if (byToks.isEmpty)
-                            res =
-                                Some(
+                        split(part2, othersRefs, data2)
+
+                if (seq1.nonEmpty) {
+                    val seq2 =
+                        if (data1.nonEmpty && data2.nonEmpty)
+                            split(part2, othersRefs, data2)
+                        else
+                            Seq.empty
+
+                    val asc = orderOpt.flatMap(o => Some(order(o.synonymIndex)._2))
+
+                    typ match {
+                        case TYPE_SUBJ =>
+                            require(seq1.nonEmpty)
+                            require(seq2.isEmpty)
+                            require(sortToks.nonEmpty)
+
+                            // Ignores invalid cases.
+                            if (byToks.isEmpty)
+                                res =
+                                    Some(
+                                        Match(
+                                            asc = asc,
+                                            main = sortToks,
+                                            stop = orderToks,
+                                            subjSeq = seq1,
+                                            bySeq = Seq.empty
+                                        )
+                                    )
+
+                        case TYPE_SUBJ_BY =>
+                            require(seq1.nonEmpty)
+                            require(sortToks.nonEmpty)
+                            require(byToks.nonEmpty)
+
+                            if (seq2.isEmpty)
+                                res = None
+                            else
                                 res = Some(
                                     Match(
                                         asc = asc,
                                         main = sortToks,
-                                        stop = orderToks,
+                                        stop = byToks ++ orderToks,
                                         subjSeq = seq1,
-                                        bySeq = Seq.empty
+                                        bySeq = seq2
                                     )
                                 )
 
-                    case TYPE_SUBJ_BY =>
-                        require(seq1.nonEmpty)
-                        require(sortToks.nonEmpty)
-                        require(byToks.nonEmpty)
+                        case TYPE_BY =>
+                            require(seq1.nonEmpty)
+                            require(seq2.isEmpty)
+                            require(sortToks.nonEmpty)
+                            require(byToks.nonEmpty)
 
-                        if (seq2.isEmpty)
-                            res = None
-                        else
+                            // `Sort by` as one element, see validation.
                             res = Some(
                                 Match(
                                     asc = asc,
-                                    main = sortToks,
-                                    stop = byToks ++ orderToks,
-                                    subjSeq = seq1,
-                                    bySeq = seq2
+                                    main = sortToks ++ byToks,
+                                    stop = orderToks,
+                                    subjSeq = Seq.empty,
+                                    bySeq = seq1
                                 )
                             )
 
-                    case TYPE_BY =>
-                        require(seq1.nonEmpty)
-                        require(seq2.isEmpty)
-                        require(sortToks.nonEmpty)
-                        require(byToks.nonEmpty)
-
-                        // `Sort by` as one element, see validation.
-                        res = Some(
-                            Match(
-                                asc = asc,
-                                main = sortToks ++ byToks,
-                                stop = orderToks,
-                                subjSeq = Seq.empty,
-                                bySeq = seq1
-                            )
-                        )
-
-                    case _ => throw new AssertionError(s"Unexpected type: $typ")
+                        case _ => throw new AssertionError(s"Unexpected type: $typ")
+                    }
                 }
+                else
+                    None
             }
         case None => // No-op.
     }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index 41fc484..d85c9d6 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -48,7 +48,7 @@ object NCSentenceManager extends NCService {
      *
      * @param notes
      */
-    private def getLinks(notes: Seq[NCNlpSentenceNote]): Seq[NoteLink] = {
+    def getLinks(notes: Seq[NCNlpSentenceNote]): Seq[NoteLink] = {
         val noteLinks = mutable.ArrayBuffer.empty[NoteLink]
 
         for (n <- notes.filter(n => n.noteType == "nlpcraft:limit" || n.noteType == "nlpcraft:references"))
@@ -79,7 +79,7 @@ object NCSentenceManager extends NCService {
      *
      * @param notes
      */
-    private def getPartKeys(notes: NCNlpSentenceNote*): Seq[NCTokenPartKey] =
+    def getPartKeys(notes: NCNlpSentenceNote*): Seq[NCTokenPartKey] =
         notes.
             filter(_.isUser).
             flatMap(n => {
@@ -677,15 +677,9 @@ object NCSentenceManager extends NCService {
             map { case (_, seq) => seq.map { case (_, note) => note }.toSet }.
             toSeq.sortBy(-_.size)
 
-//        println("!!!!!toksByIdx.size="+toksByIdx.size)
-//        println("!!!!!toksByIdx.ALL-sizes-sum="+toksByIdx.map(_.size).sum)
-//        println("!!!!!toksByIdx.all-sized="+toksByIdx.map(_.size))
-
         def findCombinations(): Seq[Seq[NCNlpSentenceNote]] = {
             val res = NCSentenceHelper.findCombinations(toksByIdx.map(_.asJava).asJava, pool).asScala.map(_.asScala.toSeq)
 
-//            println("!!! combinations=" + res.size)
-
             res
         }
@@ -742,7 +736,6 @@ object NCSentenceManager extends NCService {
                 )
             )
 
-
         def notNlpNotes(s: NCNlpSentence): Seq[NCNlpSentenceNote] = s.flatten.filter(!_.isNlp)
 
         // Drops similar sentences (with same notes structure). Keeps with more found.
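Note on the new normalize(ns) pass in NCModelEnricher: it drops a user-defined note when another note of the same type (and with the same "parts" data) covers a subset of its word indexes and every word it gives up is a stopword, skipping notes referenced by links or part keys. A standalone sketch of just that subsumption rule; `Note` and `isStop` are hypothetical stand-ins for NCNlpSentenceNote and the sentence's stopword lookup, and the links/parts guards are omitted:

    // Sketch of the subsumption predicate behind normalize(ns).
    object NormalizeSketch extends App {
        case class Note(noteType: String, wordIndexes: Set[Int])

        val isStop: Int => Boolean = Set(2, 3) // Assume words 2 and 3 are stopwords.

        // A note is redundant if a same-type candidate marks a subset of its
        // words and every word the candidate drops is a stopword.
        def hasBetter(n: Note, all: Seq[Note]): Boolean =
            all.exists(c =>
                c != n &&
                c.noteType == n.noteType &&
                c.wordIndexes.subsetOf(n.wordIndexes) &&
                (n.wordIndexes -- c.wordIndexes).forall(isStop))

        val notes = Seq(
            Note("myElem", Set(1, 2, 3, 4)), // Spans the stopwords 2 and 3.
            Note("myElem", Set(1, 4))        // Same element without them.
        )

        // Keeps only Note(myElem,Set(1, 4)); the wider note is subsumed.
        notes.filter(n => !hasBetter(n, notes)).foreach(println)
    }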
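The deleted `object x` prototype exercised the enumeration that combosTokens performs: every contiguous token window plus variants of each window with inner stopwords removed, with contiguous stopword runs ("slides") handled as units. A compact sketch of the basic idea under the same T token type; it deletes plain stopword combinations and leaves out the slide optimization for runs of 3+ stopwords:

    // Sketch of the window-plus-stopword-deletion enumeration from combosTokens.
    object CombosSketch extends App {
        case class T(index: Int, isStopWord: Boolean = false) {
            override def toString: String = index.toString
        }

        // All contiguous windows, longest first (as in combos()).
        def windows(toks: Seq[T]): Seq[Seq[T]] =
            (toks.size until 0 by -1).flatMap(n => toks.sliding(n).toSeq)

        // A window plus every variant obtained by deleting a non-empty
        // combination of its stopwords, keeping the edge tokens intact.
        def variants(combo: Seq[T]): Seq[Seq[T]] = {
            val stops = combo.filter(_.isStopWord)
            val dels = (1 to stops.size).flatMap(stops.combinations).
                filter(del => !del.contains(combo.head) && !del.contains(combo.last))

            combo +: dels.map(del => combo.filterNot(del.contains))
        }

        val toks = Seq(T(0), T(1, isStopWord = true), T(2, isStopWord = true), T(3))

        windows(toks).flatMap(variants).distinct.foreach(v => println(v.mkString("|")))
    }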
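The split() refactor in NCSortEnricher also changes its error contract: the nullable flag is gone and an empty result no longer throws an AssertionError; the caller now guards with seq1.nonEmpty and simply builds no Match. A toy sketch of that contract change, where splitOld/splitNew are illustrative stand-ins rather than the real signatures:

    // Before: split(..., nullable = false) threw on an empty result.
    // After: split(...) may return Seq.empty and the caller bails out.
    object SplitContractSketch extends App {
        def splitOld[A](data: Seq[A], nullable: Boolean): Seq[A] =
            if (data.nonEmpty) data
            else if (nullable) Seq.empty
            else throw new AssertionError("Invalid empty result")

        def splitNew[A](data: Seq[A]): Seq[A] = data // May legitimately be empty.

        val seq1 = splitNew(Seq.empty[Int])

        // Caller-side guard replaces the assertion (mirrors `if (seq1.nonEmpty)`).
        val res = if (seq1.nonEmpty) Some(seq1) else None

        println(res) // None: no Match is built instead of an AssertionError.
    }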
