This is an automated email from the ASF dual-hosted git repository. sergeykamov pushed a commit to branch NLPCRAFT-287 in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit bbeecf6ad83eb7941676e32298859989e0f89bc0 Author: Sergey Kamov <[email protected]> AuthorDate: Thu Apr 8 22:48:24 2021 +0300 WIP. --- .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 74 ++++++++++------------ 1 file changed, 35 insertions(+), 39 deletions(-) diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala index 5169afe..0ec40cd 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala @@ -26,15 +26,17 @@ import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.{NCSynonymChunkKin import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager -import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeVariants, NCProbeSynonym ⇒ Synonym} +import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeVariants, NCProbeSynonym ⇒Synonym} import java.io.Serializable import java.util +import java.util.{List ⇒ JList} import scala.collection.JavaConverters._ import scala.collection.convert.DecorateAsScala import scala.collection.mutable.ArrayBuffer import scala.collection.{Map, Seq, mutable} + /** * Model elements enricher. */ @@ -133,7 +135,8 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { element: NCElement, tokens: Seq[NlpToken], synonym: Synonym, - parts: Seq[TokType] + parts: Seq[TokType], + tokIdxs: Seq[Int] ) extends Ordered[ElementMatch] { // Tokens sparsity. lazy val sparsity: Int = U.calcSparsity(tokens.map(_.index)) @@ -203,6 +206,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { * @param syn * @param metaOpt * @param parts + * @param toksIdxs */ private def mark( ns: NCNlpSentence, @@ -211,13 +215,18 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { direct: Boolean, syn: Option[Synonym], metaOpt: Option[Map[String, Object]], - parts: Seq[TokType] + parts: Seq[TokType], + toksIdxs: Seq[Int] ): Unit = { val params = mutable.ArrayBuffer.empty[(String, AnyRef)] // For system elements. params += "direct" → direct.asInstanceOf[AnyRef] + val toksIdxsJava: JList[Int] = toksIdxs.asJava + + params += "allToksIndexes" → toksIdxsJava + syn match { case Some(s) ⇒ if (s.isValueSynonym) @@ -302,7 +311,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { new NCCustomElement() { override def getElementId: String = noteId - override def getWords: util.List[NCCustomWord] = words + override def getWords: JList[NCCustomWord] = words override def getMetadata: JavaMeta = md.map(p ⇒ p._1 → p._2.asInstanceOf[AnyRef]).asJava } }).asJava @@ -333,7 +342,8 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { direct = true, syn = None, metaOpt = Some(e.getMetadata.asScala), - parts = Seq.empty + parts = Seq.empty, + matchedToks.map(_.index) ) }) } @@ -369,7 +379,17 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { * @param toks * @param elemId */ - private def alreadyMarked(toks: Seq[NlpToken], elemId: String): Boolean = toks.forall(_.isTypeOf(elemId)) + private def alreadyMarked(toks: Seq[NlpToken], elemId: String): Boolean = + toks.forall(_.isTypeOf(elemId)) || + toks.flatten.exists(n ⇒ + n.noteType == elemId && + ( + n.dataOpt("allToksIndexes").asInstanceOf[Option[JList[Int]]] match { + case Some(idxs) ⇒ idxs.asScala.containsSlice(toks.map(_.index)) + case None ⇒ false + } + ) + ) /** * @@ -506,55 +526,29 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { * @param matches */ private def processMatches(ns: NCNlpSentence, mdlId: String, matches: Seq[ElementMatch]): Unit = { - // Scans by elements that are found with same tokens length. - // Inside, for each token we drop all non-optimized combinations. - // Example: - // 1. element's synonym - 'a b', isSparse 'true', isPermuteSynonyms 'true' - // 2. Request 'a b a b', - // Initially found 0-1, 1-2, 2-3, 0-3. - // 0-3 will be deleted because for 0 and 3 tokens best variants found for same element with same tokens length. - val matchesNorm = - matches. - flatMap(m ⇒ m.tokens.map(_ → m)). - groupBy { case (t, m) ⇒ (m.element.getId, m.length, t) }. - flatMap { case (_, seq) ⇒ - // Optimization by sparsity sum for each tokens set for one element found with same tokens count. - U.permute( - seq.groupBy { case (tok, _) ⇒ tok }. - map { case (_, seq) ⇒ seq.map { case (_, m) ⇒ m }.toList }.toList - ).minBy(_.map(_.sparsity).sum) - }. - toSeq. - distinct - - val matchCnt = matchesNorm.size - // TODO:matchesNorm // Add notes for all remaining (non-intersecting) matches. for ((m, idx) ← matches.zipWithIndex) { if (DEEP_DEBUG) logger.trace( - s"Model '$mdlId' element found (${idx + 1} of $matchCnt) [" + + s"Model '$mdlId' element found (${idx + 1} of ${matches.size}) [" + s"elementId=${m.element.getId}, " + s"synonym=${m.synonym}, " + s"tokens=${tokString(m.tokens)}" + s"]" ) - val elm = m.element - val syn = m.synonym - val tokIdxs = m.tokens.map(_.index) - val direct = syn.isDirect && (tokIdxs == tokIdxs.sorted) + val direct = m.synonym.isDirect && (tokIdxs == tokIdxs.sorted) // TODO: - if (!alreadyMarked(m.tokens, elm.getId)) { - mark(ns, elem = elm, toks = m.tokens, direct = direct, syn = Some(syn), metaOpt = None, parts = m.parts) + if (!alreadyMarked(m.tokens, m.element.getId)) { + mark(ns, m.element, m.tokens, direct, syn = Some(m.synonym), metaOpt = None, m.parts, m.tokIdxs) - println(s"SET: ${elm.getId}, m.tokens=${m.tokens.map(_.origText).mkString("|")}") + println(s"SET: ${m.element.getId}, m.tokens=${m.tokens.map(_.origText).mkString("|")}") } else - println(s"NOT SET: ${elm.getId}, m.tokens=${m.tokens.map(_.origText).mkString("|")}") + println(s"NOT SET: ${m.element.getId}, m.tokens=${m.tokens.map(_.origText).mkString("|")}") } } @@ -571,7 +565,9 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { startScopedSpan("synsProc", span, "srvReqId" → srvReqId, "mdlId" → mdlId, "txt" → ns.text) { _ ⇒ var state = if (ns.firstProbePhase) SIMPLE else DSL_NEXT + ns.firstProbePhase = false + val combosToks = combos(ns) def go(): Unit = { @@ -590,7 +586,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { var added = false if (!matchExist(elm.getId, res)) { - matches += ElementMatch(elm, res, s, parts) + matches += ElementMatch(elm, res, s, parts, tokIdxs) added = true }
