This is an automated email from the ASF dual-hosted git repository. sergeykamov pushed a commit to branch NLPCRAFT-287 in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit c4d2d15cb4ff94c96105cedfb12a70e4845dd113 Author: Sergey Kamov <[email protected]> AuthorDate: Tue Apr 6 11:25:59 2021 +0300 WIP. --- .../nlpcraft/probe/mgrs/NCProbeSynonym.scala | 51 ++++++----- .../nlpcraft/probe/mgrs/model/NCModelManager.scala | 22 ++--- .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 100 ++++++++++----------- .../probe/mgrs/sentence/NCSentenceManager.scala | 22 +++-- 4 files changed, 106 insertions(+), 89 deletions(-) diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala index 5324304..95c526f 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala @@ -92,37 +92,44 @@ class NCProbeSynonym( require(toks != null) require(toks.nonEmpty) - lazy val res = mutable.ArrayBuffer.empty[T] - lazy val all = mutable.HashSet.empty[T] + if (toks.size >= this.size) { + lazy val res = mutable.ArrayBuffer.empty[T] + lazy val all = mutable.HashSet.empty[T] - var state = 0 + var state = 0 - for (chunk ← this if state != -1) { - val seq = - if (state == 0) { - state = 1 + for (chunk ← this if state != -1) { + val seq = + if (state == 0) { + state = 1 - toks.filter(t ⇒ isMatch(t, chunk)) - } - else - toks.filter(t ⇒ !res.contains(t) && isMatch(t, chunk)) + toks.filter(t ⇒ isMatch(t, chunk)) + } + else + toks.filter(t ⇒ !res.contains(t) && isMatch(t, chunk)) - if (seq.nonEmpty) { - val head = seq.head + if (seq.nonEmpty) { + val head = seq.head - if (!perm && res.nonEmpty && getIndex(head) <= getIndex(res.last)) - state = -1 - else { - res += head - all ++= seq + if (!perm && res.nonEmpty && getIndex(head) <= getIndex(res.last)) + state = -1 + else { + res += head + all ++= seq + + if (all.size > res.size) + state = -1 + } } + else + state = -1 } + + if (state != -1 && all.size == res.size) + Some(res) else - state = -1 + None } - - if (state != -1 && all.size == res.size) - Some(res) else None } diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala index cdfdf89..03c59ff 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala @@ -65,17 +65,19 @@ object NCModelManager extends NCService with DecorateAsScala { val elmCnt = w.elements.keySet.size val intentCnt = w.intents.size + def getWithWarning(i: Int): String = if (i == 0) s"0 ${r("(!)")}" else i.toString + tbl += Seq( - s"Name: ${bo(c(mdl.getName))}", - s"ID: ${bo(mdl.getId)}", - s"Version: ${mdl.getVersion}", - s"Origin: ${mdl.getOrigin}", - s"Elements: $elmCnt" + (if (elmCnt == 0) s" ${r("(!)")}" else ""), - s"Synonyms: $synCnt" + (if (synCnt == 0) s" ${r("(!)")}" else ""), - s"Synonyms(DSL): $synDslCnt" + (if (synDslCnt == 0) s" ${r("(!)")}" else ""), - s"Synonyms(Sparse): $synSparseCnt" + (if (synSparseCnt == 0) s" ${r("(!)")}" else ""), - s"Synonyms(Sparse, DSL): $synSparseDslCnt" + (if (synSparseDslCnt == 0) s" ${r("(!)")}" else ""), - s"Intents: $intentCnt" + (if (intentCnt == 0) s" ${r("(!)")}" else "") + s"Name: ${bo(c(mdl.getName))}", + s"ID: ${bo(mdl.getId)}", + s"Version: ${mdl.getVersion}", + s"Origin: ${mdl.getOrigin}", + s"Elements: ${getWithWarning(elmCnt)}", + s"Synonyms(Continuous) $synCnt", + s"Synonyms(Continuous, DSL): $synDslCnt", + s"Synonyms(Sparse): $synSparseCnt", + s"Synonyms(Sparse, DSL): $synSparseDslCnt", + s"Intents: ${getWithWarning(intentCnt)}" ) }) } diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala index 46506fd..f9acd95 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala @@ -296,11 +296,11 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { else None } - private def mkCache(): mutable.Map[String, ArrayBuffer[Seq[NlpToken]]] = + private def mkCache(): mutable.Map[String, ArrayBuffer[Seq[Int]]] = mutable.HashMap.empty[ String, - mutable.ArrayBuffer[Seq[NlpToken]] - ].withDefault(_ ⇒ mutable.ArrayBuffer.empty[Seq[NlpToken]]) + mutable.ArrayBuffer[Seq[Int]] + ].withDefault(_ ⇒ mutable.ArrayBuffer.empty[Seq[Int]]) private def convert(tows: Seq[NCDslContent], ns: NCNlpSentence): Seq[NlpToken] = ( @@ -388,15 +388,17 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { ) { _ ⇒ for (toks ← combos(ns)) { - val idxsSeq = toks.flatMap(tokIdxs) - val idxsSorted = idxsSeq.sorted - val idxs = idxsSeq.toSet - val idxMin = idxsSorted.head - val idxMax = idxsSorted.last + val indexes = toks.map(_.index) - lazy val sorted = idxsSorted.zipWithIndex.toMap + lazy val dslCombs: Map[Int, Seq[Seq[Complex]]] = { + val idxsSeq = toks.flatMap(tokIdxs) + val idxsSorted = idxsSeq.sorted + val idxs = idxsSeq.toSet + val idxMin = idxsSorted.head + val idxMax = idxsSorted.last + + lazy val sorted = idxsSorted.zipWithIndex.toMap - lazy val dslCombs: Map[Int, Seq[Seq[Complex]]] = complexes.par. flatMap(complexSeq ⇒ { val rec = complexSeq.tokensComplexes.filter(_.isSubsetOf(idxMin, idxMax, idxs)) @@ -412,54 +414,41 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { None }). map(_.sortBy(p ⇒ sorted(p.wordIndexes.head))).seq.groupBy(_.length) + } lazy val tokStems = toks.map(_.stem).mkString(" ") // Attempt to match each element. for (elm ← mdl.elements.values) { val elemId = elm.getId - val sparseEnabled = !cacheSparse(elemId).exists(_.contains(toks)) - val notSparseEnabled = !cacheNotSparse(elemId).exists(_.contains(toks)) - var foundSparse = false - var foundNotSparse = false + val sparseEnabled = !cacheSparse(elemId).exists(_.containsSlice(indexes)) + val notSparseEnabled = !cacheNotSparse(elemId).exists(_.containsSlice(indexes)) + var found = false def addSparse(res: Seq[NlpToken], syn: NCProbeSynonym, parts: Seq[TokenData]): Unit = { addMatch(elm, res, syn, parts) - cacheSparse(elemId) += toks - foundSparse = true + cacheSparse(elemId) += indexes + found = true } def addNotSparse(syn: NCProbeSynonym, parts: Seq[TokenData]): Unit = { addMatch(elm, toks, syn, parts) - cacheNotSparse(elemId) += toks - foundNotSparse = true + cacheNotSparse(elemId) += indexes + found = true } - - // 1. Simple, sparse. - if (firstPhase && sparseEnabled) - for (syn ← mdl.sparseSynonyms.getOrElse(elemId, Seq.empty) if !foundSparse) - syn.trySparseMatch(toks) match { - case Some(res) ⇒ addSparse(res, syn, Seq.empty) - case None ⇒ // No-op. - } - - // 2. Simple, not sparse. - // Optimization - plain synonyms can be used only on first iteration - if (firstPhase && notSparseEnabled) + // 1. Simple, not sparse. + if (firstPhase && notSparseEnabled && !found) fastAccess(mdl.nonSparseSynonyms, elemId, toks.length) match { case Some(h) ⇒ def tryMap(synsMap: Map[String, NCProbeSynonym], notFound: () ⇒ Unit): Unit = synsMap.get(tokStems) match { case Some(syn) ⇒ addNotSparse(syn, Seq.empty) - // TODO: - //if (!found) - // notFound() case None ⇒ notFound() } def tryScan(synsSeq: Seq[NCProbeSynonym]): Unit = - for (syn ← synsSeq if !foundNotSparse) + for (syn ← synsSeq if !found) if (syn.isMatch(toks)) addNotSparse(syn, Seq.empty) @@ -468,7 +457,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { () ⇒ { tryScan(h.notTxtDirectSynonyms) - if (!foundNotSparse) + if (!found) tryMap( h.txtNotDirectSynonyms, () ⇒ tryScan(h.notTxtNotDirectSynonyms) @@ -478,30 +467,38 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { case None ⇒ // No-op. } - // 3. DSL, sparse. - if (sparseEnabled) - for ( - (_, seq) ← dslCombs; - syn ← mdl.sparseSynonymsDsl.getOrElse(elemId, Seq.empty); - comb ← seq if !foundSparse - ) { - syn.trySparseMatch(comb.map(_.data), req) match { - case Some(towsRes) ⇒ addSparse(convert(towsRes, ns), syn, getPartsContent(towsRes, syn)) - case None ⇒ // No-op. - } - } - - // 4. DSL, non sparse. - if (notSparseEnabled) { + // 2. DSL, non sparse. + if (notSparseEnabled && mdl.nonSparseSynonymsDsl.nonEmpty && !found) { for ( (len, seq) ← dslCombs; syn ← fastAccess(mdl.nonSparseSynonymsDsl, elemId, len).getOrElse(Seq.empty); - comb ← seq if !foundNotSparse + comb ← seq if !found ) { if (syn.isMatch(comb.map(_.data), req)) addNotSparse(syn, getPartsComplex(comb, syn)) } } + + // 3. Simple, sparse. + if (firstPhase && sparseEnabled && !found) + for (syn ← mdl.sparseSynonyms.getOrElse(elemId, Seq.empty) if !found) + syn.trySparseMatch(toks) match { + case Some(res) ⇒ addSparse(res, syn, Seq.empty) + case None ⇒ // No-op. + } + + // 4. DSL, sparse. + if (sparseEnabled && mdl.sparseSynonymsDsl.nonEmpty && !found) + for ( + syn ← mdl.sparseSynonymsDsl.getOrElse(elemId, Seq.empty); + (_, seq) ← dslCombs; + comb ← seq if !found + ) { + syn.trySparseMatch(comb.map(_.data), req) match { + case Some(towsRes) ⇒ addSparse(convert(towsRes, ns), syn, getPartsContent(towsRes, syn)) + case None ⇒ // No-op. + } + } } } } @@ -529,6 +526,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { val matchCnt = matchesNorm.size + // TODO:matchesNorm // Add notes for all remaining (non-intersecting) matches. for ((m, idx) ← matches.zipWithIndex) { diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala index ad66b8f..a938f59 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala @@ -24,9 +24,9 @@ import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSe import org.apache.nlpcraft.common.{NCE, NCService, U} import org.apache.nlpcraft.model.NCModel -import java.io.{Serializable ⇒ JSerializable} +import java.io.{Serializable => JSerializable} import java.util -import java.util.{List ⇒ JList} +import java.util.{List => JList} import scala.collection.JavaConverters.{asScalaBufferConverter, _} import scala.collection.{Map, Seq, mutable} import scala.language.implicitConversions @@ -37,6 +37,8 @@ import scala.language.implicitConversions object NCSentenceManager extends NCService { @volatile private var pool: java.util.concurrent.ForkJoinPool = _ + private val cache = U.mkLRUMap[Seq[Set[NCNlpSentenceNote]], util.List[util.List[NCNlpSentenceNote]]]("sentence-combinations-cache", 500) + case class PartKey(id: String, start: Int, end: Int) { require(start <= end) @@ -197,7 +199,7 @@ object NCSentenceManager extends NCService { * @param noteField * @param ns */ - private def fixNoteIndexesList(note: String, idxsField: String, noteField: String, ns: NCNlpSentence): Unit = { + private def fixNoteIndexesList(note: String, idxsField: String, noteField: String, ns: NCNlpSentence): Unit = ns.flatMap(_.getNotes(note)).foreach(rel ⇒ rel.dataOpt[JList[JList[Int]]](idxsField) match { case Some(idxsList) ⇒ @@ -211,7 +213,6 @@ object NCSentenceManager extends NCService { case None ⇒ // No-op. } ) - } /** * Copies token. @@ -679,14 +680,23 @@ object NCSentenceManager extends NCService { var sens = if (delCombs.nonEmpty) { - val toksByIdx = + val toksByIdx: Seq[Set[NCNlpSentenceNote]] = delCombs.flatMap(note ⇒ note.wordIndexes.map(_ → note)). groupBy { case (idx, _) ⇒ idx }. map { case (_, seq) ⇒ seq.map { case (_, note) ⇒ note }.toSet }. toSeq.sortBy(-_.size) + + var combs: JList[JList[NCNlpSentenceNote]] = cache.get(toksByIdx) + + if (combs == null) { + combs = NCSentenceHelper.findCombinations(toksByIdx.map(_.asJava).asJava, pool) + + cache.put(toksByIdx, combs) + } + val seqSens = - NCSentenceHelper.findCombinations(toksByIdx.map(_.asJava).asJava, pool).asScala.map(_.asScala). + combs.asScala.map(_.asScala). par. flatMap(delComb ⇒ { val nsClone = sen.clone()
