This is an automated email from the ASF dual-hosted git repository. sergeykamov pushed a commit to branch NLPCRAFT-287 in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 1f47a982c25c0b82802b8881277bb51e1a6f3442 Author: Sergey Kamov <[email protected]> AuthorDate: Thu Apr 8 18:57:21 2021 +0300 WIP. --- .../apache/nlpcraft/probe/mgrs/NCProbeModel.scala | 9 +- .../probe/mgrs/deploy/NCDeployManager.scala | 15 +- .../nlpcraft/probe/mgrs/model/NCModelManager.scala | 24 ++- .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 161 ++++++++++----------- 4 files changed, 103 insertions(+), 106 deletions(-) diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala index 0e418b3..31fa627 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala @@ -30,8 +30,7 @@ import scala.collection.{Map, Seq} * @param intents * @param directSynonyms * @param sparseSynonyms - * @param directSynonymsDsl - * @param addStopWordsStems + * @param synonymsDsl * @param exclStopWordsStems * @param suspWordsStems * @param elements @@ -42,13 +41,13 @@ case class NCProbeModel( intents: Seq[NCIdlIntent], directSynonyms: Map[String /*Element ID*/ , Map[Int /*Synonym length*/ , NCProbeSynonymsWrapper]], // Fast access map. sparseSynonyms: Map[String /*Element ID*/, Seq[NCProbeSynonym]], - directSynonymsDsl: Map[String /*Element ID*/ , Seq[NCProbeSynonym]], // Fast access map. - sparseSynonymsDsl: Map[String /*Element ID*/ , Seq[NCProbeSynonym]], + synonymsDsl: Map[String /*Element ID*/ , Seq[NCProbeSynonym]], // Fast access map. addStopWordsStems: Set[String], exclStopWordsStems: Set[String], suspWordsStems: Set[String], elements: Map[String /*Element ID*/ , NCElement], samples: Set[(String, Seq[Seq[String]])] ) { - def hasDslSynonyms(elemId: String): Boolean = directSynonymsDsl.contains(elemId) || sparseSynonymsDsl.contains(elemId) + def hasDslSynonyms(elemId: String): Boolean = synonymsDsl.contains(elemId) + def hasDslSynonyms: Boolean = synonymsDsl.nonEmpty } diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala index aa3b99e..04ed091 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala @@ -197,8 +197,10 @@ object NCDeployManager extends NCService with DecorateAsScala { // TODO: Sparse for nonDSL def ok(b: Boolean, exp: Boolean): Boolean = if (exp) b else !b - def filter(dsl: Boolean, sparse: Boolean): Set[SynonymHolder] = - syns.toSet.filter(s ⇒ ok(s.syn.exists(_.kind == IDL), dsl) && ok(s.sparse && s.syn.size > 1, sparse)) + def filterDsl(syns: Set[SynonymHolder], dsl: Boolean): Set[SynonymHolder] = + syns.filter(s ⇒ ok(s.syn.exists(_.kind == IDL), dsl)) + def filterSparse(syns: Set[SynonymHolder], sparse: Boolean): Set[SynonymHolder] = + syns.filter(s ⇒ ok(s.sparse && s.syn.size > 1, sparse)) var cnt = 0 val maxCnt = mdl.getMaxTotalSynonyms @@ -506,14 +508,15 @@ object NCDeployManager extends NCService with DecorateAsScala { def toMap(set: Set[SynonymHolder]): Map[String, Seq[NCProbeSynonym]] = set.groupBy(_.elmId).map(p ⇒ p._1 → p._2.map(_.syn).toSeq.sortBy(-_.size)) + val notDsl = filterDsl(syns.toSet, dsl = false) + NCProbeModel( model = mdl, solver = solver, intents = intents.map(_._1).toSeq, - directSynonyms = mkFastAccessMap(filter(dsl = false, sparse = false), NCProbeSynonymsWrapper(_)), - sparseSynonyms = toMap(filter(dsl = false, sparse = true)), - directSynonymsDsl = toMap(filter(dsl = true, sparse = false)), - sparseSynonymsDsl = toMap(filter(dsl = true, sparse = true)), + directSynonyms = mkFastAccessMap(filterSparse(notDsl, sparse = false), NCProbeSynonymsWrapper(_)), + sparseSynonyms = toMap(filterSparse(notDsl, sparse = true)), + synonymsDsl = toMap(filterDsl(syns.toSet, dsl = true)), addStopWordsStems = addStopWords, exclStopWordsStems = exclStopWords, suspWordsStems = suspWords, diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala index 457bf35..ff0cb78 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala @@ -58,26 +58,24 @@ object NCModelManager extends NCService with DecorateAsScala { data.values.foreach(w ⇒ { val mdl = w.model - val synCnt = w.directSynonyms.flatMap(_._2.map(_._2.count)).sum - val synDslCnt = w.directSynonymsDsl.map(_._2.size).sum + val synDirectCnt = w.directSynonyms.flatMap(_._2.map(_._2.count)).sum val synSparseCnt = w.sparseSynonyms.map(_._2.size).sum - val synSparseDslCnt = w.sparseSynonymsDsl.map(_._2.size).sum + val synDslCnt = w.synonymsDsl.map(_._2.size).sum val elmCnt = w.elements.keySet.size val intentCnt = w.intents.size def withWarn(i: Int): String = if (i == 0) s"0 ${r("(!)")}" else i.toString tbl += Seq( - s"Name: ${bo(c(mdl.getName))}", - s"ID: ${bo(mdl.getId)}", - s"Version: ${mdl.getVersion}", - s"Origin: ${mdl.getOrigin}", - s"Elements: ${withWarn(elmCnt)}", - s"Synonyms(Direct) $synCnt", - s"Synonyms(Direct, DSL): $synDslCnt", - s"Synonyms(Sparse): $synSparseCnt", - s"Synonyms(Sparse, DSL): $synSparseDslCnt", - s"Intents: ${withWarn(intentCnt)}" + s"Name: ${bo(c(mdl.getName))}", + s"ID: ${bo(mdl.getId)}", + s"Version: ${mdl.getVersion}", + s"Origin: ${mdl.getOrigin}", + s"Elements: ${withWarn(elmCnt)}", + s"Synonyms(Direct) $synDirectCnt", + s"Synonyms(Sparse): $synSparseCnt", + s"Synonyms(DSL): $synDslCnt", + s"Intents: ${withWarn(intentCnt)}" ) }) } diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala index 0542174..5169afe 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala @@ -373,30 +373,31 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { /** * - * @param comb - * @param syn + * @param seq + * @param s */ - private def getPartsComplex(comb: Seq[Complex], syn: Synonym): Seq[TokType] = - comb.zip(syn.map(_.kind)).flatMap { + private def toPartsComplex(seq: Seq[Complex], s: Synonym): Seq[TokType] = + seq.zip(s.map(_.kind)).flatMap { case (complex, kind) ⇒ if (complex.isToken) Some(complex.token → kind) else None } /** * - * @param comb - * @param syn + * @param seq + * @param s */ - private def toParts(comb: Seq[NCDslContent], syn: Synonym): Seq[TokType] = - comb.zip(syn.map(_.kind)).flatMap { + private def toParts(seq: Seq[NCDslContent], s: Synonym): Seq[TokType] = + seq.zip(s.map(_.kind)).flatMap { case (complex, kind) ⇒ if (complex.isLeft) Some(complex.left.get → kind) else None } /** * */ - private def mkCache(): Cache = - mutable.HashMap.empty[String, mutable.ArrayBuffer[Seq[Int]]].withDefault(_ ⇒ mutable.ArrayBuffer.empty[Seq[Int]]) + private def mkCache(mdl: NCProbeModel): Cache = + mutable.HashMap.empty[String, mutable.ArrayBuffer[Seq[Int]]].empty ++ + mdl.elements.keys.map(k ⇒ k → mutable.ArrayBuffer.empty[Seq[Int]]) /** * @@ -478,24 +479,21 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { * @param h * @param toks */ - private def mkComplexCombinations(h: ComplexHolder, toks: Seq[NlpToken]): Seq[Seq[Complex]] = { - val idxsSeq = toks.flatMap(_.wordIndexes) -// val idxsSorted = idxsSeq.sorted - val idxs = idxsSeq.toSet -// val idxMin = idxsSorted.head -// val idxMax = idxsSorted.last + private def mkComplexCombinations(h: ComplexHolder, toks: Seq[NlpToken], cache: Set[Seq[Complex]]): Seq[Seq[Complex]] = { + val idxs = toks.flatMap(_.wordIndexes).toSet h.complexes.par. flatMap(complexSeq ⇒ { //val rec = complexSeq.tokensComplexes.filter(_.isSubsetOf(idxMin, idxMax, idxs)) - val rec = complexSeq.tokensComplexes.filter(_.wordIndexes.exists(idxsSeq.contains)) + val rec = complexSeq.tokensComplexes.filter(_.wordIndexes.exists(idxs.contains)) // Drops without tokens (IDL part works with tokens). - if (rec.nonEmpty) - Some( - rec ++ + if (rec.nonEmpty) { + val data = rec ++ (complexSeq.wordsIndexes.intersect(idxs) -- rec.flatMap(_.wordIndexes)).map(h.complexesWords) - ) + + if (!cache.contains(data)) Some(data) else None + } else None }).seq @@ -569,31 +567,8 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { startScopedSpan("enrich", parent, "srvReqId" → srvReqId, "mdlId" → mdlId, "txt" → ns.text) { span ⇒ val req = NCRequestImpl(senMeta, srvReqId) - val matches = mutable.ArrayBuffer.empty[ElementMatch] - val cacheSparse = mkCache() - val cacheDirect = mkCache() val h = mkComplexes(mdl, ns) - var found = false - - def add(typ: String, elm: NCElement, cache: Cache, res: Seq[NlpToken], tokIdxs: Seq[Int], s: Synonym, parts: Seq[TokType]): Unit = { - val toksSet = res.toSet - - var added = false - - // TODO: - if (!matches.exists(m ⇒ m.element.getId == elm.getId && toksSet.subsetOf(m.tokensSet))) { - matches += ElementMatch(elm, res, s, parts) - - added = true - } - - cache(elm.getId) += tokIdxs - found = true - - println(s"ADDED: ${elm.getId}, type=$typ, res=${res.map(_.origText).mkString("|")}, tokIdxs=${tokIdxs.mkString("|")}, added=$added") - } - startScopedSpan("synsProc", span, "srvReqId" → srvReqId, "mdlId" → mdlId, "txt" → ns.text) { _ ⇒ var state = if (ns.firstProbePhase) SIMPLE else DSL_NEXT ns.firstProbePhase = false @@ -603,9 +578,36 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { println println(s"GO $state") + val matches = mutable.ArrayBuffer.empty[ElementMatch] + + val cacheSparse = mkCache(mdl) + val cacheDirect = mkCache(mdl) + val dslCache = mutable.HashSet.empty[Seq[Complex]] + + var found = false + + def add(typ: String, elm: NCElement, cache: Cache, res: Seq[NlpToken], tokIdxs: Seq[Int], s: Synonym, parts: Seq[TokType] = Seq.empty): Unit = { + var added = false + + if (!matchExist(elm.getId, res)) { + matches += ElementMatch(elm, res, s, parts) + + added = true + } + + cache(elm.getId) += tokIdxs + found = true + + println(s"ADDED: ${elm.getId}, type=$typ, res=${res.map(_.origText).mkString("|")}, toks=${tokIdxs.mkString("|")}, added=$added") + } + + // TODO: + def matchExist(elemId: String, toks: Seq[NlpToken]): Boolean = + matches.exists(m ⇒ m.element.getId == elemId && toks.toSet.subsetOf(m.tokensSet)) + for (toks ← combosToks) { val tokIdxs = toks.map(_.index) - lazy val dslCombs: Seq[Seq[Complex]] = mkComplexCombinations(h, toks) + lazy val dslCombs: Seq[Seq[Complex]] = mkComplexCombinations(h, toks, dslCache.toSet) lazy val tokStems = toks.map(_.stem).mkString(" ") // Attempt to match each element. @@ -613,12 +615,11 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { for ( elm ← mdl.elements.values; elemId = elm.getId; - if - !alreadyMarked(toks, elm.getId) + dirProc = cacheDirect(elemId).exists(_.containsSlice(tokIdxs)); + sparseProc = cacheSparse(elemId).exists(_.containsSlice(tokIdxs)) + if (!dirProc || !sparseProc) && !alreadyMarked(toks, elemId) && !matchExist(elemId, toks) ) { - val directProc = cacheDirect(elemId).exists(_.containsSlice(tokIdxs)) - val sparseProc = cacheSparse(elemId).exists(_.containsSlice(tokIdxs)) - + //println(s"State=$elemId, dirProc=$dirProc, sparseProc=$sparseProc, cacheSparse(elemId)="+cacheSparse(elemId).mkString("|")) // 1. SIMPLE. found = false @@ -630,19 +631,19 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { } // 1.1 Direct. - if (simpleEnabled && !directProc && !found) + if (simpleEnabled && !dirProc && !found) fastAccess(mdl.directSynonyms, elemId, toks.length) match { case Some(h) ⇒ def tryMap(syns: Map[String, Synonym], notFound: () ⇒ Unit): Unit = syns.get(tokStems) match { - case Some(s) ⇒ add("direct simple", elm, cacheDirect, toks, tokIdxs, s, Seq.empty) + case Some(s) ⇒ add("direct simple", elm, cacheDirect, toks, tokIdxs, s) case None ⇒ notFound() } def tryScan(syns: Seq[Synonym]): Unit = for (s ← syns if !found) if (s.isMatch(toks)) - add("direct simple2", elm, cacheDirect, toks, tokIdxs, s, Seq.empty) + add("direct simple2", elm, cacheDirect, toks, tokIdxs, s) tryMap( h.txtDirectSynonyms, @@ -660,34 +661,37 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { if (simpleEnabled && !sparseProc && !found) for (s ← get(mdl.sparseSynonyms, elemId) if !found) s.trySparseMatch(toks) match { - case Some(res) ⇒ add("sparse simple", elm, cacheSparse, res, tokIdxs, s, Seq.empty) + case Some(res) ⇒ add("sparse simple", elm, cacheSparse, res, tokIdxs, s) case None ⇒ // No-op. } // 2. DSL. - found = false val dslEnabled = state != SIMPLE - // 2.1 Direct. - if (dslEnabled && mdl.directSynonymsDsl.nonEmpty && !directProc && !found) - for (s ← get(mdl.directSynonymsDsl, elemId); comb ← dslCombs if !found) { - if (s.isMatch(comb.map(_.data), req)) { - println(s"OK $elemId for s=$s for toks:${toks.map(_.origText)}") - - add("direct DSL", elm, cacheDirect, toks, tokIdxs, s, getPartsComplex(comb, s)) - } - println { - println(s"NOT OK $elemId for s=$s for toks:${toks.map(_.origText)}") - } + if (dslEnabled && mdl.synonymsDsl.nonEmpty) { + found = false + + // 2.1 Sparse. + if (mdl.hasDslSynonyms) { + if (!sparseProc) + for (s ← get(mdl.synonymsDsl, elemId); comb ← dslCombs if !found) + s.trySparseMatch(comb.map(_.data), req) match { + case Some(res) ⇒ + add("DSL", elm, cacheSparse, toTokens(res, ns), tokIdxs, s, toParts(res, s)) + dslCache += comb + case None ⇒ // No-op. + } } - - // 2.2 Sparse. - if (dslEnabled && mdl.sparseSynonymsDsl.nonEmpty && !sparseProc && !found) - for (s ← get(mdl.sparseSynonymsDsl, elemId); comb ← dslCombs if !found) - s.trySparseMatch(comb.map(_.data), req) match { - case Some(res) ⇒ add("sparse DSL", elm, cacheSparse, toTokens(res, ns), tokIdxs, s, toParts(res, s)) - case None ⇒ // No-op. - } + // 2.2 Direct. + else { + if (!dirProc) + for (s ← get(mdl.synonymsDsl, elemId); comb ← dslCombs if !found) + if (s.isMatch(comb.map(_.data), req)) { + add("direct DSL", elm, cacheDirect, toks, tokIdxs, s, toPartsComplex(comb, s)) + dslCache += comb + } + } + } } } @@ -701,18 +705,11 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { go() } - - } - - processParsers(mdl, ns, span, req) } } - def isComplex(mdl: NCProbeModel): Boolean = - mdl.directSynonymsDsl.nonEmpty || - mdl.sparseSynonymsDsl.nonEmpty || - !mdl.model.getParsers.isEmpty + def isComplex(mdl: NCProbeModel): Boolean = mdl.synonymsDsl.nonEmpty || !mdl.model.getParsers.isEmpty } \ No newline at end of file
