This is an automated email from the ASF dual-hosted git repository. sergeykamov pushed a commit to branch NLPCRAFT-443-1 in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 312fabf6420cfffcb5e89f1a432d45f198719a20 Author: Sergey Kamov <[email protected]> AuthorDate: Tue Sep 21 12:22:57 2021 +0300 WIP. --- .../cargps/src/main/resources/cargps_model.yaml | 2 +- .../nlpcraft/probe/mgrs/NCProbeSynonym.scala | 51 +++++++--- .../nlpcraft/probe/mgrs/NCProbeVariants.scala | 30 ++++++ .../probe/mgrs/nlp/NCProbeEnrichmentManager.scala | 4 +- .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 20 +++- .../mgrs/nlp/enrichers/model/NCSentenceCache.scala | 110 +++++++++++++++++++++ .../probe/mgrs/sentence/NCSentenceManager.scala | 40 +++++++- 7 files changed, 238 insertions(+), 19 deletions(-) diff --git a/nlpcraft-examples/cargps/src/main/resources/cargps_model.yaml b/nlpcraft-examples/cargps/src/main/resources/cargps_model.yaml index cd5fb4e..62f45c8 100644 --- a/nlpcraft-examples/cargps/src/main/resources/cargps_model.yaml +++ b/nlpcraft-examples/cargps/src/main/resources/cargps_model.yaml @@ -60,7 +60,7 @@ elements: - id: "x:addr:st" greedy: false synonyms: - - "{//[a-zA-Z0-9]+//}[1,3]" + - "{^^{is_alphanum(tok_txt) && tok_is_between_ids('x:addr:num', 'x:addr:kind') == true}^^}[1,3]" - id: "x:addr" synonyms: diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala index 809c4e5..e324857 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala @@ -20,9 +20,10 @@ package org.apache.nlpcraft.probe.mgrs import org.apache.nlpcraft.common.U import org.apache.nlpcraft.common.nlp.NCNlpSentenceToken import org.apache.nlpcraft.model._ -import org.apache.nlpcraft.model.intent.NCIdlContext -import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCIdlContent +import org.apache.nlpcraft.model.intent.{NCIdlContext, NCIdlFunction} +import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.{NCIdlContent, saveIdl} import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind._ +import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager import scala.collection.mutable @@ -146,8 +147,11 @@ class NCProbeSynonym( * @param tow * @param chunk * @param req + * @param variantsToks */ - private def isMatch(tow: NCIdlContent, chunk: NCProbeSynonymChunk, req: NCRequest): Boolean = { + private def isMatch( + tow: NCIdlContent, chunk: NCProbeSynonymChunk, req: NCRequest, variantsToks: Seq[Seq[NCToken]] + ): Boolean = { def get0[T](fromToken: NCToken => T, fromWord: NCNlpSentenceToken => T): T = if (tow.isLeft) fromToken(tow.swap.toOption.get) else fromWord(tow.toOption.get) @@ -160,7 +164,20 @@ class NCProbeSynonym( r.matcher(get0(_.origText, _.origText)).matches() || r.matcher(get0(_.normText, _.normText)).matches() case IDL => - get0(t => chunk.idlPred.apply(t, NCIdlContext(req = req)).value.asInstanceOf[Boolean], _ => false) + val ok = + variantsToks.exists(variantToks => + get0(t => + chunk.idlPred.apply( + t, + NCIdlContext(req = req, toks = variantToks) + ).value.asInstanceOf[Boolean], _ => false + ) + ) + + if (ok) + saveIdl(req, tow.swap.toOption.get, chunk.idlPred) + + ok case _ => throw new AssertionError() } @@ -188,17 +205,20 @@ class NCProbeSynonym( * * @param tows * @param req + * @param variantsToks * @return */ - def isMatch(tows: Seq[NCIdlContent], req: NCRequest): Boolean = { + def isMatch(tows: Seq[NCIdlContent], req: NCRequest, variantsToks: Seq[Seq[NCToken]]): Boolean= { require(tows != null) if (tows.length == length && tows.count(_.isLeft) >= idlChunks) - tows.zip(this).sortBy(p => getSort(p._2.kind)).forall { case (tow, chunk) => isMatch(tow, chunk, req) } + tows.zip(this).sortBy(p => getSort(p._2.kind)).forall { + case (tow, chunk) => isMatch(tow, chunk, req, variantsToks) + } else false } - + /** * * @param toks @@ -214,15 +234,16 @@ class NCProbeSynonym( * * @param tows * @param req + * @param variantsToks */ - def sparseMatch(tows: Seq[NCIdlContent], req: NCRequest): Option[Seq[NCIdlContent]] = { + def sparseMatch(tows: Seq[NCIdlContent], req: NCRequest, variantsToks: Seq[Seq[NCToken]]): Option[Seq[NCIdlContent]] = { require(tows != null) require(req != null) require(hasIdl) sparseMatch0( tows, - (t: NCIdlContent, chunk: NCProbeSynonymChunk) => isMatch(t, chunk, req), + (t: NCIdlContent, chunk: NCProbeSynonymChunk) => isMatch(t, chunk, req, variantsToks), (t: NCIdlContent) => if (t.isLeft) t.swap.toOption.get.getStartCharIndex else t.toOption.get.startCharIndex, shouldBeNeighbors = !sparse ) @@ -340,9 +361,17 @@ object NCProbeSynonym { permute: Boolean ): NCProbeSynonym = { val syn = new NCProbeSynonym(isElementId, isValueName, isDirect, value, sparse, permute) - + syn ++= chunks - + syn } + + /** + * + * @param req + * @param tok + * @param idlPred + */ + def saveIdl(req: NCRequest, tok: NCToken, idlPred: NCIdlFunction): Unit = NCSentenceManager.saveIdl(req, tok, idlPred) } diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala index bcf2c9c..39f6969 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala @@ -22,6 +22,8 @@ import org.apache.nlpcraft.common.nlp.{NCNlpSentence => NlpSentence, NCNlpSenten import org.apache.nlpcraft.common.{NCE, TOK_META_ALIASES_KEY} import org.apache.nlpcraft.model.NCVariant import org.apache.nlpcraft.model.impl.{NCTokenImpl, NCTokenLogger, NCVariantImpl} +import org.apache.nlpcraft.model.intent.NCIdlContext +import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager import java.io.{Serializable => JSerializable} import java.util @@ -267,6 +269,34 @@ object NCProbeVariants { for ((tok, tokNlp) <- toks.zip(nlpSen) if tokNlp.isUser) process(tok, tokNlp) + if (ok) { + NCSentenceManager.getIdlData(srvReqId) match { + case Some((req, toksData)) => + ok = + toks.forall(t => + toksData.get((t, t.getId)) match { + case Some(f) => + val x = + f.apply( + t, + NCIdlContext(req = req, toks = toks.toSeq) + ).value.asInstanceOf[Boolean] + + + if (!x) + println("x="+x + ", t=" + t + ", toks=" + toks) + x + + + case None => true + } + ) + + case None => // No-op. + + } + } + if (ok) Some(new NCVariantImpl(toks.asJava)) else None }) diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala index 4b6c697..9af0c61 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala @@ -526,8 +526,6 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats { ) }) - NCSentenceManager.clearCache(srvReqId) - // Final validation before execution. try sensSeq.foreach(NCValidateManager.postValidate(mdl, _, span)) @@ -556,6 +554,8 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats { var senVars = NCProbeVariants.convert(srvReqId, mdl, sensSeq, lastPhase = true) + NCSentenceManager.clearCache(srvReqId) + // Sentence variants can be filtered by model. val fltSenVars: Seq[(NCVariant, Int)] = senVars. diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala index 9706c4c..7a11806 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala @@ -28,7 +28,7 @@ import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.NCSynonymChunkKind import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager -import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeVariants, NCTokenPartKey, NCProbeSynonym => Synonym} +import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeVariants, NCTokenPartKey, NCProbeSynonym => Synonym} import java.io.Serializable import java.util.{List => JList} @@ -526,8 +526,21 @@ object NCModelEnricher extends NCProbeEnricher { "enrich", parent, "srvReqId" -> ns.srvReqId, "mdlId" -> mdl.model.getId, "txt" -> ns.text ) { span => val req = NCRequestImpl(senMeta, ns.srvReqId) + val combToks = combosTokens(ns.toSeq) lazy val ch = mkComplexes(mdl, ns) + lazy val variantsToks = + ch.complexes.map(p => p.tokensComplexes.map(p => + if (p.isToken) + p.token + else { + // TODO: everywhere + val clone = p.word.clone() + + clone.filter(!_.isNlp).foreach(clone.remove) + + NCTokenImpl(mdl, ns.srvReqId, clone) + })) def execute(simpleEnabled: Boolean, idlEnabled: Boolean): Unit = startScopedSpan( @@ -603,6 +616,7 @@ object NCModelEnricher extends NCProbeEnricher { val allSyns = get(mdl.idlSynonyms, eId) lazy val allCombs = mkCombinations(ch, toks, idlCache) + // 2.1 Continuous. if (!mdl.hasSparseSynonyms) { var found = false @@ -613,7 +627,7 @@ object NCModelEnricher extends NCProbeEnricher { if !found; data = comb.map(_.data) ) - if (s.isMatch(data, req)) { + if (s.isMatch(data, req, variantsToks)) { val parts = toParts(mdl, ns.srvReqId, data, s) add("IDL continuous", ns, contCache, eId, greedy, toksExt, idxs, s, parts) @@ -629,7 +643,7 @@ object NCModelEnricher extends NCProbeEnricher { s <- allSyns; comb <- allCombs ) - s.sparseMatch(comb.map(_.data), req) match { + s.sparseMatch(comb.map(_.data), req, variantsToks) match { case Some(res) => val typ = if (s.sparse) "IDL sparse" else "IDL continuous" diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCSentenceCache.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCSentenceCache.scala new file mode 100644 index 0000000..e5b6e3e --- /dev/null +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCSentenceCache.scala @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model + +import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken => NlpToken} +import org.apache.nlpcraft.probe.mgrs.{NCProbeSynonym => Synonym} +import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.{NCIdlContent => IdlToken} +import org.apache.nlpcraft.model.NCRequest +import scala.collection.mutable + +class NCSentenceCache { +// case class Key(elemId: String, indexes: Seq[Int]) +// case class Value[T](synonym: Synonym, result: Seq[T]) +// +// val cacheToks = mutable.HashMap.empty[Key, mutable.HashMap[Seq[Int], Value[NlpToken]]] +// val cacheIdl = mutable.HashMap.empty[Key, mutable.HashMap[Seq[Int], Value[IdlToken]]] +// +// var cacheHits = 0 +// var cacheCnt = 0 +// var time = 0L +// +// private def process[T]( +// elemId: String, +// elemSyns: Seq[Synonym], +// toks: Seq[T], +// extract: (Synonym, Seq[T]) => Option[Seq[T]], +// cache: mutable.Map[Key, mutable.HashMap[Seq[Int], Value[T]]], +// getIndex: T => Int, +// callback: (Synonym, Seq[T]) => Unit +// ): Unit = { +// val t = System.currentTimeMillis() +// +// val hash = toks.map(getIndex) +// val key = Key(elemId, hash) +// +// cacheCnt += 1 +// +// cache.get(key) match { +// case Some(data) => +// cacheHits += 1 +// data.get(hash) match { +// case Some(v) => callback(v.synonym, v.result) +// case None => // No-op. +// } +// case None => +// // mutable.HashMap.empty[Key[IdlToken], Map[Seq[IdlToken], Value[IdlToken]]] +// val hit = mutable.HashMap.empty[Seq[Int], Value[T]] +// +// for (s <- elemSyns) +// extract(s, toks) match { +// case Some(res) => +// callback(s, res) +// hit += hash -> Value(s, res) +// case None => // No-op. +// } +// +// cache += key -> hit +// } +// +// time += (System.currentTimeMillis() - t) +// } +// +// def processSparseTokens( +// elemId: String, +// elemSyns: Seq[Synonym], +// toks: Seq[NlpToken], +// callback: (Synonym, Seq[NlpToken]) => Unit +// ): Unit = +// process( +// elemId, +// elemSyns, +// toks, +// (s: Synonym, toks: Seq[NlpToken]) => s.sparseMatch(toks), +// cacheToks, +// (t: NlpToken) => t.index, +// callback +// ) +// +// def processSparseIdl( +// elemId: String, +// req: NCRequest, +// elemSyns: Seq[Synonym], +// toks: Seq[IdlToken], +// callback: (Synonym, Seq[IdlToken]) => Unit +// ): Unit = +// process( +// elemId, +// elemSyns, +// toks, +// (s: Synonym, toks: Seq[IdlToken]) => s.sparseMatch(toks, req), +// cacheIdl, +// (t: IdlToken) => if (t.isRight) t.toOption.get.index else t.swap.toOption.get.getIndex, +// callback +// ) +} \ No newline at end of file diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala index ee8b719..b0a077a 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala @@ -22,7 +22,8 @@ import org.apache.nlpcraft.common.nlp.NCNlpSentence.NoteLink import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken} import org.apache.nlpcraft.common.{NCE, NCService, U, _} -import org.apache.nlpcraft.model.NCModel +import org.apache.nlpcraft.model.intent.NCIdlFunction +import org.apache.nlpcraft.model.{NCModel, NCRequest, NCToken} import org.apache.nlpcraft.probe.mgrs.NCTokenPartKey import java.io.{Serializable => JSerializable} @@ -43,6 +44,9 @@ object NCSentenceManager extends NCService { type CacheValue = Seq[Seq[NCNlpSentenceNote]] private val combCache = mutable.HashMap.empty[String, mutable.HashMap[CacheKey, CacheValue]] + type IdlCacheKey = (NCToken, String) + private val reqCache = mutable.HashMap.empty[String, NCRequest] + private val idlCache = mutable.HashMap.empty[String, mutable.HashMap[IdlCacheKey, NCIdlFunction]] /** * @@ -818,5 +822,37 @@ object NCSentenceManager extends NCService { * * @param srvReqId */ - def clearCache(srvReqId: String): Unit = combCache -= srvReqId + def clearCache(srvReqId: String): Unit = { + combCache -= srvReqId + reqCache -= srvReqId + idlCache -= srvReqId + } + + def saveIdl(req: NCRequest, tok: NCToken, idlPred: NCIdlFunction): Unit = { + val srvReqId = req.getServerRequestId + + reqCache += srvReqId -> req + + val idlCacheReq: mutable.Map[IdlCacheKey, NCIdlFunction] = + idlCache.get(srvReqId) match { + case Some(m) => m + case None => + val m = mutable.HashMap.empty[IdlCacheKey, NCIdlFunction] + + idlCache += srvReqId -> m + + m + } + + idlCacheReq += (tok, tok.getId) -> idlPred + } + + def getIdlData(srvReqId: String) : Option[(NCRequest, Map[IdlCacheKey, NCIdlFunction])] = { + val reqData = reqCache.get(srvReqId) + val idlData = idlCache.get(srvReqId) + + require(reqData.isDefined && idlData.isDefined || reqData.isEmpty && idlData.isEmpty) + + if (reqData.isDefined) Some((reqData.get, idlData.get.toMap)) else None + } } \ No newline at end of file
