This is an automated email from the ASF dual-hosted git repository. sergeykamov pushed a commit to branch NLPCRAFT-443-1 in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 267e82f5f9b888c5080e272fdd23d8db4e600aeb Author: Sergey Kamov <[email protected]> AuthorDate: Thu Sep 23 08:54:32 2021 +0300 WIP. --- .../apache/nlpcraft/common/nlp/NCNlpSentence.scala | 12 ++++ .../nlpcraft/common/nlp/NCNlpSentenceNote.scala | 9 +-- .../nlpcraft/common/nlp/NCNlpSentenceToken.scala | 12 +--- .../org/apache/nlpcraft/probe/NCProbeBoot.scala | 3 +- .../nlpcraft/probe/mgrs/NCProbeVariants.scala | 4 +- .../probe/mgrs/nlp/NCProbeEnrichmentManager.scala | 9 +-- .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 9 +-- .../probe/mgrs/sentence/NCSentenceManager.scala | 1 - .../{sentence => synonyms}/NCSynonymsManager.scala | 69 ++++++++++++---------- .../nlp/enrichers/NCServerEnrichmentManager.scala | 4 +- 10 files changed, 69 insertions(+), 63 deletions(-) diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala index 0f0b462..40f5da6 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala @@ -74,6 +74,18 @@ class NCNlpSentence( firstProbePhase = firstProbePhase ) + def copy(srvReqId: Option[String]): NCNlpSentence = + new NCNlpSentence( + srvReqId = srvReqId.getOrElse(this.srvReqId), + text = this.text, + enabledBuiltInToks = this.enabledBuiltInToks, + tokens = this.tokens, + deletedNotes = this.deletedNotes, + initNlpNotes = this.initNlpNotes, + nlpTokens = this.nlpTokens, + firstProbePhase = this.firstProbePhase + ) + /** * Utility method that gets set of notes for given note type collected from * tokens in this sentence. Notes are sorted in the same order they appear diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala index 63ae6ca..c457aa7 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala @@ -77,14 +77,11 @@ class NCNlpSentenceNote(private val values: Map[String, JSerializable]) extends indexes, Some(wordIndexes), noteType, - values.filter(p => !SKIP_CLONE.contains(p._1)).toSeq ++ params:_* + dataWithoutIndexes.toSeq ++ params:_* ) - override def clone(): NCNlpSentenceNote = { - val m = mutable.Map.empty[String, JSerializable] ++ values - - new NCNlpSentenceNote(m.toMap) - } + override def clone(): NCNlpSentenceNote = + new NCNlpSentenceNote((mutable.HashMap.empty[String, JSerializable] ++ values).toMap) /** * diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceToken.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceToken.scala index 4b94b98..fa9cbe6 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceToken.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceToken.scala @@ -67,17 +67,7 @@ case class NCNlpSentenceToken( * Shallow copy. */ def clone(index: Int): NCNlpSentenceToken = - NCNlpSentenceToken( - index, - { - val m = mutable.HashSet.empty[NCNlpSentenceNote] - - notes.foreach(n => m += n.clone()) - - m - }, - stopsReasons.clone() - ) + NCNlpSentenceToken(index, mutable.HashSet.empty[NCNlpSentenceNote] ++ notes.clone(), stopsReasons.clone()) /** * Clones note. diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala index 4df9f53..561860f 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala @@ -49,7 +49,8 @@ import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.sort.NCSortEnricher import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.stopword.NCStopWordEnricher import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.suspicious.NCSuspiciousNounsEnricher import org.apache.nlpcraft.probe.mgrs.nlp.validate.NCValidateManager -import org.apache.nlpcraft.probe.mgrs.sentence.{NCSentenceManager, NCSynonymsManager} +import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager +import org.apache.nlpcraft.probe.mgrs.synonyms.NCSynonymsManager import java.io._ import java.util.concurrent.CompletableFuture diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala index e876065..0596783 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala @@ -22,7 +22,7 @@ import org.apache.nlpcraft.common.nlp.{NCNlpSentence => NlpSentence, NCNlpSenten import org.apache.nlpcraft.common.{NCE, TOK_META_ALIASES_KEY} import org.apache.nlpcraft.model.NCVariant import org.apache.nlpcraft.model.impl.{NCTokenImpl, NCTokenLogger, NCVariantImpl} -import org.apache.nlpcraft.probe.mgrs.sentence.NCSynonymsManager +import org.apache.nlpcraft.probe.mgrs.synonyms.NCSynonymsManager import java.io.{Serializable => JSerializable} import java.util @@ -268,7 +268,7 @@ object NCProbeVariants { for ((tok, tokNlp) <- toks.zip(nlpSen) if tokNlp.isUser) process(tok, tokNlp) - ok = ok && NCSynonymsManager.isStillValid(srvReqId, toks.toSeq) + ok = ok && (!lastPhase || NCSynonymsManager.isStillValid(srvReqId, toks.toSeq)) if (ok) Some(new NCVariantImpl(toks.asJava)) else None }) diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala index 64049ac..20dc64d 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala @@ -43,7 +43,8 @@ import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.stopword.NCStopWordEnricher import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.suspicious.NCSuspiciousNounsEnricher import org.apache.nlpcraft.probe.mgrs.nlp.impl._ import org.apache.nlpcraft.probe.mgrs.nlp.validate._ -import org.apache.nlpcraft.probe.mgrs.sentence.{NCSentenceManager, NCSynonymsManager} +import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager +import org.apache.nlpcraft.probe.mgrs.synonyms.NCSynonymsManager import org.apache.nlpcraft.probe.mgrs.{NCProbeMessage, NCProbeVariants} import java.io.Serializable @@ -294,6 +295,9 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats { ): Unit = { require(errMsg.isDefined || (resType.isDefined && resBody.isDefined)) + NCSentenceManager.clearRequestData(srvReqId) + NCSynonymsManager.clearRequestData(srvReqId) + val msg = NCProbeMessage(msgName) msg.addData("srvReqId", srvReqId) @@ -554,9 +558,6 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats { var senVars = NCProbeVariants.convert(srvReqId, mdl, sensSeq, lastPhase = true) - NCSentenceManager.clearRequestData(srvReqId) - NCSynonymsManager.clearRequestData(srvReqId) - // Sentence variants can be filtered by model. val fltSenVars: Seq[(NCVariant, Int)] = senVars. diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala index 03c5b5d..c5ca532 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala @@ -27,7 +27,8 @@ import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCIdlContent import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.NCSynonymChunkKind import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl -import org.apache.nlpcraft.probe.mgrs.sentence.{NCSentenceManager, NCSynonymsManager} +import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager +import org.apache.nlpcraft.probe.mgrs.synonyms.NCSynonymsManager import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeVariants, NCTokenPartKey, NCProbeSynonym => Synonym} import java.io.Serializable @@ -535,11 +536,11 @@ object NCModelEnricher extends NCProbeEnricher { p.token else { // TODO: everywhere - val clone = p.word.clone() + val notes = mutable.HashSet.empty[NlpNote] - clone.filter(!_.isNlp).foreach(clone.remove) + notes += p.word.getNlpNote - NCTokenImpl(mdl, ns.srvReqId, clone) + NCTokenImpl(mdl, ns.srvReqId, NlpToken(p.word.index, notes, p.word.stopsReasons)) })) def execute(simpleEnabled: Boolean, idlEnabled: Boolean): Unit = diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala index 2e280ac..34c3f87 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala @@ -24,7 +24,6 @@ import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSe import org.apache.nlpcraft.common.{NCE, NCService, U, _} import org.apache.nlpcraft.model.NCModel import org.apache.nlpcraft.probe.mgrs.NCTokenPartKey -import org.apache.nlpcraft.probe.mgrs.sentence.NCSynonymsManager.{idlCache, reqCache} import java.io.{Serializable => JSerializable} import java.util diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSynonymsManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/synonyms/NCSynonymsManager.scala similarity index 85% rename from nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSynonymsManager.scala rename to nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/synonyms/NCSynonymsManager.scala index cf5eb5d..e9bf751 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSynonymsManager.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/synonyms/NCSynonymsManager.scala @@ -15,33 +15,28 @@ * limitations under the License. */ -package org.apache.nlpcraft.probe.mgrs.sentence +package org.apache.nlpcraft.probe.mgrs.synonyms import io.opencensus.trace.Span import org.apache.nlpcraft.common.nlp.NCNlpSentenceToken import org.apache.nlpcraft.common.{NCService, U} -import org.apache.nlpcraft.model._ import org.apache.nlpcraft.model.intent.{NCIdlContext, NCIdlFunction} +import org.apache.nlpcraft.model._ import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCIdlContent import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.{IDL, NCSynonymChunkKind, REGEX, TEXT} import org.apache.nlpcraft.probe.mgrs.{NCProbeSynonymChunk, NCProbeSynonym => Synonym} import scala.collection.mutable +import scala.jdk.CollectionConverters.ListHasAsScala /** * */ object NCSynonymsManager extends NCService { - case class Key(token: NCToken) { - // NCToken hashCode and equals based on indexes. // TODO: check it! - override def hashCode(): Int = U.mkJavaHash(token.getId, token) - override def equals(obj: Any): Boolean = obj match { - case key: Key => key.token.getId == token.getId && key.token == token - } - } case class Value(request: NCRequest, variants: Seq[Seq[NCToken]], predicate: NCIdlFunction) - private val idlCache = mutable.HashMap.empty[String, mutable.HashMap[Key, Value]] + // TODO: NCToken is not suitable key + private val idlCache = mutable.HashMap.empty[String, mutable.HashMap[NCToken, Value]] override def start(parent: Span): NCService = { ackStarting() @@ -149,8 +144,7 @@ object NCSynonymsManager extends NCService { * @param variantsToks */ private def save(req: NCRequest, tok: NCToken, pred: NCIdlFunction, variantsToks: Seq[Seq[NCToken]]): Unit = - idlCache.getOrElseUpdate(req.getServerRequestId, mutable.HashMap.empty) += - Key(tok) -> Value(req, variantsToks, pred) + idlCache.getOrElseUpdate(req.getServerRequestId, mutable.HashMap.empty) += tok -> Value(req, variantsToks, pred) /** * @@ -163,7 +157,8 @@ object NCSynonymsManager extends NCService { tow: NCIdlContent, chunk: NCProbeSynonymChunk, req: NCRequest, variantsToks: Seq[Seq[NCToken]] ): Boolean = { def get0[T](fromToken: NCToken => T, fromWord: NCNlpSentenceToken => T): T = - if (tow.isLeft) fromToken(tow.swap.toOption.get) else fromWord(tow.toOption.get) + if (tow.isLeft) fromToken(tow.swap.toOption.get) + else fromWord(tow.toOption.get) chunk.kind match { case TEXT => chunk.wordStem == get0(_.stem, _.stem) @@ -217,7 +212,7 @@ object NCSynonymsManager extends NCService { * @param req * @param variantsToks */ - def isMatch(s: Synonym, tows: Seq[NCIdlContent], req: NCRequest, variantsToks: Seq[Seq[NCToken]]): Boolean= { + def isMatch(s: Synonym, tows: Seq[NCIdlContent], req: NCRequest, variantsToks: Seq[Seq[NCToken]]): Boolean = { require(tows != null) if (tows.length == s.length && tows.count(_.isLeft) >= s.idlChunks) @@ -256,7 +251,8 @@ object NCSynonymsManager extends NCService { s, tows, (t: NCIdlContent, chunk: NCProbeSynonymChunk) => isMatch(t, chunk, req, variantsToks), - (t: NCIdlContent) => if (t.isLeft) t.swap.toOption.get.getStartCharIndex else t.toOption.get.startCharIndex, + (t: NCIdlContent) => if (t.isLeft) t.swap.toOption.get.getStartCharIndex + else t.toOption.get.startCharIndex, shouldBeNeighbors = !s.sparse ) } @@ -264,31 +260,40 @@ object NCSynonymsManager extends NCService { /** * * @param srvReqId - * @param toks + * @param sen * @return */ - def isStillValid(srvReqId: String, toks: Seq[NCToken]): Boolean = - toks.forall(tok => - idlCache.get(srvReqId) match { - case Some(m) => - m.get(Key(tok)) match { - case Some(v) => + def isStillValid(srvReqId: String, sen: Seq[NCToken]): Boolean = + idlCache.get(srvReqId) match { + case Some(m) => + lazy val allCheckedSenToks = { + val set = mutable.HashSet.empty[NCToken] + def add(t: NCToken): Unit = { + set += t - val x = - v.predicate.apply( - tok, NCIdlContext(req = v.request, toks = toks) - ).value.asInstanceOf[Boolean] + t.getPartTokens.asScala.foreach(add) + } + sen.foreach(add) - if (!x) - println("x="+x + ", t=" + tok + ", toks=" + toks) + set + } + + sen.forall(tok => + m.get(tok) match { + case Some(v) => + v.variants.exists(winHistVariant => + v.predicate.apply( + tok, NCIdlContext(toks = winHistVariant, req = v.request) + ).value.asInstanceOf[Boolean] && + winHistVariant.forall(allCheckedSenToks.contains) + ) - x case None => true - } - case None => true - }) + }) + case None => true + } /** * diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala index 03b749f..2f457cb 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala @@ -156,7 +156,7 @@ object NCServerEnrichmentManager extends NCService with NCIgniteInstance { if (h.enabledBuiltInTokens == normEnabledBuiltInToks) { prepareAsciiTable(h.sentence).info(logger, Some(s"Sentence enriched (from cache): '$normTxt'")) - h.sentence + h.sentence.copy(Some(U.genGuid())) } else process(srvReqId, normTxt, enabledBuiltInToks, span) @@ -224,7 +224,7 @@ object NCServerEnrichmentManager extends NCService with NCIgniteInstance { .getNotes(hdr.noteType) .filter(_.contains(hdr.noteName)) .map(note => { - val s = note(hdr.noteName).toString() + val s = note(hdr.noteName).toString if (isStopWord) s"${r(s)}" else s }) .toSeq
