This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-443 by this push:
new dd63a0a WIP.
dd63a0a is described below
commit dd63a0a68100ae464ae3b12e30357d7c81944946
Author: Sergey Kamov <[email protected]>
AuthorDate: Sat Sep 25 20:15:05 2021 +0300
WIP.
---
.../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 114 ++++++++-------
.../probe/mgrs/synonyms/NCSynonymsManager.scala | 162 +++++++++++++--------
2 files changed, 164 insertions(+), 112 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index bd7804b..ded7928 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -508,9 +508,9 @@ object NCModelEnricher extends NCProbeEnricher {
(toks, toksExt) <- combosTokens(ns.toSeq);
idxs = toks.map(_.index);
e <- mdl.elements.values;
- eId = e.getId;
+ elemId = e.getId;
greedy = e.isGreedy.orElse(mdl.model.isGreedy)
- if !greedy || !alreadyMarked(ns, eId, toks, idxs)
+ if !greedy || !alreadyMarked(ns, elemId, toks, idxs)
) {
def add(
dbgType: String,
@@ -521,16 +521,16 @@ object NCModelEnricher extends NCProbeEnricher {
val resIdxs = elemToks.map(_.index)
val ok =
-                (!greedy || !alreadyMarked(ns, eId, elemToks, idxs)) &&
-                ( parts.isEmpty || !parts.exists { case (t, _) => t.getId == eId })
+                (!greedy || !alreadyMarked(ns, elemId, elemToks, idxs)) &&
+                ( parts.isEmpty || !parts.exists { case (t, _) => t.getId == elemId })
if (ok)
-                mark(ns, eId, elemToks, direct = syn.isDirect && U.isIncreased(resIdxs), syn = Some(syn), parts = parts)
+                mark(ns, elemId, elemToks, direct = syn.isDirect && U.isIncreased(resIdxs), syn = Some(syn), parts = parts)
if (DEEP_DEBUG)
logger.trace(
s"${if (ok) "Added" else "Skipped"}
element [" +
- s"id=$eId, " +
+ s"id=$elemId, " +
s"type=$dbgType, " +
s"text='${elemToks.map(_.origText).mkString(" ")}', " +
s"indexes=${resIdxs.mkString("[", ",",
"]")}, " +
@@ -541,14 +541,14 @@ object NCModelEnricher extends NCProbeEnricher {
}
// 1. SIMPLE.
-            if (simpleEnabled && (if (idlEnabled) mdl.hasIdlSynonyms(eId) else !mdl.hasIdlSynonyms(eId))) {
+            if (simpleEnabled && (if (idlEnabled) mdl.hasIdlSynonyms(elemId) else !mdl.hasIdlSynonyms(elemId))) {
lazy val tokStems = toks.map(_.stem).mkString(" ")
// 1.1 Continuous.
var found = false
if (mdl.hasContinuousSynonyms)
-                    fastAccess(mdl.continuousSynonyms, eId, toks.length) match {
+                    fastAccess(mdl.continuousSynonyms, elemId, toks.length) match {
case Some(h) =>
                            def tryMap(syns: Map[String, Synonym], notFound: () => Unit): Unit =
syns.get(tokStems) match {
@@ -559,16 +559,17 @@ object NCModelEnricher extends NCProbeEnricher {
}
def tryScan(syns: Seq[Synonym]): Unit =
-                            for (
-                                s <- syns
-                                if
-                                    !found &&
-                                    NCSynonymsManager.isUnprocessedTokens(ns.srvReqId, eId, s, idxs)
-                            )
-                                if (NCSynonymsManager.isMatch(s, toks)) {
-                                    found = true
-                                    add("simple continuous scan", toksExt, s)
-                                }
+                            for (syn <- syns if !found)
+                                NCSynonymsManager.onMatch(
+                                    ns.srvReqId,
+                                    elemId,
+                                    syn,
+                                    toks,
+                                    _ => {
+                                        found = true
+                                        add("simple continuous scan", toksExt, syn)
+                                    }
+                                )
tryMap(
h.txtDirectSynonyms,
@@ -584,57 +585,60 @@ object NCModelEnricher extends NCProbeEnricher {
// 1.2 Sparse.
if (!found && mdl.hasSparseSynonyms)
-                    for (
-                        s <- get(mdl.sparseSynonyms, eId)
-                        if NCSynonymsManager.isUnprocessedTokens(ns.srvReqId, eId, s, idxs)
-                    )
-                        NCSynonymsManager.sparseMatch(s, toks) match {
-                            case Some(res) =>
-                                add("simple sparse", getSparsedTokens(res, toks), s)
-                            case None => // No-op.
-                        }
+                    for (syn <- get(mdl.sparseSynonyms, elemId))
+                        NCSynonymsManager.onSparseMatchTokens(
+                            ns.srvReqId,
+                            elemId,
+                            syn,
+                            toks,
+                            res => add("simple sparse", getSparsedTokens(res, toks), syn)
+                        )
}
// 2. IDL.
if (idlEnabled) {
-                val allSyns = get(mdl.idlSynonyms, eId)
+                val allSyns = get(mdl.idlSynonyms, elemId)
lazy val allCombs = mkCombinations(ch, toks)
// 2.1 Continuous.
if (!mdl.hasSparseSynonyms) {
var found = false
-                    for (
-                        s <- allSyns;
-                        comb <- allCombs;
-                        data = comb.map(_.data)
-                        if !found && NCSynonymsManager.isUnprocessedIdl(ns.srvReqId, eId, s, data)
-                    )
-                        if (NCSynonymsManager.isMatch(s, data, req, variantsToks)) {
-                            val parts = toParts(mdl, ns.srvReqId, data, s)
-
-                            add("IDL continuous", toksExt, s, parts)
-
-                            found = true
-                        }
+                    for (syn <- allSyns; comb <- allCombs; data = comb.map(_.data) if !found)
+                        NCSynonymsManager.onMatch(
+                            ns.srvReqId,
+                            elemId,
+                            syn,
+                            data,
+                            req,
+                            variantsToks,
+                            _ => {
+                                val parts = toParts(mdl, ns.srvReqId, data, syn)
+
+                                add("IDL continuous", toksExt, syn, parts)
+
+                                found = true
+                            }
+                        )
}
else
// 2.2 Sparse.
-                    for (
-                        s <- allSyns;
-                        comb <- allCombs;
-                        data = comb.map(_.data)
-                        if NCSynonymsManager.isUnprocessedIdl(ns.srvReqId, eId, s, data)
-                    )
-                        NCSynonymsManager.sparseMatch(s, data, req, variantsToks) match {
-                            case Some(res) =>
+                    for (syn <- allSyns; comb <- allCombs)
+                        NCSynonymsManager.onSparseMatch(
+                            ns.srvReqId,
+                            elemId,
+                            syn,
+                            comb.map(_.data),
+                            req,
+                            variantsToks,
+                            res => {
                                val toks = getSparsedTokens(toTokens(res, ns), toTokens(comb.map(_.data), ns))
-                                val parts = toParts(mdl, ns.srvReqId, res, s)
-                                val typ = if (s.sparse) "IDL sparse" else "IDL continuous"
+                                val parts = toParts(mdl, ns.srvReqId, res, syn)
+                                val typ = if (syn.sparse) "IDL sparse" else "IDL continuous"
-                                add(typ, toks, s, parts)
-                            case None => // No-op.
-                        }
+                                add(typ, toks, syn, parts)
+                            }
+                        )
}
}
}
@@ -652,7 +656,7 @@ object NCModelEnricher extends NCProbeEnricher {
processParsers(mdl, ns, span, req)
}
- NCSynonymsManager.clearRequestIterationData(ns.srvReqId)
+ NCSynonymsManager.clearIteration(ns.srvReqId)
normalize(ns)
}
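
The enricher-side change above replaces the old call pattern (guard with
NCSynonymsManager.isUnprocessedTokens/isUnprocessedIdl, test a boolean
isMatch/sparseMatch, then act on the result) with callback-style
onMatch/onSparseMatch calls that fold the dedup guard into the manager
itself. A minimal sketch of that control-flow shift, using hypothetical
stand-in types (Tok, Syn) and a plain equality match instead of the real
NCNlpSentenceToken/Synonym machinery:

    object CallbackMatchSketch extends App {
        type Tok = String
        type Syn = Seq[String]

        // Stand-in for the per-request cache that isUnprocessedTokens
        // maintains internally; add() returns false on a repeated pair.
        private val processed = scala.collection.mutable.HashSet.empty[(Syn, Seq[Tok])]

        private def isUnprocessed(syn: Syn, toks: Seq[Tok]): Boolean =
            processed.add((syn, toks))

        // Simplified match test standing in for the real chunk-by-chunk check.
        private def isMatch(syn: Syn, toks: Seq[Tok]): Boolean =
            syn.length == toks.length && syn.zip(toks).forall { case (s, t) => s == t }

        // New style: dedup check and match test live together; the caller
        // only supplies a callback that fires on a hit.
        def onMatch(syn: Syn, toks: Seq[Tok], callback: Unit => Unit): Unit =
            if (isUnprocessed(syn, toks) && isMatch(syn, toks))
                callback(())

        var found = false
        onMatch(Seq("big", "city"), Seq("big", "city"), _ => { found = true; println("matched") })
        // A repeated call with the same pair is skipped by the cache, which is
        // what lets the enricher drop its explicit isUnprocessed* guards.
        onMatch(Seq("big", "city"), Seq("big", "city"), _ => println("never printed"))
        println(s"found=$found")
    }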
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/synonyms/NCSynonymsManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/synonyms/NCSynonymsManager.scala
index 7996510..315f380 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/synonyms/NCSynonymsManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/synonyms/NCSynonymsManager.scala
@@ -18,7 +18,7 @@
package org.apache.nlpcraft.probe.mgrs.synonyms
import io.opencensus.trace.Span
-import org.apache.nlpcraft.common.nlp.{NCNlpSentenceNote, NCNlpSentenceToken}
+import org.apache.nlpcraft.common.nlp.{NCNlpSentenceNote => NlpNote, NCNlpSentenceToken => NlpToken}
import org.apache.nlpcraft.common.{NCService, U}
import org.apache.nlpcraft.model._
import org.apache.nlpcraft.model.intent.{NCIdlContext, NCIdlFunction}
@@ -35,7 +35,7 @@ import scala.jdk.CollectionConverters.ListHasAsScala
*
*/
object NCSynonymsManager extends NCService {
- class CacheHolder[T] {
+ private class CacheHolder[T] {
private lazy val cache =
            mutable.HashMap.empty[String, mutable.HashMap[Int, mutable.HashMap[Seq[T], mutable.HashSet[Synonym]]]]
@@ -56,9 +56,9 @@ object NCSynonymsManager extends NCService {
}
}
-    case class SavedIdlKey(id: String, startCharIndex: Int, endCharIndex: Int, other: Map[String, AnyRef] = Map.empty)
+    private case class SavedIdlKey(id: String, startCharIndex: Int, endCharIndex: Int, other: Map[String, AnyRef] = Map.empty)
- object SavedIdlKey {
+ private object SavedIdlKey {
def apply(t: NCToken): SavedIdlKey =
if (t.isUserDefined)
SavedIdlKey(t.getId, t.getStartCharIndex, t.getEndCharIndex)
@@ -67,14 +67,14 @@ object NCSynonymsManager extends NCService {
t.getId,
t.getStartCharIndex,
t.getEndCharIndex,
-                NCNlpSentenceNote.getBuiltProperties(t.getId).flatMap(p => t.metaOpt(p).asScala match {
+                NlpNote.getBuiltProperties(t.getId).flatMap(p => t.metaOpt(p).asScala match {
case Some(v) => Some(p -> v)
case None => None
}).toMap
)
}
-    case class Value(request: NCRequest, variants: Seq[Seq[NCToken]], predicate: NCIdlFunction) {
+    private case class Value(request: NCRequest, variants: Seq[Seq[NCToken]], predicate: NCIdlFunction) {
override def toString: String = variants.toString()
}
@@ -100,7 +100,7 @@ object NCSynonymsManager extends NCService {
* @param tok
* @param chunk
*/
-    private def isMatch(tok: NCNlpSentenceToken, chunk: NCProbeSynonymChunk): Boolean =
+    private def isMatch(tok: NlpToken, chunk: NCProbeSynonymChunk): Boolean =
chunk.kind match {
case TEXT => chunk.wordStem == tok.stem
case REGEX =>
@@ -210,7 +210,7 @@ object NCSynonymsManager extends NCService {
getOrElseUpdate(
(tow, chunk),
{
-                def get0[T](fromToken: NCToken => T, fromWord: NCNlpSentenceToken => T): T =
+                def get0[T](fromToken: NCToken => T, fromWord: NlpToken => T): T =
if (tow.isLeft) fromToken(tow.swap.toOption.get)
else fromWord(tow.toOption.get)
@@ -242,74 +242,118 @@ object NCSynonymsManager extends NCService {
/**
*
- * @param s
+ * @param srvReqId
+ * @param elemId
+ * @param syn
* @param toks
+ * @param callback
*/
-    def isMatch(s: Synonym, toks: Seq[NCNlpSentenceToken]): Boolean = {
-        require(toks != null)
-        require(!s.sparse && !s.hasIdl)
-
-        if (toks.length == s.length) {
-            if (s.isTextOnly)
-                toks.zip(s).forall(p => p._1.stem == p._2.wordStem)
-            else
-                toks.zip(s).sortBy(p => getSort(p._2.kind)).forall { case (tok, chunk) => isMatch(tok, chunk) }
+    def onMatch(srvReqId: String, elemId: String, syn: Synonym, toks: Seq[NlpToken], callback: Unit => Unit): Unit =
+        if (isUnprocessedTokens(srvReqId, elemId, syn, toks.map(_.index))) {
+            require(toks != null)
+            require(!syn.sparse && !syn.hasIdl)
+
+            if (
+                toks.length == syn.length && {
+                    if (syn.isTextOnly)
+                        toks.zip(syn).forall(p => p._1.stem == p._2.wordStem)
+                    else
+                        toks.zip(syn).sortBy(p => getSort(p._2.kind)).forall { case (tok, chunk) => isMatch(tok, chunk) }
+                }
+            )
+                callback()
        }
-        else
-            false
-    }
/**
*
+ * @param srvReqId
+ * @param elemId
* @param s
* @param tows
* @param req
* @param variantsToks
+ * @param callback
*/
-    def isMatch(s: Synonym, tows: Seq[NCIdlToken], req: NCRequest, variantsToks: Seq[Seq[NCToken]]): Boolean = {
-        require(tows != null)
-
-        if (tows.length == s.length && tows.count(_.isLeft) >= s.idlChunks)
-            tows.zip(s).sortBy(p => getSort(p._2.kind)).forall {
-                case (tow, chunk) => isMatch(tow, chunk, req, variantsToks)
-            }
-        else
-            false
-    }
+    def onMatch(
+        srvReqId: String,
+        elemId: String,
+        s: Synonym,
+        tows: Seq[NCIdlToken],
+        req: NCRequest,
+        variantsToks: Seq[Seq[NCToken]],
+        callback: Unit => Unit
+    ): Unit =
+        if (isUnprocessedIdl(srvReqId, elemId, s, tows)) {
+            require(tows != null)
+
+            if (
+                tows.length == s.length &&
+                tows.count(_.isLeft) >= s.idlChunks && {
+                    tows.zip(s).sortBy(p => getSort(p._2.kind)).forall {
+                        case (tow, chunk) => isMatch(tow, chunk, req, variantsToks)
+                    }
+                }
+            )
+                callback()
+        }
/**
*
- * @param s
+ * @param srvReqId
+ * @param elemId
+ * @param syn
* @param toks
+ * @param callback
*/
-    def sparseMatch(s: Synonym, toks: Seq[NCNlpSentenceToken]): Option[Seq[NCNlpSentenceToken]] = {
-        require(toks != null)
-        require(s.sparse && !s.hasIdl)
-
-        sparseMatch0(s, toks, isMatch, (t: NCNlpSentenceToken) => t.startCharIndex, shouldBeNeighbors = false)
-    }
+    def onSparseMatchTokens(
+        srvReqId: String, elemId: String, syn: Synonym, toks: Seq[NlpToken], callback: Seq[NlpToken] => Unit
+    ): Unit =
+        if (isUnprocessedTokens(srvReqId, elemId, syn, toks.map(_.index))) {
+            require(toks != null)
+            require(syn.sparse && !syn.hasIdl)
+
+            sparseMatch0(syn, toks, isMatch, (t: NlpToken) => t.startCharIndex, shouldBeNeighbors = false) match {
+                case Some(res) => callback(res)
+                case None => // No-op.
+            }
+        }
/**
*
- * @param s
+ * @param srvReqId
+ * @param elemId
+ * @param syn
* @param tows
* @param req
* @param variantsToks
+ * @param callback
*/
-    def sparseMatch(s: Synonym, tows: Seq[NCIdlToken], req: NCRequest, variantsToks: Seq[Seq[NCToken]]): Option[Seq[NCIdlToken]] = {
-        require(tows != null)
-        require(req != null)
-        require(s.hasIdl)
-
-        sparseMatch0(
-            s,
-            tows,
-            (t: NCIdlToken, chunk: NCProbeSynonymChunk) => isMatch(t, chunk, req, variantsToks),
-            (t: NCIdlToken) => if (t.isLeft) t.swap.toOption.get.getStartCharIndex
-                else t.toOption.get.startCharIndex,
-            shouldBeNeighbors = !s.sparse
-        )
-    }
+    def onSparseMatch(
+        srvReqId: String,
+        elemId: String,
+        syn: Synonym,
+        tows: Seq[NCIdlToken],
+        req: NCRequest,
+        variantsToks: Seq[Seq[NCToken]],
+        callback: Seq[NCIdlToken] => Unit
+    ): Unit =
+        if (isUnprocessedIdl(srvReqId, elemId, syn, tows)) {
+            require(tows != null)
+            require(req != null)
+            require(syn.hasIdl)
+
+            sparseMatch0(
+                syn,
+                tows,
+                (t: NCIdlToken, chunk: NCProbeSynonymChunk) => isMatch(t, chunk, req, variantsToks),
+                (t: NCIdlToken) => if (t.isLeft) t.swap.toOption.get.getStartCharIndex
+                    else t.toOption.get.startCharIndex,
+                shouldBeNeighbors = !syn.sparse
+            ) match {
+                case Some(res) => callback(res)
+                case None => // No-op.
+            }
+        }
/**
*
@@ -361,7 +405,7 @@ object NCSynonymsManager extends NCService {
* @param s
* @param tokens
*/
-    def isUnprocessedTokens(srvReqId: String, elemId: String, s: Synonym, tokens: Seq[Int]): Boolean =
+    private def isUnprocessedTokens(srvReqId: String, elemId: String, s: Synonym, tokens: Seq[Int]): Boolean =
        tokCaches.getOrElseUpdate(srvReqId, new CacheHolder[Int]).isUnprocessed(elemId, s, tokens)
/**
@@ -371,20 +415,24 @@ object NCSynonymsManager extends NCService {
* @param s
* @param tokens
*/
-    def isUnprocessedIdl(srvReqId: String, elemId: String, s: Synonym, tokens: Seq[NCIdlToken]): Boolean =
+    private def isUnprocessedIdl(srvReqId: String, elemId: String, s: Synonym, tokens: Seq[NCIdlToken]): Boolean =
        idlCaches.getOrElseUpdate(srvReqId, new CacheHolder[NCIdlToken]).isUnprocessed(elemId, s, tokens)
/**
*
* @param srvReqId
*/
- def clearRequestData(srvReqId: String): Unit = savedIdl -= srvReqId
+ def clearRequestData(srvReqId: String): Unit = {
+ clearIteration(srvReqId)
+
+ savedIdl -= srvReqId
+ }
/**
*
* @param srvReqId
*/
- def clearRequestIterationData(srvReqId: String): Unit = {
+ def clearIteration(srvReqId: String): Unit = {
idlChunksCache -= srvReqId
idlCaches -= srvReqId
tokCaches -= srvReqId
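
The last hunks also tighten the cleanup contract: clearRequestData now
delegates to the renamed clearIteration before dropping the request's saved
IDL state, so iteration-scoped caches can never outlive their request. A
minimal sketch of that two-level lifecycle, with hypothetical map names
standing in for idlChunksCache/idlCaches/tokCaches and savedIdl:

    object RequestLifecycleSketch extends App {
        import scala.collection.mutable

        // Iteration-scoped state, dropped after every enrichment pass.
        private val iterationCaches = mutable.HashMap.empty[String, mutable.HashSet[String]]
        // Request-scoped state, kept until the request completes.
        private val savedData = mutable.HashMap.empty[String, String]

        // Mirrors clearIteration(srvReqId): only iteration state goes.
        def clearIteration(srvReqId: String): Unit = iterationCaches -= srvReqId

        // Mirrors the updated clearRequestData(srvReqId): clear iteration
        // state first, then drop the request-scoped data itself.
        def clearRequestData(srvReqId: String): Unit = {
            clearIteration(srvReqId)
            savedData -= srvReqId
        }

        val req = "req-1"
        iterationCaches.getOrElseUpdate(req, mutable.HashSet("syn-1"))
        savedData(req) = "saved IDL predicates"

        clearIteration(req) // between iterations: request data survives
        println(s"${iterationCaches.get(req)} / ${savedData.get(req)}") // None / Some(...)

        clearRequestData(req) // request done: everything for req-1 is gone
        println(savedData.get(req)) // None
    }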