[incubator-nlpcraft] 01/01: WIP.

sergeykamov Tue, 21 Sep 2021 02:23:09 -0700

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-443-1
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git


commit 312fabf6420cfffcb5e89f1a432d45f198719a20
Author: Sergey Kamov <[email protected]>
AuthorDate: Tue Sep 21 12:22:57 2021 +0300

    WIP.
---
 .../cargps/src/main/resources/cargps_model.yaml    |   2 +-
 .../nlpcraft/probe/mgrs/NCProbeSynonym.scala       |  51 +++++++---
 .../nlpcraft/probe/mgrs/NCProbeVariants.scala      |  30 ++++++
 .../probe/mgrs/nlp/NCProbeEnrichmentManager.scala  |   4 +-
 .../mgrs/nlp/enrichers/model/NCModelEnricher.scala |  20 +++-
 .../mgrs/nlp/enrichers/model/NCSentenceCache.scala | 110 +++++++++++++++++++++
 .../probe/mgrs/sentence/NCSentenceManager.scala    |  40 +++++++-
 7 files changed, 238 insertions(+), 19 deletions(-)

diff --git a/nlpcraft-examples/cargps/src/main/resources/cargps_model.yaml b/nlpcraft-examples/cargps/src/main/resources/cargps_model.yaml
index cd5fb4e..62f45c8 100644
--- a/nlpcraft-examples/cargps/src/main/resources/cargps_model.yaml
+++ b/nlpcraft-examples/cargps/src/main/resources/cargps_model.yaml
@@ -60,7 +60,7 @@ elements:
   - id: "x:addr:st"
     greedy: false
     synonyms:
-      - "{//[a-zA-Z0-9]+//}[1,3]"
+      - "{^^{is_alphanum(tok_txt) && tok_is_between_ids('x:addr:num', 'x:addr:kind') == true}^^}[1,3]"
 
   - id: "x:addr"
     synonyms:
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
index 809c4e5..e324857 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
@@ -20,9 +20,10 @@ package org.apache.nlpcraft.probe.mgrs
 import org.apache.nlpcraft.common.U
 import org.apache.nlpcraft.common.nlp.NCNlpSentenceToken
 import org.apache.nlpcraft.model._
-import org.apache.nlpcraft.model.intent.NCIdlContext
-import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCIdlContent
+import org.apache.nlpcraft.model.intent.{NCIdlContext, NCIdlFunction}
+import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.{NCIdlContent, saveIdl}
 import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind._
+import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager
 
 import scala.collection.mutable
 
@@ -146,8 +147,11 @@ class NCProbeSynonym(
       * @param tow
       * @param chunk
       * @param req
+      * @param variantsToks
       */
-    private def isMatch(tow: NCIdlContent, chunk: NCProbeSynonymChunk, req: 
NCRequest): Boolean = {
+    private def isMatch(
+        tow: NCIdlContent, chunk: NCProbeSynonymChunk, req: NCRequest, 
variantsToks: Seq[Seq[NCToken]]
+    ): Boolean = {
         def get0[T](fromToken: NCToken => T, fromWord: NCNlpSentenceToken => 
T): T =
             if (tow.isLeft) fromToken(tow.swap.toOption.get) else 
fromWord(tow.toOption.get)
 
@@ -160,7 +164,20 @@ class NCProbeSynonym(
                 r.matcher(get0(_.origText, _.origText)).matches() || 
r.matcher(get0(_.normText, _.normText)).matches()
 
             case IDL =>
-                get0(t => chunk.idlPred.apply(t, NCIdlContext(req = 
req)).value.asInstanceOf[Boolean], _ => false)
+                val ok =
+                    variantsToks.exists(variantToks =>
+                        get0(t =>
+                            chunk.idlPred.apply(
+                                t,
+                                NCIdlContext(req = req, toks = variantToks)
+                            ).value.asInstanceOf[Boolean], _ => false
+                        )
+                    )
+
+                if (ok)
+                    saveIdl(req, tow.swap.toOption.get, chunk.idlPred)
+
+                ok
 
             case _ => throw new AssertionError()
         }
@@ -188,17 +205,20 @@ class NCProbeSynonym(
       *
       * @param tows
       * @param req
+      * @param variantsToks
       * @return
       */
-    def isMatch(tows: Seq[NCIdlContent], req: NCRequest): Boolean = {
+    def isMatch(tows: Seq[NCIdlContent], req: NCRequest, variantsToks: 
Seq[Seq[NCToken]]): Boolean= {
         require(tows != null)
 
         if (tows.length == length && tows.count(_.isLeft) >= idlChunks)
-            tows.zip(this).sortBy(p => getSort(p._2.kind)).forall { case (tow, 
chunk) => isMatch(tow, chunk, req) }
+            tows.zip(this).sortBy(p => getSort(p._2.kind)).forall {
+                case (tow, chunk) => isMatch(tow, chunk, req, variantsToks)
+            }
         else
             false
     }
-    
+
     /**
       *
       * @param toks
@@ -214,15 +234,16 @@ class NCProbeSynonym(
       *
       * @param tows
       * @param req
+      * @param variantsToks
       */
-    def sparseMatch(tows: Seq[NCIdlContent], req: NCRequest): 
Option[Seq[NCIdlContent]] = {
+    def sparseMatch(tows: Seq[NCIdlContent], req: NCRequest, variantsToks: 
Seq[Seq[NCToken]]): Option[Seq[NCIdlContent]] = {
         require(tows != null)
         require(req != null)
         require(hasIdl)
 
         sparseMatch0(
             tows,
-            (t: NCIdlContent, chunk: NCProbeSynonymChunk) => isMatch(t, chunk, 
req),
+            (t: NCIdlContent, chunk: NCProbeSynonymChunk) => isMatch(t, chunk, 
req, variantsToks),
             (t: NCIdlContent) => if (t.isLeft) 
t.swap.toOption.get.getStartCharIndex else t.toOption.get.startCharIndex,
             shouldBeNeighbors = !sparse
         )
@@ -340,9 +361,17 @@ object NCProbeSynonym {
         permute: Boolean
     ): NCProbeSynonym = {
         val syn = new NCProbeSynonym(isElementId, isValueName, isDirect, 
value, sparse, permute)
-        
+
         syn ++= chunks
-        
+
         syn
     }
+
+    /**
+      *
+      * @param req
+      * @param tok
+      * @param idlPred
+      */
+    def saveIdl(req: NCRequest, tok: NCToken, idlPred: NCIdlFunction): Unit = 
NCSentenceManager.saveIdl(req, tok, idlPred)
 }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
index bcf2c9c..39f6969 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
@@ -22,6 +22,8 @@ import org.apache.nlpcraft.common.nlp.{NCNlpSentence => NlpSentence, NCNlpSenten
 import org.apache.nlpcraft.common.{NCE, TOK_META_ALIASES_KEY}
 import org.apache.nlpcraft.model.NCVariant
 import org.apache.nlpcraft.model.impl.{NCTokenImpl, NCTokenLogger, 
NCVariantImpl}
+import org.apache.nlpcraft.model.intent.NCIdlContext
+import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager
 
 import java.io.{Serializable => JSerializable}
 import java.util
@@ -267,6 +269,34 @@ object NCProbeVariants {
                 for ((tok, tokNlp) <- toks.zip(nlpSen) if tokNlp.isUser)
                     process(tok, tokNlp)
 
+                if (ok) {
+                    NCSentenceManager.getIdlData(srvReqId) match {
+                        case Some((req, toksData)) =>
+                            ok =
+                                toks.forall(t =>
+                                    toksData.get((t, t.getId)) match {
+                                        case Some(f) =>
+                                            val x =
+                                            f.apply(
+                                                t,
+                                                NCIdlContext(req = req, toks = 
toks.toSeq)
+                                            ).value.asInstanceOf[Boolean]
+
+
+                                            if (!x)
+                                                println("x="+x + ", t=" + t  + 
", toks=" + toks)
+                                            x
+
+
+                                        case None => true
+                                    }
+                                )
+
+                        case None =>  // No-op.
+
+                    }
+                }
+
                 if (ok) Some(new NCVariantImpl(toks.asJava)) else None
             })
 
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
index 4b6c697..9af0c61 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
@@ -526,8 +526,6 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
             )
         })
 
-        NCSentenceManager.clearCache(srvReqId)
-
         // Final validation before execution.
         try
             sensSeq.foreach(NCValidateManager.postValidate(mdl, _, span))
@@ -556,6 +554,8 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
 
         var senVars = NCProbeVariants.convert(srvReqId, mdl, sensSeq, 
lastPhase = true)
 
+        NCSentenceManager.clearCache(srvReqId)
+
         // Sentence variants can be filtered by model.
         val fltSenVars: Seq[(NCVariant, Int)] =
             senVars.
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 9706c4c..7a11806 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -28,7 +28,7 @@ import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.NCSynonymChunkKind
 import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
 import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl
 import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager
-import org.apache.nlpcraft.probe.mgrs.{NCProbeModel,  NCProbeVariants, 
NCTokenPartKey, NCProbeSynonym => Synonym}
+import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeVariants, 
NCTokenPartKey, NCProbeSynonym => Synonym}
 
 import java.io.Serializable
 import java.util.{List => JList}
@@ -526,8 +526,21 @@ object NCModelEnricher extends NCProbeEnricher {
             "enrich", parent, "srvReqId" -> ns.srvReqId, "mdlId" -> 
mdl.model.getId, "txt" -> ns.text
         ) { span =>
             val req = NCRequestImpl(senMeta, ns.srvReqId)
+
             val combToks = combosTokens(ns.toSeq)
             lazy val ch = mkComplexes(mdl, ns)
+            lazy val variantsToks =
+                ch.complexes.map(p => p.tokensComplexes.map(p =>
+                    if (p.isToken)
+                        p.token
+                    else {
+                        // TODO: everywhere
+                        val clone = p.word.clone()
+
+                        clone.filter(!_.isNlp).foreach(clone.remove)
+
+                        NCTokenImpl(mdl, ns.srvReqId, clone)
+                    }))
 
             def execute(simpleEnabled: Boolean, idlEnabled: Boolean): Unit =
                 startScopedSpan(
@@ -603,6 +616,7 @@ object NCModelEnricher extends NCProbeEnricher {
                             val allSyns = get(mdl.idlSynonyms, eId)
                             lazy val allCombs = mkCombinations(ch, toks, 
idlCache)
 
+
                             // 2.1 Continuous.
                             if (!mdl.hasSparseSynonyms) {
                                 var found = false
@@ -613,7 +627,7 @@ object NCModelEnricher extends NCProbeEnricher {
                                     if !found;
                                     data = comb.map(_.data)
                                 )
-                                    if (s.isMatch(data, req)) {
+                                    if (s.isMatch(data, req, variantsToks)) {
                                         val parts = toParts(mdl, ns.srvReqId, 
data, s)
 
                                         add("IDL continuous", ns, contCache, 
eId, greedy, toksExt, idxs, s, parts)
@@ -629,7 +643,7 @@ object NCModelEnricher extends NCProbeEnricher {
                                     s <- allSyns;
                                     comb <- allCombs
                                 )
-                                    s.sparseMatch(comb.map(_.data), req) match 
{
+                                    s.sparseMatch(comb.map(_.data), req, 
variantsToks) match {
                                         case Some(res) =>
                                             val typ = if (s.sparse) "IDL 
sparse" else "IDL continuous"
 
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCSentenceCache.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCSentenceCache.scala
new file mode 100644
index 0000000..e5b6e3e
--- /dev/null
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCSentenceCache.scala
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model
+
+import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken => NlpToken}
+import org.apache.nlpcraft.probe.mgrs.{NCProbeSynonym => Synonym}
+import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.{NCIdlContent => IdlToken}
+import org.apache.nlpcraft.model.NCRequest
+import scala.collection.mutable
+
+class NCSentenceCache {
+//    case class Key(elemId: String, indexes: Seq[Int])
+//    case class Value[T](synonym: Synonym, result: Seq[T])
+//
+//    val cacheToks = mutable.HashMap.empty[Key, mutable.HashMap[Seq[Int], 
Value[NlpToken]]]
+//    val cacheIdl = mutable.HashMap.empty[Key, mutable.HashMap[Seq[Int], 
Value[IdlToken]]]
+//
+//    var cacheHits = 0
+//    var cacheCnt = 0
+//    var time = 0L
+//
+//    private def process[T](
+//        elemId: String,
+//        elemSyns: Seq[Synonym],
+//        toks: Seq[T],
+//        extract: (Synonym, Seq[T]) => Option[Seq[T]],
+//        cache: mutable.Map[Key, mutable.HashMap[Seq[Int], Value[T]]],
+//        getIndex: T => Int,
+//        callback: (Synonym, Seq[T]) => Unit
+//    ): Unit = {
+//        val t = System.currentTimeMillis()
+//
+//        val hash = toks.map(getIndex)
+//        val key = Key(elemId, hash)
+//
+//        cacheCnt += 1
+//
+//        cache.get(key) match {
+//            case Some(data) =>
+//                cacheHits += 1
+//                data.get(hash) match {
+//                    case Some(v) => callback(v.synonym, v.result)
+//                    case None => // No-op.
+//                }
+//            case None =>
+//                // mutable.HashMap.empty[Key[IdlToken], Map[Seq[IdlToken], 
Value[IdlToken]]]
+//                val hit = mutable.HashMap.empty[Seq[Int], Value[T]]
+//
+//                for (s <- elemSyns)
+//                    extract(s, toks) match {
+//                        case Some(res) =>
+//                            callback(s, res)
+//                            hit += hash -> Value(s, res)
+//                        case None => // No-op.
+//                    }
+//
+//                cache += key -> hit
+//        }
+//
+//        time += (System.currentTimeMillis() - t)
+//    }
+//
+//    def processSparseTokens(
+//        elemId: String,
+//        elemSyns: Seq[Synonym],
+//        toks: Seq[NlpToken],
+//        callback: (Synonym, Seq[NlpToken]) => Unit
+//    ): Unit =
+//        process(
+//            elemId,
+//            elemSyns,
+//            toks,
+//            (s: Synonym, toks: Seq[NlpToken]) => s.sparseMatch(toks),
+//            cacheToks,
+//            (t: NlpToken) => t.index,
+//            callback
+//        )
+//
+//    def processSparseIdl(
+//        elemId: String,
+//        req: NCRequest,
+//        elemSyns: Seq[Synonym],
+//        toks: Seq[IdlToken],
+//        callback: (Synonym, Seq[IdlToken]) => Unit
+//    ): Unit =
+//        process(
+//            elemId,
+//            elemSyns,
+//            toks,
+//            (s: Synonym, toks: Seq[IdlToken]) => s.sparseMatch(toks, req),
+//            cacheIdl,
+//            (t: IdlToken) => if (t.isRight) t.toOption.get.index else 
t.swap.toOption.get.getIndex,
+//            callback
+//        )
+}
\ No newline at end of file
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index ee8b719..b0a077a 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -22,7 +22,8 @@ import org.apache.nlpcraft.common.nlp.NCNlpSentence.NoteLink
 import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
 import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, 
NCNlpSentenceToken}
 import org.apache.nlpcraft.common.{NCE, NCService, U, _}
-import org.apache.nlpcraft.model.NCModel
+import org.apache.nlpcraft.model.intent.NCIdlFunction
+import org.apache.nlpcraft.model.{NCModel, NCRequest, NCToken}
 import org.apache.nlpcraft.probe.mgrs.NCTokenPartKey
 
 import java.io.{Serializable => JSerializable}
@@ -43,6 +44,9 @@ object NCSentenceManager extends NCService {
     type CacheValue = Seq[Seq[NCNlpSentenceNote]]
     private val combCache = mutable.HashMap.empty[String, 
mutable.HashMap[CacheKey, CacheValue]]
 
+    type IdlCacheKey = (NCToken, String)
+    private val reqCache = mutable.HashMap.empty[String, NCRequest]
+    private val idlCache = mutable.HashMap.empty[String, 
mutable.HashMap[IdlCacheKey, NCIdlFunction]]
 
     /**
       *
@@ -818,5 +822,37 @@ object NCSentenceManager extends NCService {
       *
       * @param srvReqId
       */
-    def clearCache(srvReqId: String): Unit = combCache -= srvReqId
+    def clearCache(srvReqId: String): Unit = {
+        combCache -= srvReqId
+        reqCache -= srvReqId
+        idlCache -= srvReqId
+    }
+
+    def saveIdl(req: NCRequest, tok: NCToken, idlPred: NCIdlFunction): Unit = {
+        val srvReqId = req.getServerRequestId
+
+        reqCache += srvReqId -> req
+
+        val idlCacheReq: mutable.Map[IdlCacheKey, NCIdlFunction] =
+            idlCache.get(srvReqId) match {
+                case Some(m) => m
+                case None =>
+                    val m  = mutable.HashMap.empty[IdlCacheKey, NCIdlFunction]
+
+                    idlCache += srvReqId -> m
+
+                    m
+            }
+
+        idlCacheReq += (tok, tok.getId) -> idlPred
+    }
+
+    def getIdlData(srvReqId: String) : Option[(NCRequest, Map[IdlCacheKey, 
NCIdlFunction])] = {
+        val reqData = reqCache.get(srvReqId)
+        val idlData = idlCache.get(srvReqId)
+
+        require(reqData.isDefined && idlData.isDefined || reqData.isEmpty && 
idlData.isEmpty)
+
+        if (reqData.isDefined) Some((reqData.get, idlData.get.toMap)) else None
+    }
 }
\ No newline at end of file

[incubator-nlpcraft] 01/01: WIP.

Reply via email to