[incubator-nlpcraft] 02/08: WIP.

sergeykamov Wed, 14 Apr 2021 00:01:44 -0700

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-287
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git


commit 8c2f76d06cd2fad51507eb51d92d5aed20ea96d8
Author: Sergey Kamov <[email protected]>
AuthorDate: Mon Apr 12 18:52:15 2021 +0300

    WIP.
---
 .../scala/org/apache/nlpcraft/common/package.scala |   2 +-
 .../org/apache/nlpcraft/common/util/NCUtils.scala  |  42 +++-
 .../apache/nlpcraft/probe/mgrs/NCProbeModel.scala  |  12 +-
 .../nlpcraft/probe/mgrs/NCProbeSynonym.scala       |  41 +++-
 .../probe/mgrs/deploy/NCDeployManager.scala        |  30 +--
 .../nlpcraft/probe/mgrs/model/NCModelManager.scala |  11 +-
 .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 240 ++++++++-------------
 7 files changed, 184 insertions(+), 194 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/package.scala 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/package.scala
index c4d8bad..74a0e3e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/package.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/package.scala
@@ -36,7 +36,7 @@ package object common {
     final val U = NCUtils
     
     // Internal deep debug flag (more verbose tracing).
-    final val DEEP_DEBUG = false
+    final val DEEP_DEBUG = true
     
     // Model and token **internal** metadata keys.
     final val TOK_META_ALIASES_KEY = "__NLPCRAFT_TOK_META_ALIASES"
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/util/NCUtils.scala 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/util/NCUtils.scala
index 141e813..13a1c89 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/util/NCUtils.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/util/NCUtils.scala
@@ -1424,12 +1424,14 @@ object NCUtils extends LazyLogging {
      * @param e
      */
     def prettyError(logger: Logger, title: String, e: Throwable): Unit = {
-        // Keep the full trace in the 'trace' log level.
-        logger.trace(title, e)
+        e.printStackTrace() // Prints the trace to standard error, bypassing `logger`.
 
-        prettyErrorImpl(new PrettyErrorLogger {
-            override def log(s: String): Unit = logger.error(s)
-        }, title, e)
+        // NOTE(review): WIP - logger-based reporting below is disabled; `logger` and `title` are unused.
+//        logger.trace(title, e)
+//
+//        prettyErrorImpl(new PrettyErrorLogger {
+//            override def log(s: String): Unit = logger.error(s)
+//        }, title, e)
     }
 
     /**
@@ -2122,4 +2124,34 @@ object NCUtils extends LazyLogging {
             case Nil ⇒ List(Nil)
             case head :: tail ⇒ for (h ← head; t ← permute(tail)) yield h :: t
         }
+
+    /**
+      * Tells whether every index is exactly one greater than its predecessor.
+      * @param idxs Indexes to check, in the order given; must not be empty.
+      * @return `true` for a single index or for a run of consecutive indexes.
+      */
+    def isContinuous(idxs: Seq[Int]): Boolean = {
+        require(idxs.nonEmpty)
+
+        idxs.size match {
+            case 0 ⇒ throw new AssertionError() // Unreachable: `require` above rejects empty input.
+            case 1 ⇒ true
+            case _ ⇒  idxs.zip(idxs.tail).forall { case (x, y) ⇒ x + 1 == y }
+        }
+    }
+
+    /**
+      * Tells whether the indexes never decrease (equal neighbours are accepted).
+      * @param idxs Indexes to check, in the order given; must not be empty.
+      * @return `true` for a single index or when no index is less than its predecessor.
+      */
+    def isIncreased(idxs: Seq[Int]): Boolean = {
+        require(idxs.nonEmpty)
+
+        idxs.size match {
+            case 0 ⇒ throw new AssertionError() // Unreachable: `require` above rejects empty input.
+            case 1 ⇒ true
+            case _ ⇒  !idxs.zip(idxs.tail).exists { case (x, y) ⇒ x > y }
+        }
+    }
 }
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
index 31fa627..1618421 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
@@ -28,9 +28,9 @@ import scala.collection.{Map, Seq}
   * @param model
   * @param solver
   * @param intents
-  * @param directSynonyms
+  * @param continuousSynonyms
   * @param sparseSynonyms
-  * @param synonymsDsl
+  * @param dslSynonyms
   * @param exclStopWordsStems
   * @param suspWordsStems
   * @param elements
@@ -39,15 +39,15 @@ case class NCProbeModel(
     model: NCModel,
     solver: NCIntentSolver,
     intents: Seq[NCIdlIntent],
-    directSynonyms: Map[String /*Element ID*/ , Map[Int /*Synonym length*/ , 
NCProbeSynonymsWrapper]], // Fast access map.
+    continuousSynonyms: Map[String /*Element ID*/ , Map[Int /*Synonym length*/ 
, NCProbeSynonymsWrapper]], // Fast access map.
     sparseSynonyms: Map[String /*Element ID*/, Seq[NCProbeSynonym]],
-    synonymsDsl: Map[String /*Element ID*/ , Seq[NCProbeSynonym]], // Fast 
access map.
+    dslSynonyms: Map[String /*Element ID*/ , Seq[NCProbeSynonym]], // Fast 
access map.
     addStopWordsStems: Set[String],
     exclStopWordsStems: Set[String],
     suspWordsStems: Set[String],
     elements: Map[String /*Element ID*/ , NCElement],
     samples: Set[(String, Seq[Seq[String]])]
 ) {
-    def hasDslSynonyms(elemId: String): Boolean = synonymsDsl.contains(elemId)
-    def hasDslSynonyms: Boolean = synonymsDsl.nonEmpty
+    def hasDslSynonyms(elemId: String): Boolean = dslSynonyms.contains(elemId)
+    def hasDslSynonyms: Boolean = dslSynonyms.nonEmpty
 }
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
index b246cac..bc41b96 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
@@ -33,18 +33,20 @@ import scala.collection.mutable
   *     In this case chunks contain value name.
   * @param isDirect Direct or permuted synonym flag.
   * @param value Optional value name if this is a value synonym.
-  * @param perm Flag.
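+  * @param sparse Sparse synonym flag: `trySparseMatch` requires it set, `isMatch` requires it unset.
+  * @param permute Permutation flag: when unset, sparse matching flags out-of-order tokens as a mismatch (TODO confirm).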
   */
 class NCProbeSynonym(
     val isElementId: Boolean,
     val isValueName: Boolean,
     val isDirect: Boolean,
     val value: String = null,
-    val perm: Boolean
+    val sparse: Boolean,
+    val permute: Boolean
 ) extends mutable.ArrayBuffer[NCProbeSynonymChunk] with 
Ordered[NCProbeSynonym] {
     require((isElementId && !isValueName && value == null) || !isElementId)
     require((isValueName && value != null) || !isValueName)
-    
+
     lazy val isTextOnly: Boolean = forall(_.kind == TEXT)
     lazy val regexChunks: Int = count(_.kind == REGEX)
     lazy val dslChunks: Int = count(_.kind == IDL)
@@ -109,7 +111,7 @@ class NCProbeSynonym(
                 if (seq.nonEmpty) {
                     val head = seq.head
 
-                    if (!perm && res.nonEmpty && getIndex(head) <= 
getIndex(res.last))
+                    if (!permute && res.nonEmpty && getIndex(head) <= 
getIndex(res.last))
                         state = -1
                     else {
                         all ++= seq
@@ -164,6 +166,7 @@ class NCProbeSynonym(
       */
     def isMatch(toks: NCNlpSentenceTokenBuffer): Boolean = {
         require(toks != null)
+        require(!sparse)
 
         if (toks.length == length) {
             if (isTextOnly)
@@ -180,12 +183,16 @@ class NCProbeSynonym(
       * @param toks
       * @return
       */
-    def trySparseMatch(toks: NCNlpSentenceTokenBuffer): 
Option[Seq[NCNlpSentenceToken]] =
+    def trySparseMatch(toks: NCNlpSentenceTokenBuffer): 
Option[Seq[NCNlpSentenceToken]] = {
+        require(toks != null)
+        require(sparse, s"Unexpected call on: $this")
+
         trySparseMatch0(
             toks,
             isMatch,
             (t: NCNlpSentenceToken) ⇒ t.startCharIndex
         )
+    }
 
     /**
       *
@@ -195,6 +202,7 @@ class NCProbeSynonym(
       */
     def isMatch(tows: Seq[NCDslContent], req: NCRequest): Boolean = {
         require(tows != null)
+        require(!sparse)
 
         if (tows.length == length && tows.count(_.isLeft) >= dslChunks)
             tows.zip(this).sortBy(p ⇒ getSort(p._2.kind)).forall { case (tow, 
chunk) ⇒ isMatch(tow, chunk, req) }
@@ -207,12 +215,17 @@ class NCProbeSynonym(
       * @param tows
       * @param req
       */
-    def trySparseMatch(tows: Seq[NCDslContent], req: NCRequest): 
Option[Seq[NCDslContent]] =
+    def trySparseMatch(tows: Seq[NCDslContent], req: NCRequest): 
Option[Seq[NCDslContent]] = {
+        require(tows != null)
+        require(req != null)
+        require(sparse, s"Unexpected call on: $this")
+
         trySparseMatch0(
             tows,
             (t: NCDslContent, chunk: NCProbeSynonymChunk) ⇒ isMatch(t, chunk, 
req),
             (t: NCDslContent) ⇒ if (t.isLeft) t.left.get.getStartCharIndex 
else t.right.get.startCharIndex
         )
+    }
 
     override def toString(): String = mkString(" ")
     
@@ -244,6 +257,14 @@ class NCProbeSynonym(
                             1
                         else if (!isDirect && that.isDirect)
                             -1
+                        else if (sparse && !that.sparse)
+                            1
+                        else if (!sparse && that.sparse)
+                            -1
+                        else if (permute && !that.permute)
+                            1
+                        else if (!permute && that.permute)
+                            -1
                         else // Both direct or indirect.
                             isTextOnly match {
                                 case true if !that.isTextOnly ⇒ 1
@@ -307,7 +328,8 @@ object NCProbeSynonym {
       * @param isDirect
       * @param value
       * @param chunks
-      * @param perm
+      * @param sparse
+      * @param permute
       */
     def apply(
         isElementId: Boolean,
@@ -315,9 +337,10 @@ object NCProbeSynonym {
         isDirect: Boolean,
         value: String,
         chunks: Seq[NCProbeSynonymChunk],
-        perm: Boolean
+        sparse: Boolean,
+        permute: Boolean
     ): NCProbeSynonym = {
-        var syn = new NCProbeSynonym(isElementId, isValueName, isDirect, 
value, perm)
+        var syn = new NCProbeSynonym(isElementId, isValueName, isDirect, 
value, sparse, permute)
         
         syn ++= chunks
         
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala
index d0be67f..d908b62 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala
@@ -101,10 +101,9 @@ object NCDeployManager extends NCService with 
DecorateAsScala {
     /**
       *
       * @param elmId Element ID.
-      * @param sparse Flag.
       * @param syn Element synonym.
       */
-    case class SynonymHolder(elmId: String, sparse: Boolean, syn: 
NCProbeSynonym)
+    case class SynonymHolder(elmId: String, syn: NCProbeSynonym)
 
     /**
       * Gives a list of JAR files at given path.
@@ -199,7 +198,7 @@ object NCDeployManager extends NCService with 
DecorateAsScala {
         def filterDsl(syns: Set[SynonymHolder], dsl: Boolean): 
Set[SynonymHolder] =
             syns.filter(s ⇒ ok(s.syn.exists(_.kind == IDL), dsl))
         def filterSparse(syns: Set[SynonymHolder], sparse: Boolean): 
Set[SynonymHolder] =
-            syns.filter(s ⇒ ok(s.sparse && s.syn.size > 1, sparse))
+            syns.filter(s ⇒ ok(s.syn.sparse, sparse))
 
         var cnt = 0
         val maxCnt = mdl.getMaxTotalSynonyms
@@ -220,8 +219,8 @@ object NCDeployManager extends NCService with 
DecorateAsScala {
                     s"]"
                 )
 
-            val sparse = elm.isSparse.orElse(mdl.isSparse)
-            val perm = elm.isPermutateSynonyms.orElse(mdl.isPermutateSynonyms)
+            val sparseFlag = elm.isSparse.orElse(mdl.isSparse)
+            val permuteFlag = 
elm.isPermutateSynonyms.orElse(mdl.isPermutateSynonyms)
 
             def addSynonym(
                 isElementId: Boolean,
@@ -229,11 +228,10 @@ object NCDeployManager extends NCService with 
DecorateAsScala {
                 value: String,
                 chunks: Seq[NCProbeSynonymChunk]
             ): Unit = {
-                def add(chunks: Seq[NCProbeSynonymChunk], isDirect: Boolean): 
Unit = {
+                def add(chunks: Seq[NCProbeSynonymChunk], perm: Boolean, 
sparse: Boolean, isDirect: Boolean): Unit = {
                     val holder = SynonymHolder(
                         elmId = elmId,
-                        sparse = sparse,
-                        syn = NCProbeSynonym(isElementId, isValueName, 
isDirect, value, chunks, perm)
+                        syn = NCProbeSynonym(isElementId, isValueName, 
isDirect, value, chunks, sparse, perm)
                     )
 
                     if (syns.add(holder)) {
@@ -266,15 +264,19 @@ object NCDeployManager extends NCService with 
DecorateAsScala {
                         )
                 }
 
+                def hasDsl(chunks: Seq[NCProbeSynonymChunk]) = 
chunks.exists(_.kind == IDL)
+
                 if (
-                    perm &&
-                    !sparse &&
+                    permuteFlag &&
+                    !sparseFlag &&
                     !isElementId &&
                     chunks.forall(_.wordStem != null)
                 )
-                    simplePermute(chunks).map(p ⇒ p.map(_.wordStem) → 
p).toMap.values.foreach(p ⇒ add(p, p == chunks))
+                    simplePermute(chunks).map(p ⇒ p.map(_.wordStem) → 
p).toMap.values.foreach(seq ⇒
+                        add(seq, isDirect = seq == chunks, perm = true, sparse 
= hasDsl(seq))
+                    )
                 else
-                    add(chunks, isDirect = true)
+                    add(chunks, isDirect = true, perm = false, sparse = 
hasDsl(chunks) || (sparseFlag && chunks.size > 1))
             }
 
             /**
@@ -512,9 +514,9 @@ object NCDeployManager extends NCService with 
DecorateAsScala {
             model = mdl,
             solver = solver,
             intents = intents.map(_._1).toSeq,
-            directSynonyms = mkFastAccessMap(filterSparse(notDsl, sparse = 
false), NCProbeSynonymsWrapper(_)),
+            continuousSynonyms = mkFastAccessMap(filterSparse(notDsl, sparse = 
false), NCProbeSynonymsWrapper(_)),
             sparseSynonyms = toMap(filterSparse(notDsl, sparse = true)),
-            synonymsDsl = toMap(filterDsl(syns.toSet, dsl = true)),
+            dslSynonyms = toMap(filterDsl(syns.toSet, dsl = true)),
             addStopWordsStems = addStopWords,
             exclStopWordsStems = exclStopWords,
             suspWordsStems = suspWords,
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
index 80d2d1e..9970e19 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
@@ -58,9 +58,14 @@ object NCModelManager extends NCService with DecorateAsScala 
{
             data.values.foreach(w ⇒ {
                 val mdl = w.model
 
-                val synDirectCnt = 
w.directSynonyms.flatMap(_._2.map(_._2.count)).sum
+                
println("w.directSynonyms="+w.continuousSynonyms.getOrElse("col:orders_shipped_date",
 Map.empty).mkString("\n"))
+                
println("w.sparseSynonyms="+w.sparseSynonyms.getOrElse("col:orders_shipped_date",
 Seq.empty).mkString("\n"))
+                
println("w.synonymsDsl="+w.dslSynonyms.getOrElse("col:orders_shipped_date", 
Seq.empty).mkString("\n"))
+                println
+
+                val synСontCnt = 
w.continuousSynonyms.flatMap(_._2.map(_._2.count)).sum
                 val synSparseCnt = w.sparseSynonyms.map(_._2.size).sum
-                val synDslCnt = w.synonymsDsl.map(_._2.size).sum
+                val synDslCnt = w.dslSynonyms.map(_._2.size).sum
                 val elmCnt = w.elements.keySet.size
                 val intentCnt = w.intents.size
 
@@ -73,7 +78,7 @@ object NCModelManager extends NCService with DecorateAsScala {
                     s"Origin:         ${mdl.getOrigin}",
                     s"Elements:       ${withWarn(elmCnt)}",
                     s"Synonyms:       ${withWarn(elmCnt)}",
-                    s" - Direct:      $synDirectCnt",
+                    s" - Continuous:  $synСontCnt",
                     s" - Sparse:      $synSparseCnt",
                     s" - DSL(Sparse): $synDslCnt",
                     s"Intents:        ${withWarn(intentCnt)}"
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 1da1059..4d78847 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -129,54 +129,6 @@ object NCModelEnricher extends NCProbeEnricher with 
DecorateAsScala {
 
     case class ComplexHolder(complexesWords: Seq[Complex], complexes: 
Seq[ComplexSeq])
 
-    /**
-      * Found-by-synonym model element.
-      *
-      * @param element Element.
-      * @param tokens Element tokens.
-      * @param synonym Synonyms.
-      * @param parts Parts for DSL synonyms.
-      * @param allToksIdxs All tokens indexes (whole tokens slice, has sense 
for sparse tokens)
-      */
-    case class ElementMatch(
-        element: NCElement,
-        tokens: Seq[NlpToken],
-        synonym: Synonym,
-        parts: Seq[TokType],
-        allToksIdxs: Seq[Int]
-    ) extends Ordered[ElementMatch] {
-        // Tokens sparsity.
-        lazy val sparsity: Int = U.calcSparsity(tokens.map(_.index))
-
-        // Number of tokens.
-        lazy val length: Int = tokens.size
-        lazy val tokensSet: Set[NlpToken] = tokens.toSet
-
-        override def compare(that: ElementMatch): Int = {
-            // Check synonym first, then length and then sparsity.
-            // Note that less sparsity means more certainty in a match.
-
-            if (that == null)
-                1
-            else if (synonym < that.synonym)
-                -1
-            else if (synonym > that.synonym)
-                1
-            else if (length < that.length)
-                -1
-            else if (length > that.length)
-                1
-            else if (sparsity < that.sparsity)
-                1
-            else if (sparsity > that.sparsity)
-                -1
-            else
-                0
-        }
-
-        override def toString: String = s"Element=${element.getId}, 
indexes=${tokens.map(_.index).mkString(",")}, synonym=$synonym"
-    }
-
     object State extends Enumeration {
         type State = Value
 
@@ -214,6 +166,7 @@ object NCModelEnricher extends NCProbeEnricher with 
DecorateAsScala {
       * @param metaOpt
       * @param parts
       * @param allToksIdxs
+      * @param continuous
       */
     private def mark(
         ns: NCNlpSentence,
@@ -223,7 +176,8 @@ object NCModelEnricher extends NCProbeEnricher with 
DecorateAsScala {
         syn: Option[Synonym],
         metaOpt: Option[Map[String, Object]],
         parts: Seq[TokType],
-        allToksIdxs: Seq[Int]
+        allToksIdxs: Seq[Int],
+        continuous: java.lang.Boolean
     ): Unit = {
         val params = mutable.ArrayBuffer.empty[(String, AnyRef)]
 
@@ -232,6 +186,7 @@ object NCModelEnricher extends NCProbeEnricher with 
DecorateAsScala {
 
         // Internal usage.
         params += "allToksIndexes" → allToksIdxs.asJava
+        params += "continuous" → continuous
 
         syn match {
             case Some(s) ⇒
@@ -306,7 +261,7 @@ object NCModelEnricher extends NCProbeEnricher with 
DecorateAsScala {
                         override def isEnglish: Boolean = t.isEnglish
                     }
 
-                val res = parser.parse(
+                val parsingRes = parser.parse(
                     req,
                     mdl.model,
                     ns.map(to).asJava,
@@ -323,8 +278,8 @@ object NCModelEnricher extends NCProbeEnricher with 
DecorateAsScala {
                     }).asJava
                 )
 
-                if (res != null)
-                    res.asScala.foreach(e ⇒ {
+                if (parsingRes != null)
+                    parsingRes.asScala.foreach(e ⇒ {
                         val elemId = e.getElementId
                         val words = e.getWords
 
@@ -340,8 +295,12 @@ object NCModelEnricher extends NCProbeEnricher with 
DecorateAsScala {
                             ).getOrElse(throw new AssertionError(s"Custom 
model parser returned an invalid custom token: $w"))
                         )
 
+
                         // Checks element's tokens.
-                        if (!alreadyMarked(matchedToks, elemId))
+                        val idxs = matchedToks.map(_.index)
+                        val continuous = U.isContinuous(idxs.sorted)
+
+                        if (!alreadyMarked(matchedToks, idxs, continuous, 
elemId))
                             mark(
                                 ns,
                                 elem = mdl.elements.getOrElse(elemId, throw 
new NCE(s"Custom model parser returned unknown element ID: $elemId")),
@@ -350,7 +309,8 @@ object NCModelEnricher extends NCProbeEnricher with 
DecorateAsScala {
                                 syn = None,
                                 metaOpt = Some(e.getMetadata.asScala),
                                 parts = Seq.empty,
-                                matchedToks.map(_.index)
+                                idxs,
+                                continuous
                             )
                     })
             }
@@ -386,15 +346,29 @@ object NCModelEnricher extends NCProbeEnricher with 
DecorateAsScala {
       * @param toks
       * @param elemId
       */
-    private def alreadyMarked(toks: Seq[NlpToken], elemId: String): Boolean = {
-        def hasIndex(n: NCNlpSentenceNote): Boolean =
-            n.dataOpt("allToksIndexes").asInstanceOf[Option[JList[Int]]] match 
{
-                case Some(idxs) ⇒ idxs.asScala.containsSlice(toks.map(_.index))
-                case None ⇒ false
-            }
-
-         toks.flatten.exists(n ⇒ n.noteType == elemId && hasIndex(n))
-    }
+    private def alreadyMarked(toks: Seq[NlpToken], allToksIndexes: Seq[Int], 
continuous: Boolean, elemId: String): Boolean =
+         toks.flatten.exists(n ⇒ // True if any note on the given tokens satisfies the checks below.
+             n.noteType == elemId && // Only notes of the same element type are considered.
+             {
+                 val res =
+                 if (n.data("continuous").asInstanceOf[Boolean])
+                     true // A note flagged "continuous" always counts as already marked.
+                 else {
+                     if (continuous)
+                         false // A continuous candidate is never covered by a non-continuous note.
+                     else
+                         
n.data("allToksIndexes").asInstanceOf[JList[Int]].asScala.containsSlice(allToksIndexes)
+                 }
+
+//                 println(s"n=$n")
+//                 println(s"res=$res, continuous=$continuous, 
toksIdxs=${toks.map(_.index)}, all="+n.data("allToksIndexes"))
+//                 println
+
+
+                 res
+
+             }
+         )
 
     /**
       *
@@ -419,13 +393,6 @@ object NCModelEnricher extends NCProbeEnricher with 
DecorateAsScala {
 
     /**
       *
-      */
-    private def mkCache(mdl: NCProbeModel): Cache =
-        mutable.HashMap.empty[String, mutable.ArrayBuffer[Seq[Int]]].empty ++
-            mdl.elements.keys.map(k ⇒ k → mutable.ArrayBuffer.empty[Seq[Int]])
-
-    /**
-      *
       * @param tows
       * @param ns
       */
@@ -438,12 +405,6 @@ object NCModelEnricher extends NCProbeEnricher with 
DecorateAsScala {
 
     /**
       *
-      * @param toks
-      */
-    private def tokString(toks: Seq[NlpToken]): String = toks.map(t ⇒ 
(t.origText, t.index)).mkString(" ")
-
-    /**
-      *
       * @param m
       * @param id
       * @return
@@ -543,41 +504,43 @@ object NCModelEnricher extends NCProbeEnricher with 
DecorateAsScala {
                 val combosToks = combos(ns)
 
                 def go(): Unit = {
-                    val matches = mutable.ArrayBuffer.empty[ElementMatch]
-
-                    val cacheSparse = mkCache(mdl)
-                    val cacheDirect = mkCache(mdl)
-                    val dslCache = mutable.HashSet.empty[Seq[Complex]]
+                    val contCache = mutable.HashMap.empty[String, 
mutable.ArrayBuffer[Seq[Int]]] ++ mdl.elements.keys.map(k ⇒ k → 
mutable.ArrayBuffer.empty[Seq[Int]])
+                    lazy val dslCache = mutable.HashSet.empty[Seq[Complex]]
 
                     var found = false
 
-                    def add(typ: String, elm: NCElement, cache: Cache, res: 
Seq[NlpToken], allToksIdxs: Seq[Int], s: Synonym, parts: Seq[TokType] = 
Seq.empty): Unit = {
-                        var added = false
+                    def add(typ: String, elm: NCElement, res: Seq[NlpToken], 
allToksIdxs: Seq[Int], s: Synonym, parts: Seq[TokType] = Seq.empty): Unit = {
+                        found = true
+                        val resIdxs = res.map(_.index)
 
-                        if (!matchExist(elm.getId, res)) {
-                            matches += ElementMatch(elm, res, s, parts, 
allToksIdxs)
+                        val continuous = U.isContinuous(resIdxs.sorted)
 
-                            added = true
-                        }
+                        if (continuous)
+                            contCache(elm.getId) += allToksIdxs
 
-                        cache(elm.getId) += allToksIdxs
-                        found = true
+                        val added = !alreadyMarked(res, allToksIdxs, 
continuous, elm.getId)
+
+                        if (added) {
+                            val direct = s.isDirect && U.isIncreased(resIdxs)
+
+                            mark(ns, elm, res, direct, syn = Some(s), metaOpt 
= None, parts, allToksIdxs, continuous)
+                        }
 
                         if (DEEP_DEBUG)
-                            logger.trace(
+                            println(
                                 s"Found element [" +
                                     s"id=${elm.getId}, " +
                                     s"type=$typ, " +
-                                    
s"indexes=${res.map(_.index).mkString("|")}, " +
-                                    
s"allTokensIndexes=${allToksIdxs.mkString("|")}, " +
+                                    s"text='${res.map(_.origText).mkString(" 
")}', " +
+                                    s"indexes=${resIdxs.mkString("[", ",", 
"]")}, " +
+                                    
s"allTokensIndexes=${allToksIdxs.mkString("[", ",", "]")}, " +
+                                    s"continuous=$continuous, " +
+                                    s"synonym=$s, " +
                                     s"added=$added" +
                                     s"]"
                             )
                     }
 
-                    def matchExist(elemId: String, toks: Seq[NlpToken]): 
Boolean =
-                        matches.exists(m ⇒ m.element.getId == elemId && 
toks.toSet.subsetOf(m.tokensSet))
-
                     for (toks ← combosToks) {
                         val tokIdxs = toks.map(_.index)
                         lazy val dslCombs: Seq[Seq[Complex]] = 
mkComplexCombinations(h, toks, dslCache.toSet)
@@ -587,12 +550,11 @@ object NCModelEnricher extends NCProbeEnricher with 
DecorateAsScala {
                         for (
                             elm ← mdl.elements.values;
                             elemId = elm.getId;
-                            dirProc = 
cacheDirect(elemId).exists(_.containsSlice(tokIdxs));
-                            sparseProc = 
cacheSparse(elemId).exists(_.containsSlice(tokIdxs))
+                            contProc = 
contCache(elemId).exists(_.containsSlice(tokIdxs))
                             if
-                                (!dirProc || !sparseProc) &&
+                                !contProc &&
                                  // Checks whole tokens slice.
-                                !alreadyMarked(toks, elemId) && 
!matchExist(elemId, toks)
+                                !alreadyMarked(toks, tokIdxs, continuous = 
true, elemId)
                         ) {
                             // 1. SIMPLE.
                             found = false
@@ -605,19 +567,19 @@ object NCModelEnricher extends NCProbeEnricher with 
DecorateAsScala {
                                 }
 
                             // 1.1 Direct.
-                            if (simpleEnabled && !dirProc && !found)
-                                fastAccess(mdl.directSynonyms, elemId, 
toks.length) match {
+                            if (simpleEnabled && !found)
+                                fastAccess(mdl.continuousSynonyms, elemId, 
toks.length) match {
                                     case Some(h) ⇒
                                         def tryMap(syns: Map[String, Synonym], 
notFound: () ⇒ Unit): Unit =
                                             syns.get(tokStems) match {
-                                                case Some(s) ⇒ add("direct 
simple", elm, cacheDirect, toks, tokIdxs, s)
+                                                case Some(s) ⇒ add("direct 
simple", elm, toks, tokIdxs, s)
                                                 case None ⇒ notFound()
                                             }
 
                                         def tryScan(syns: Seq[Synonym]): Unit =
                                             for (s ← syns if !found)
                                                 if (s.isMatch(toks))
-                                                    add("scan simple", elm, 
cacheDirect, toks, tokIdxs, s)
+                                                    add("scan simple", elm, 
toks, tokIdxs, s)
 
                                         tryMap(
                                             h.txtDirectSynonyms,
@@ -632,76 +594,42 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                                 }
 
                             // 1.2 Sparse.
-                            if (simpleEnabled && !sparseProc && !found)
+                            if (simpleEnabled && !found)
                                 for (s ← get(mdl.sparseSynonyms, elemId) if !found)
                                     s.trySparseMatch(toks) match {
-                                        case Some(res) ⇒ add("sparse simple", elm, cacheSparse, res, tokIdxs, s)
+                                        case Some(res) ⇒ add("sparse simple", elm, res, tokIdxs, s)
                                         case None ⇒ // No-op.
                                     }
 
                             // 2. DSL.
-                            if (state != SIMPLE && mdl.synonymsDsl.nonEmpty) {
+                            if (state != SIMPLE && mdl.dslSynonyms.nonEmpty) {
                                 found = false
 
                                 // 2.1 Sparse.
-                                if (mdl.hasDslSynonyms) {
-                                    if (!sparseProc)
-                                        for (s ← get(mdl.synonymsDsl, elemId); comb ← dslCombs if !found)
-                                            s.trySparseMatch(comb.map(_.data), req) match {
-                                                case Some(res) ⇒
-                                                    add("sparse DSL", elm, cacheSparse, toTokens(res, ns), tokIdxs, s, toParts(res, s))
-                                                    dslCache += comb
-                                                case None ⇒ // No-op.
-                                            }
-                                }
-                                // 2.2 Direct.
-                                else {
-                                    if (!dirProc)
-                                        for (s ← get(mdl.synonymsDsl, elemId); comb ← dslCombs if !found)
-                                            if (s.isMatch(comb.map(_.data), req)) {
-                                                add("direct DSL", elm, cacheDirect, toks, tokIdxs, s, toPartsComplex(comb, s))
+                                if (mdl.hasDslSynonyms)
+                                    for (s ← get(mdl.dslSynonyms, elemId); comb ← dslCombs if !found)
+                                        s.trySparseMatch(comb.map(_.data), req) match {
+                                            case Some(res) ⇒
+                                                add("sparse DSL", elm, toTokens(res, ns), tokIdxs, s, toParts(res, s))
                                                 dslCache += comb
-                                            }
-                                }
+                                            case None ⇒ // No-op.
+                                        }
+                                // 2.2 Direct.
+                                else
+                                    for (s ← get(mdl.dslSynonyms, elemId); comb ← dslCombs if !found)
+                                        if (s.isMatch(comb.map(_.data), req)) {
+                                            add("direct DSL", elm, toks, tokIdxs, s, toPartsComplex(comb, s))
+                                            dslCache += comb
+                                        }
                             }
                         }
                     }
 
-                    for ((m, idx) ← matches.zipWithIndex) {
-                        if (DEEP_DEBUG)
-                            logger.trace(
-                                s"Model '$mdlId' element found (${idx + 1} of ${matches.size}) [" +
-                                    s"elementId=${m.element.getId}, " +
-                                    s"synonym=${m.synonym}, " +
-                                    s"tokens=${tokString(m.tokens)}" +
-                                    s"]"
-                            )
-
-                        val tokIdxs = m.tokens.map(_.index)
-                        val direct = m.synonym.isDirect && !tokIdxs.zip(tokIdxs.tail).exists { case (x, y) ⇒ x > y }
-
-                        var added = false
-
-                        // Checks element's tokens.
-                        if (!alreadyMarked(m.tokens, m.element.getId)) {
-                            mark(ns, m.element, m.tokens, direct, syn = Some(m.synonym), metaOpt = None, m.parts, m.allToksIdxs)
 
-                            added = true
-                        }
-
-                        if (DEEP_DEBUG)
-                            logger.trace(
-                                s"Element ${if (added) "added" else "skipped"} [" +
-                                    s"id=${m.element.getId}, " +
-                                    s"indexes=${m.tokens.map(_.index).mkString("|")}, " +
-                                    s"allTokensIndexes=${m.allToksIdxs.mkString("|")}, " +
-                                    s"]"
-                            )
-                    }
                 }
 
                 if (DEEP_DEBUG)
-                    logger.trace(s"Exexucution started with state: $state")
+                    println(s"Execution started with state: $state.")
 
                 go()
 
@@ -716,5 +644,5 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
         }
     }
 
-    def isComplex(mdl: NCProbeModel): Boolean = mdl.synonymsDsl.nonEmpty || !mdl.model.getParsers.isEmpty
+    def isComplex(mdl: NCProbeModel): Boolean = mdl.dslSynonyms.nonEmpty || !mdl.model.getParsers.isEmpty
 }
\ No newline at end of file

[incubator-nlpcraft] 02/08: WIP.

Reply via email to