This is an automated email from the ASF dual-hosted git repository. sergeykamov pushed a commit to branch NLPCRAFT-287 in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 8c2f76d06cd2fad51507eb51d92d5aed20ea96d8 Author: Sergey Kamov <[email protected]> AuthorDate: Mon Apr 12 18:52:15 2021 +0300 WIP. --- .../scala/org/apache/nlpcraft/common/package.scala | 2 +- .../org/apache/nlpcraft/common/util/NCUtils.scala | 42 +++- .../apache/nlpcraft/probe/mgrs/NCProbeModel.scala | 12 +- .../nlpcraft/probe/mgrs/NCProbeSynonym.scala | 41 +++- .../probe/mgrs/deploy/NCDeployManager.scala | 30 +-- .../nlpcraft/probe/mgrs/model/NCModelManager.scala | 11 +- .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 240 ++++++++------------- 7 files changed, 184 insertions(+), 194 deletions(-) diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/package.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/package.scala index c4d8bad..74a0e3e 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/package.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/package.scala @@ -36,7 +36,7 @@ package object common { final val U = NCUtils // Internal deep debug flag (more verbose tracing). - final val DEEP_DEBUG = false + final val DEEP_DEBUG = true // Model and token **internal** metadata keys. final val TOK_META_ALIASES_KEY = "__NLPCRAFT_TOK_META_ALIASES" diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/util/NCUtils.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/util/NCUtils.scala index 141e813..13a1c89 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/util/NCUtils.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/util/NCUtils.scala @@ -1424,12 +1424,14 @@ object NCUtils extends LazyLogging { * @param e */ def prettyError(logger: Logger, title: String, e: Throwable): Unit = { - // Keep the full trace in the 'trace' log level. - logger.trace(title, e) + e.printStackTrace() - prettyErrorImpl(new PrettyErrorLogger { - override def log(s: String): Unit = logger.error(s) - }, title, e) + // Keep the full trace in the 'trace' log level. +// logger.trace(title, e) +// +// prettyErrorImpl(new PrettyErrorLogger { +// override def log(s: String): Unit = logger.error(s) +// }, title, e) } /** @@ -2122,4 +2124,34 @@ object NCUtils extends LazyLogging { case Nil ⇒ List(Nil) case head :: tail ⇒ for (h ← head; t ← permute(tail)) yield h :: t } + + /** + * + * @param idxs + * @return + */ + def isContinuous(idxs: Seq[Int]): Boolean = { + require(idxs.nonEmpty) + + idxs.size match { + case 0 ⇒ throw new AssertionError() + case 1 ⇒ true + case _ ⇒ idxs.zip(idxs.tail).forall { case (x, y) ⇒ x + 1 == y } + } + } + + /** + * + * @param idxs + * @return + */ + def isIncreased(idxs: Seq[Int]): Boolean = { + require(idxs.nonEmpty) + + idxs.size match { + case 0 ⇒ throw new AssertionError() + case 1 ⇒ true + case _ ⇒ !idxs.zip(idxs.tail).exists { case (x, y) ⇒ x > y } + } + } } diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala index 31fa627..1618421 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala @@ -28,9 +28,9 @@ import scala.collection.{Map, Seq} * @param model * @param solver * @param intents - * @param directSynonyms + * @param continuousSynonyms * @param sparseSynonyms - * @param synonymsDsl + * @param dslSynonyms * @param exclStopWordsStems * @param suspWordsStems * @param elements @@ -39,15 +39,15 @@ case class NCProbeModel( model: NCModel, solver: NCIntentSolver, intents: Seq[NCIdlIntent], - directSynonyms: Map[String /*Element ID*/ , Map[Int /*Synonym length*/ , NCProbeSynonymsWrapper]], // Fast access map. + continuousSynonyms: Map[String /*Element ID*/ , Map[Int /*Synonym length*/ , NCProbeSynonymsWrapper]], // Fast access map. sparseSynonyms: Map[String /*Element ID*/, Seq[NCProbeSynonym]], - synonymsDsl: Map[String /*Element ID*/ , Seq[NCProbeSynonym]], // Fast access map. + dslSynonyms: Map[String /*Element ID*/ , Seq[NCProbeSynonym]], // Fast access map. addStopWordsStems: Set[String], exclStopWordsStems: Set[String], suspWordsStems: Set[String], elements: Map[String /*Element ID*/ , NCElement], samples: Set[(String, Seq[Seq[String]])] ) { - def hasDslSynonyms(elemId: String): Boolean = synonymsDsl.contains(elemId) - def hasDslSynonyms: Boolean = synonymsDsl.nonEmpty + def hasDslSynonyms(elemId: String): Boolean = dslSynonyms.contains(elemId) + def hasDslSynonyms: Boolean = dslSynonyms.nonEmpty } diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala index b246cac..bc41b96 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala @@ -33,18 +33,20 @@ import scala.collection.mutable * In this case chunks contain value name. * @param isDirect Direct or permuted synonym flag. * @param value Optional value name if this is a value synonym. - * @param perm Flag. + * @param sparse Flag. + * @param permute Flag. */ class NCProbeSynonym( val isElementId: Boolean, val isValueName: Boolean, val isDirect: Boolean, val value: String = null, - val perm: Boolean + val sparse: Boolean, + val permute: Boolean ) extends mutable.ArrayBuffer[NCProbeSynonymChunk] with Ordered[NCProbeSynonym] { require((isElementId && !isValueName && value == null) || !isElementId) require((isValueName && value != null) || !isValueName) - + lazy val isTextOnly: Boolean = forall(_.kind == TEXT) lazy val regexChunks: Int = count(_.kind == REGEX) lazy val dslChunks: Int = count(_.kind == IDL) @@ -109,7 +111,7 @@ class NCProbeSynonym( if (seq.nonEmpty) { val head = seq.head - if (!perm && res.nonEmpty && getIndex(head) <= getIndex(res.last)) + if (!permute && res.nonEmpty && getIndex(head) <= getIndex(res.last)) state = -1 else { all ++= seq @@ -164,6 +166,7 @@ class NCProbeSynonym( */ def isMatch(toks: NCNlpSentenceTokenBuffer): Boolean = { require(toks != null) + require(!sparse) if (toks.length == length) { if (isTextOnly) @@ -180,12 +183,16 @@ class NCProbeSynonym( * @param toks * @return */ - def trySparseMatch(toks: NCNlpSentenceTokenBuffer): Option[Seq[NCNlpSentenceToken]] = + def trySparseMatch(toks: NCNlpSentenceTokenBuffer): Option[Seq[NCNlpSentenceToken]] = { + require(toks != null) + require(sparse, s"Unexpected call on: $this") + trySparseMatch0( toks, isMatch, (t: NCNlpSentenceToken) ⇒ t.startCharIndex ) + } /** * @@ -195,6 +202,7 @@ class NCProbeSynonym( */ def isMatch(tows: Seq[NCDslContent], req: NCRequest): Boolean = { require(tows != null) + require(!sparse) if (tows.length == length && tows.count(_.isLeft) >= dslChunks) tows.zip(this).sortBy(p ⇒ getSort(p._2.kind)).forall { case (tow, chunk) ⇒ isMatch(tow, chunk, req) } @@ -207,12 +215,17 @@ class NCProbeSynonym( * @param tows * @param req */ - def trySparseMatch(tows: Seq[NCDslContent], req: NCRequest): Option[Seq[NCDslContent]] = + def trySparseMatch(tows: Seq[NCDslContent], req: NCRequest): Option[Seq[NCDslContent]] = { + require(tows != null) + require(req != null) + require(sparse, s"Unexpected call on: $this") + trySparseMatch0( tows, (t: NCDslContent, chunk: NCProbeSynonymChunk) ⇒ isMatch(t, chunk, req), (t: NCDslContent) ⇒ if (t.isLeft) t.left.get.getStartCharIndex else t.right.get.startCharIndex ) + } override def toString(): String = mkString(" ") @@ -244,6 +257,14 @@ class NCProbeSynonym( 1 else if (!isDirect && that.isDirect) -1 + else if (sparse && !that.sparse) + 1 + else if (!sparse && that.sparse) + -1 + else if (permute && !that.permute) + 1 + else if (!permute && that.permute) + -1 else // Both direct or indirect. isTextOnly match { case true if !that.isTextOnly ⇒ 1 @@ -307,7 +328,8 @@ object NCProbeSynonym { * @param isDirect * @param value * @param chunks - * @param perm + * @param sparse + * @param permute */ def apply( isElementId: Boolean, @@ -315,9 +337,10 @@ object NCProbeSynonym { isDirect: Boolean, value: String, chunks: Seq[NCProbeSynonymChunk], - perm: Boolean + sparse: Boolean, + permute: Boolean ): NCProbeSynonym = { - var syn = new NCProbeSynonym(isElementId, isValueName, isDirect, value, perm) + var syn = new NCProbeSynonym(isElementId, isValueName, isDirect, value, sparse, permute) syn ++= chunks diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala index d0be67f..d908b62 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala @@ -101,10 +101,9 @@ object NCDeployManager extends NCService with DecorateAsScala { /** * * @param elmId Element ID. - * @param sparse Flag. * @param syn Element synonym. */ - case class SynonymHolder(elmId: String, sparse: Boolean, syn: NCProbeSynonym) + case class SynonymHolder(elmId: String, syn: NCProbeSynonym) /** * Gives a list of JAR files at given path. @@ -199,7 +198,7 @@ object NCDeployManager extends NCService with DecorateAsScala { def filterDsl(syns: Set[SynonymHolder], dsl: Boolean): Set[SynonymHolder] = syns.filter(s ⇒ ok(s.syn.exists(_.kind == IDL), dsl)) def filterSparse(syns: Set[SynonymHolder], sparse: Boolean): Set[SynonymHolder] = - syns.filter(s ⇒ ok(s.sparse && s.syn.size > 1, sparse)) + syns.filter(s ⇒ ok(s.syn.sparse, sparse)) var cnt = 0 val maxCnt = mdl.getMaxTotalSynonyms @@ -220,8 +219,8 @@ object NCDeployManager extends NCService with DecorateAsScala { s"]" ) - val sparse = elm.isSparse.orElse(mdl.isSparse) - val perm = elm.isPermutateSynonyms.orElse(mdl.isPermutateSynonyms) + val sparseFlag = elm.isSparse.orElse(mdl.isSparse) + val permuteFlag = elm.isPermutateSynonyms.orElse(mdl.isPermutateSynonyms) def addSynonym( isElementId: Boolean, @@ -229,11 +228,10 @@ object NCDeployManager extends NCService with DecorateAsScala { value: String, chunks: Seq[NCProbeSynonymChunk] ): Unit = { - def add(chunks: Seq[NCProbeSynonymChunk], isDirect: Boolean): Unit = { + def add(chunks: Seq[NCProbeSynonymChunk], perm: Boolean, sparse: Boolean, isDirect: Boolean): Unit = { val holder = SynonymHolder( elmId = elmId, - sparse = sparse, - syn = NCProbeSynonym(isElementId, isValueName, isDirect, value, chunks, perm) + syn = NCProbeSynonym(isElementId, isValueName, isDirect, value, chunks, sparse, perm) ) if (syns.add(holder)) { @@ -266,15 +264,19 @@ object NCDeployManager extends NCService with DecorateAsScala { ) } + def hasDsl(chunks: Seq[NCProbeSynonymChunk]) = chunks.exists(_.kind == IDL) + if ( - perm && - !sparse && + permuteFlag && + !sparseFlag && !isElementId && chunks.forall(_.wordStem != null) ) - simplePermute(chunks).map(p ⇒ p.map(_.wordStem) → p).toMap.values.foreach(p ⇒ add(p, p == chunks)) + simplePermute(chunks).map(p ⇒ p.map(_.wordStem) → p).toMap.values.foreach(seq ⇒ + add(seq, isDirect = seq == chunks, perm = true, sparse = hasDsl(seq)) + ) else - add(chunks, isDirect = true) + add(chunks, isDirect = true, perm = false, sparse = hasDsl(chunks) || (sparseFlag && chunks.size > 1)) } /** @@ -512,9 +514,9 @@ object NCDeployManager extends NCService with DecorateAsScala { model = mdl, solver = solver, intents = intents.map(_._1).toSeq, - directSynonyms = mkFastAccessMap(filterSparse(notDsl, sparse = false), NCProbeSynonymsWrapper(_)), + continuousSynonyms = mkFastAccessMap(filterSparse(notDsl, sparse = false), NCProbeSynonymsWrapper(_)), sparseSynonyms = toMap(filterSparse(notDsl, sparse = true)), - synonymsDsl = toMap(filterDsl(syns.toSet, dsl = true)), + dslSynonyms = toMap(filterDsl(syns.toSet, dsl = true)), addStopWordsStems = addStopWords, exclStopWordsStems = exclStopWords, suspWordsStems = suspWords, diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala index 80d2d1e..9970e19 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala @@ -58,9 +58,14 @@ object NCModelManager extends NCService with DecorateAsScala { data.values.foreach(w ⇒ { val mdl = w.model - val synDirectCnt = w.directSynonyms.flatMap(_._2.map(_._2.count)).sum + println("w.directSynonyms="+w.continuousSynonyms.getOrElse("col:orders_shipped_date", Map.empty).mkString("\n")) + println("w.sparseSynonyms="+w.sparseSynonyms.getOrElse("col:orders_shipped_date", Seq.empty).mkString("\n")) + println("w.synonymsDsl="+w.dslSynonyms.getOrElse("col:orders_shipped_date", Seq.empty).mkString("\n")) + println + + val synСontCnt = w.continuousSynonyms.flatMap(_._2.map(_._2.count)).sum val synSparseCnt = w.sparseSynonyms.map(_._2.size).sum - val synDslCnt = w.synonymsDsl.map(_._2.size).sum + val synDslCnt = w.dslSynonyms.map(_._2.size).sum val elmCnt = w.elements.keySet.size val intentCnt = w.intents.size @@ -73,7 +78,7 @@ object NCModelManager extends NCService with DecorateAsScala { s"Origin: ${mdl.getOrigin}", s"Elements: ${withWarn(elmCnt)}", s"Synonyms: ${withWarn(elmCnt)}", - s" - Direct: $synDirectCnt", + s" - Continuous: $synСontCnt", s" - Sparse: $synSparseCnt", s" - DSL(Sparse): $synDslCnt", s"Intents: ${withWarn(intentCnt)}" diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala index 1da1059..4d78847 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala @@ -129,54 +129,6 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { case class ComplexHolder(complexesWords: Seq[Complex], complexes: Seq[ComplexSeq]) - /** - * Found-by-synonym model element. - * - * @param element Element. - * @param tokens Element tokens. - * @param synonym Synonyms. - * @param parts Parts for DSL synonyms. - * @param allToksIdxs All tokens indexes (whole tokens slice, has sense for sparse tokens) - */ - case class ElementMatch( - element: NCElement, - tokens: Seq[NlpToken], - synonym: Synonym, - parts: Seq[TokType], - allToksIdxs: Seq[Int] - ) extends Ordered[ElementMatch] { - // Tokens sparsity. - lazy val sparsity: Int = U.calcSparsity(tokens.map(_.index)) - - // Number of tokens. - lazy val length: Int = tokens.size - lazy val tokensSet: Set[NlpToken] = tokens.toSet - - override def compare(that: ElementMatch): Int = { - // Check synonym first, then length and then sparsity. - // Note that less sparsity means more certainty in a match. - - if (that == null) - 1 - else if (synonym < that.synonym) - -1 - else if (synonym > that.synonym) - 1 - else if (length < that.length) - -1 - else if (length > that.length) - 1 - else if (sparsity < that.sparsity) - 1 - else if (sparsity > that.sparsity) - -1 - else - 0 - } - - override def toString: String = s"Element=${element.getId}, indexes=${tokens.map(_.index).mkString(",")}, synonym=$synonym" - } - object State extends Enumeration { type State = Value @@ -214,6 +166,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { * @param metaOpt * @param parts * @param allToksIdxs + * @param continuous */ private def mark( ns: NCNlpSentence, @@ -223,7 +176,8 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { syn: Option[Synonym], metaOpt: Option[Map[String, Object]], parts: Seq[TokType], - allToksIdxs: Seq[Int] + allToksIdxs: Seq[Int], + continuous: java.lang.Boolean ): Unit = { val params = mutable.ArrayBuffer.empty[(String, AnyRef)] @@ -232,6 +186,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { // Internal usage. params += "allToksIndexes" → allToksIdxs.asJava + params += "continuous" → continuous syn match { case Some(s) ⇒ @@ -306,7 +261,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { override def isEnglish: Boolean = t.isEnglish } - val res = parser.parse( + val parsingRes = parser.parse( req, mdl.model, ns.map(to).asJava, @@ -323,8 +278,8 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { }).asJava ) - if (res != null) - res.asScala.foreach(e ⇒ { + if (parsingRes != null) + parsingRes.asScala.foreach(e ⇒ { val elemId = e.getElementId val words = e.getWords @@ -340,8 +295,12 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { ).getOrElse(throw new AssertionError(s"Custom model parser returned an invalid custom token: $w")) ) + // Checks element's tokens. - if (!alreadyMarked(matchedToks, elemId)) + val idxs = matchedToks.map(_.index) + val continuous = U.isContinuous(idxs.sorted) + + if (!alreadyMarked(matchedToks, idxs, continuous, elemId)) mark( ns, elem = mdl.elements.getOrElse(elemId, throw new NCE(s"Custom model parser returned unknown element ID: $elemId")), @@ -350,7 +309,8 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { syn = None, metaOpt = Some(e.getMetadata.asScala), parts = Seq.empty, - matchedToks.map(_.index) + idxs, + continuous ) }) } @@ -386,15 +346,29 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { * @param toks * @param elemId */ - private def alreadyMarked(toks: Seq[NlpToken], elemId: String): Boolean = { - def hasIndex(n: NCNlpSentenceNote): Boolean = - n.dataOpt("allToksIndexes").asInstanceOf[Option[JList[Int]]] match { - case Some(idxs) ⇒ idxs.asScala.containsSlice(toks.map(_.index)) - case None ⇒ false - } - - toks.flatten.exists(n ⇒ n.noteType == elemId && hasIndex(n)) - } + private def alreadyMarked(toks: Seq[NlpToken], allToksIndexes: Seq[Int], continuous: Boolean, elemId: String): Boolean = + toks.flatten.exists(n ⇒ + n.noteType == elemId && + { + val res = + if (n.data("continuous").asInstanceOf[Boolean]) + true + else { + if (continuous) + false + else + n.data("allToksIndexes").asInstanceOf[JList[Int]].asScala.containsSlice(allToksIndexes) + } + +// println(s"n=$n") +// println(s"res=$res, continuous=$continuous, toksIdxs=${toks.map(_.index)}, all="+n.data("allToksIndexes")) +// println + + + res + + } + ) /** * @@ -419,13 +393,6 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { /** * - */ - private def mkCache(mdl: NCProbeModel): Cache = - mutable.HashMap.empty[String, mutable.ArrayBuffer[Seq[Int]]].empty ++ - mdl.elements.keys.map(k ⇒ k → mutable.ArrayBuffer.empty[Seq[Int]]) - - /** - * * @param tows * @param ns */ @@ -438,12 +405,6 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { /** * - * @param toks - */ - private def tokString(toks: Seq[NlpToken]): String = toks.map(t ⇒ (t.origText, t.index)).mkString(" ") - - /** - * * @param m * @param id * @return @@ -543,41 +504,43 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { val combosToks = combos(ns) def go(): Unit = { - val matches = mutable.ArrayBuffer.empty[ElementMatch] - - val cacheSparse = mkCache(mdl) - val cacheDirect = mkCache(mdl) - val dslCache = mutable.HashSet.empty[Seq[Complex]] + val contCache = mutable.HashMap.empty[String, mutable.ArrayBuffer[Seq[Int]]] ++ mdl.elements.keys.map(k ⇒ k → mutable.ArrayBuffer.empty[Seq[Int]]) + lazy val dslCache = mutable.HashSet.empty[Seq[Complex]] var found = false - def add(typ: String, elm: NCElement, cache: Cache, res: Seq[NlpToken], allToksIdxs: Seq[Int], s: Synonym, parts: Seq[TokType] = Seq.empty): Unit = { - var added = false + def add(typ: String, elm: NCElement, res: Seq[NlpToken], allToksIdxs: Seq[Int], s: Synonym, parts: Seq[TokType] = Seq.empty): Unit = { + found = true + val resIdxs = res.map(_.index) - if (!matchExist(elm.getId, res)) { - matches += ElementMatch(elm, res, s, parts, allToksIdxs) + val continuous = U.isContinuous(resIdxs.sorted) - added = true - } + if (continuous) + contCache(elm.getId) += allToksIdxs - cache(elm.getId) += allToksIdxs - found = true + val added = !alreadyMarked(res, allToksIdxs, continuous, elm.getId) + + if (added) { + val direct = s.isDirect && U.isIncreased(resIdxs) + + mark(ns, elm, res, direct, syn = Some(s), metaOpt = None, parts, allToksIdxs, continuous) + } if (DEEP_DEBUG) - logger.trace( + println( s"Found element [" + s"id=${elm.getId}, " + s"type=$typ, " + - s"indexes=${res.map(_.index).mkString("|")}, " + - s"allTokensIndexes=${allToksIdxs.mkString("|")}, " + + s"text='${res.map(_.origText).mkString(" ")}', " + + s"indexes=${resIdxs.mkString("[", ",", "]")}, " + + s"allTokensIndexes=${allToksIdxs.mkString("[", ",", "]")}, " + + s"continuous=$continuous, " + + s"synonym=$s, " + s"added=$added" + s"]" ) } - def matchExist(elemId: String, toks: Seq[NlpToken]): Boolean = - matches.exists(m ⇒ m.element.getId == elemId && toks.toSet.subsetOf(m.tokensSet)) - for (toks ← combosToks) { val tokIdxs = toks.map(_.index) lazy val dslCombs: Seq[Seq[Complex]] = mkComplexCombinations(h, toks, dslCache.toSet) @@ -587,12 +550,11 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { for ( elm ← mdl.elements.values; elemId = elm.getId; - dirProc = cacheDirect(elemId).exists(_.containsSlice(tokIdxs)); - sparseProc = cacheSparse(elemId).exists(_.containsSlice(tokIdxs)) + contProc = contCache(elemId).exists(_.containsSlice(tokIdxs)) if - (!dirProc || !sparseProc) && + !contProc && // Checks whole tokens slice. - !alreadyMarked(toks, elemId) && !matchExist(elemId, toks) + !alreadyMarked(toks, tokIdxs, continuous = true, elemId) ) { // 1. SIMPLE. found = false @@ -605,19 +567,19 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { } // 1.1 Direct. - if (simpleEnabled && !dirProc && !found) - fastAccess(mdl.directSynonyms, elemId, toks.length) match { + if (simpleEnabled && !found) + fastAccess(mdl.continuousSynonyms, elemId, toks.length) match { case Some(h) ⇒ def tryMap(syns: Map[String, Synonym], notFound: () ⇒ Unit): Unit = syns.get(tokStems) match { - case Some(s) ⇒ add("direct simple", elm, cacheDirect, toks, tokIdxs, s) + case Some(s) ⇒ add("direct simple", elm, toks, tokIdxs, s) case None ⇒ notFound() } def tryScan(syns: Seq[Synonym]): Unit = for (s ← syns if !found) if (s.isMatch(toks)) - add("scan simple", elm, cacheDirect, toks, tokIdxs, s) + add("scan simple", elm, toks, tokIdxs, s) tryMap( h.txtDirectSynonyms, @@ -632,76 +594,42 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { } // 1.2 Sparse. - if (simpleEnabled && !sparseProc && !found) + if (simpleEnabled && !found) for (s ← get(mdl.sparseSynonyms, elemId) if !found) s.trySparseMatch(toks) match { - case Some(res) ⇒ add("sparse simple", elm, cacheSparse, res, tokIdxs, s) + case Some(res) ⇒ add("sparse simple", elm, res, tokIdxs, s) case None ⇒ // No-op. } // 2. DSL. - if (state != SIMPLE && mdl.synonymsDsl.nonEmpty) { + if (state != SIMPLE && mdl.dslSynonyms.nonEmpty) { found = false // 2.1 Sparse. - if (mdl.hasDslSynonyms) { - if (!sparseProc) - for (s ← get(mdl.synonymsDsl, elemId); comb ← dslCombs if !found) - s.trySparseMatch(comb.map(_.data), req) match { - case Some(res) ⇒ - add("sparse DSL", elm, cacheSparse, toTokens(res, ns), tokIdxs, s, toParts(res, s)) - dslCache += comb - case None ⇒ // No-op. - } - } - // 2.2 Direct. - else { - if (!dirProc) - for (s ← get(mdl.synonymsDsl, elemId); comb ← dslCombs if !found) - if (s.isMatch(comb.map(_.data), req)) { - add("direct DSL", elm, cacheDirect, toks, tokIdxs, s, toPartsComplex(comb, s)) + if (mdl.hasDslSynonyms) + for (s ← get(mdl.dslSynonyms, elemId); comb ← dslCombs if !found) + s.trySparseMatch(comb.map(_.data), req) match { + case Some(res) ⇒ + add("sparse DSL", elm, toTokens(res, ns), tokIdxs, s, toParts(res, s)) dslCache += comb - } - } + case None ⇒ // No-op. + } + // 2.2 Direct. + else + for (s ← get(mdl.dslSynonyms, elemId); comb ← dslCombs if !found) + if (s.isMatch(comb.map(_.data), req)) { + add("direct DSL", elm, toks, tokIdxs, s, toPartsComplex(comb, s)) + dslCache += comb + } } } } - for ((m, idx) ← matches.zipWithIndex) { - if (DEEP_DEBUG) - logger.trace( - s"Model '$mdlId' element found (${idx + 1} of ${matches.size}) [" + - s"elementId=${m.element.getId}, " + - s"synonym=${m.synonym}, " + - s"tokens=${tokString(m.tokens)}" + - s"]" - ) - - val tokIdxs = m.tokens.map(_.index) - val direct = m.synonym.isDirect && !tokIdxs.zip(tokIdxs.tail).exists { case (x, y) ⇒ x > y } - - var added = false - - // Checks element's tokens. - if (!alreadyMarked(m.tokens, m.element.getId)) { - mark(ns, m.element, m.tokens, direct, syn = Some(m.synonym), metaOpt = None, m.parts, m.allToksIdxs) - added = true - } - - if (DEEP_DEBUG) - logger.trace( - s"Element ${if (added) "added" else "skipped"} [" + - s"id=${m.element.getId}, " + - s"indexes=${m.tokens.map(_.index).mkString("|")}, " + - s"allTokensIndexes=${m.allToksIdxs.mkString("|")}, " + - s"]" - ) - } } if (DEEP_DEBUG) - logger.trace(s"Exexucution started with state: $state") + println(s"Execution started with state: $state.") go() @@ -716,5 +644,5 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { } } - def isComplex(mdl: NCProbeModel): Boolean = mdl.synonymsDsl.nonEmpty || !mdl.model.getParsers.isEmpty + def isComplex(mdl: NCProbeModel): Boolean = mdl.dslSynonyms.nonEmpty || !mdl.model.getParsers.isEmpty } \ No newline at end of file
