This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-41-1
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit d87440edb71a370fca153e93d6caa0e74c8abd88 Author: Sergey Kamov <[email protected]> AuthorDate: Wed Sep 9 13:47:14 2020 +0300 WIP. --- .../nlpcraft/model/impl/NCModelWrapper.scala | 160 +++-- .../apache/nlpcraft/model/impl/NCTokenImpl.scala | 7 +- .../nlpcraft/probe/mgrs/NCModelDecorator.scala | 117 ---- .../probe/mgrs/conn/NCConnectionManager.scala | 4 +- .../probe/mgrs/deploy/NCDeployManager.scala | 637 +++++++++++++++++++- .../inspections/inspectors/NCProbeInspection.scala | 2 +- .../nlpcraft/probe/mgrs/model/NCModelManager.scala | 666 +-------------------- .../nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala | 4 +- .../probe/mgrs/nlp/NCProbeEnrichmentManager.scala | 40 +- .../dictionary/NCDictionaryEnricher.scala | 6 +- .../mgrs/nlp/enrichers/limit/NCLimitEnricher.scala | 6 +- .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 21 +- .../enrichers/relation/NCRelationEnricher.scala | 6 +- .../mgrs/nlp/enrichers/sort/NCSortEnricher.scala | 6 +- .../enrichers/stopword/NCStopWordEnricher.scala | 10 +- .../suspicious/NCSuspiciousNounsEnricher.scala | 6 +- .../mgrs/nlp/validate/NCValidateManager.scala | 40 +- 17 files changed, 823 insertions(+), 915 deletions(-) diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCModelWrapper.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCModelWrapper.scala index f1f0eb5..c356f90 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCModelWrapper.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCModelWrapper.scala @@ -17,53 +17,135 @@ package org.apache.nlpcraft.model.impl +import java.io.Serializable +import java.util + +import org.apache.nlpcraft.common.TOK_META_ALIASES_KEY +import org.apache.nlpcraft.common.nlp.NCNlpSentence import org.apache.nlpcraft.model.intent.impl.NCIntentSolver -import org.apache.nlpcraft.model.{NCContext, NCIntentMatch, NCModel, NCRejection, NCResult, NCVariant} +import org.apache.nlpcraft.model.{NCContext, NCCustomParser, NCElement, NCIntentMatch, NCModel, NCRejection, NCResult, NCVariant} +import org.apache.nlpcraft.probe.mgrs.NCSynonym + +import scala.collection.JavaConverters._ +import scala.collection.{Seq, mutable} /** - * Internal model implementation combining model and intent solver. - * - * @param proxy Mandatory model proxy. - * @param solver Optional solver. - */ -class NCModelWrapper(val proxy: NCModel, val solver: NCIntentSolver) extends NCModel { + * + * @param proxy + * @param solver + * @param syns + * @param synsDsl + * @param addStopWordsStems + * @param exclStopWordsStems + * @param suspWordsStems + * @param elms + */ +case class NCModelWrapper( + proxy: NCModel, + solver: NCIntentSolver, + syns: Map[String/*Element ID*/, Map[Int/*Synonym length*/, Seq[NCSynonym]]], // Fast access map. + synsDsl: Map[String/*Element ID*/, Map[Int/*Synonym length*/, Seq[NCSynonym]]], // Fast access map. 
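    // Editor's note on the two maps above: synonyms are keyed first by element ID and
    // then by synonym length (in words), so enrichers only scan synonyms that can
    // possibly match a token span of a given size. E.g. (hypothetical element ID):
    //     syns("x:elem")(2)   // all 2-word synonyms of element "x:elem",
    //                         // pre-sorted from most to least important
    // (see mkFastAccessMap in NCDeployManager).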
+ addStopWordsStems: Set[String], + exclStopWordsStems: Set[String], + suspWordsStems: Set[String], + elms: Map[String/*Element ID*/, NCElement] +) extends NCModel { require(proxy != null) override def getId: String = proxy.getId override def getName: String = proxy.getName override def getVersion: String = proxy.getVersion - override def getDescription = proxy.getDescription - override def getMaxUnknownWords = proxy.getMaxUnknownWords - override def getMaxFreeWords = proxy.getMaxFreeWords - override def getMaxSuspiciousWords = proxy.getMaxSuspiciousWords - override def getMinWords = proxy.getMinWords - override def getMaxWords = proxy.getMaxWords - override def getMinTokens = proxy.getMinTokens - override def getMaxTokens = proxy.getMaxTokens - override def getMinNonStopwords = proxy.getMinNonStopwords - override def isNonEnglishAllowed = proxy.isNonEnglishAllowed - override def isNotLatinCharsetAllowed = proxy.isNotLatinCharsetAllowed - override def isSwearWordsAllowed = proxy.isSwearWordsAllowed - override def isNoNounsAllowed = proxy.isNoNounsAllowed - override def isPermutateSynonyms = proxy.isPermutateSynonyms - override def isDupSynonymsAllowed = proxy.isDupSynonymsAllowed - override def getMaxTotalSynonyms = proxy.getMaxTotalSynonyms - override def isNoUserTokensAllowed = proxy.isNoUserTokensAllowed - override def getJiggleFactor = proxy.getJiggleFactor - override def getMetadata = proxy.getMetadata - override def getAdditionalStopWords = proxy.getAdditionalStopWords - override def getExcludedStopWords = proxy.getExcludedStopWords - override def getSuspiciousWords = proxy.getSuspiciousWords - override def getMacros = proxy.getMacros - override def getParsers = proxy.getParsers - override def getElements = proxy.getElements - override def getEnabledBuiltInTokens = proxy.getEnabledBuiltInTokens - override def onParsedVariant(`var`: NCVariant) = proxy.onParsedVariant(`var`) - override def onContext(ctx: NCContext) = proxy.onContext(ctx) - override def onMatchedIntent(ctx: NCIntentMatch) = proxy.onMatchedIntent(ctx) - override def onResult(ctx: NCIntentMatch, res: NCResult) = proxy.onResult(ctx, res) - override def onRejection(ctx: NCIntentMatch, e: NCRejection) = proxy.onRejection(ctx, e) - override def onError(ctx: NCContext, e: Throwable) = proxy.onError(ctx, e) + override def getDescription: String = proxy.getDescription + override def getMaxUnknownWords: Int = proxy.getMaxUnknownWords + override def getMaxFreeWords: Int = proxy.getMaxFreeWords + override def getMaxSuspiciousWords: Int = proxy.getMaxSuspiciousWords + override def getMinWords: Int = proxy.getMinWords + override def getMaxWords: Int = proxy.getMaxWords + override def getMinTokens: Int = proxy.getMinTokens + override def getMaxTokens: Int = proxy.getMaxTokens + override def getMinNonStopwords: Int = proxy.getMinNonStopwords + override def isNonEnglishAllowed: Boolean = proxy.isNonEnglishAllowed + override def isNotLatinCharsetAllowed: Boolean = proxy.isNotLatinCharsetAllowed + override def isSwearWordsAllowed: Boolean = proxy.isSwearWordsAllowed + override def isNoNounsAllowed: Boolean = proxy.isNoNounsAllowed + override def isPermutateSynonyms: Boolean = proxy.isPermutateSynonyms + override def isDupSynonymsAllowed: Boolean = proxy.isDupSynonymsAllowed + override def getMaxTotalSynonyms: Int = proxy.getMaxTotalSynonyms + override def isNoUserTokensAllowed: Boolean = proxy.isNoUserTokensAllowed + override def getJiggleFactor: Int = proxy.getJiggleFactor + override def getMetadata: util.Map[String, AnyRef] = 
proxy.getMetadata + override def getAdditionalStopWords: util.Set[String] = proxy.getAdditionalStopWords + override def getExcludedStopWords: util.Set[String] = proxy.getExcludedStopWords + override def getSuspiciousWords: util.Set[String] = proxy.getSuspiciousWords + override def getMacros: util.Map[String, String] = proxy.getMacros + override def getParsers: util.List[NCCustomParser] = proxy.getParsers + override def getElements: util.Set[NCElement] = proxy.getElements + override def getEnabledBuiltInTokens: util.Set[String] = proxy.getEnabledBuiltInTokens + override def onParsedVariant(`var`: NCVariant): Boolean = proxy.onParsedVariant(`var`) + override def onContext(ctx: NCContext): NCResult = proxy.onContext(ctx) + override def onMatchedIntent(ctx: NCIntentMatch): Boolean = proxy.onMatchedIntent(ctx) + override def onResult(ctx: NCIntentMatch, res: NCResult): NCResult = proxy.onResult(ctx, res) + override def onRejection(ctx: NCIntentMatch, e: NCRejection): NCResult = proxy.onRejection(ctx, e) + override def onError(ctx: NCContext, e: Throwable): NCResult = proxy.onError(ctx, e) override def onInit(): Unit = proxy.onInit() override def onDiscard(): Unit = proxy.onDiscard() + + /** + * Makes variants for given sentences. + * + * @param srvReqId Server request ID. + * @param sens Sentences. + */ + def makeVariants(srvReqId: String, sens: Seq[NCNlpSentence]): Seq[NCVariant] = { + val seq = sens.map(_.toSeq.map(nlpTok ⇒ NCTokenImpl(this, srvReqId, nlpTok) → nlpTok)) + val toks = seq.map(_.map { case (tok, _) ⇒ tok }) + + case class Key(id: String, from: Int, to: Int) + + val keys2Toks = toks.flatten.map(t ⇒ Key(t.getId, t.getStartCharIndex, t.getEndCharIndex) → t).toMap + val partsKeys = mutable.HashSet.empty[Key] + + seq.flatten.foreach { case (tok, tokNlp) ⇒ + if (tokNlp.isUser) { + val userNotes = tokNlp.filter(_.isUser) + + require(userNotes.size == 1) + + val optList: Option[util.List[util.HashMap[String, Serializable]]] = userNotes.head.dataOpt("parts") + + optList match { + case Some(list) ⇒ + val keys = + list.asScala.map(m ⇒ + Key( + m.get("id").asInstanceOf[String], + m.get("startcharindex").asInstanceOf[Integer], + m.get("endcharindex").asInstanceOf[Integer] + ) + ) + val parts = keys.map(keys2Toks) + + parts.zip(list.asScala).foreach { case (part, map) ⇒ + map.get(TOK_META_ALIASES_KEY) match { + case null ⇒ // No-op. + case aliases ⇒ part.getMetadata.put(TOK_META_ALIASES_KEY, aliases.asInstanceOf[Object]) + } + } + + tok.setParts(parts) + partsKeys ++= keys + + case None ⇒ // No-op. + } + } + } + + // We can't collapse parts earlier, because we need them here (setParts method, few lines above.) 
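        // Editor's sketch of the filter below (hypothetical IDs): if a token "x:addr"
        // spanning chars [5..15] was registered above as a *part* of a composite token,
        // then Key("x:addr", 5, 15) is in partsKeys, and every variant that still carries
        // it as a top-level (non-"nlpcraft:nlp") token is dropped; only variants with
        // the composite token survive.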
+ toks.filter(sen ⇒ + !sen.exists(t ⇒ + t.getId != "nlpcraft:nlp" && + partsKeys.contains(Key(t.getId, t.getStartCharIndex, t.getEndCharIndex)) + ) + ).map(p ⇒ new NCVariantImpl(p.asJava)) + } } diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCTokenImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCTokenImpl.scala index 6970e8b..66ab4cb 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCTokenImpl.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCTokenImpl.scala @@ -23,7 +23,6 @@ import java.util.Collections import org.apache.nlpcraft.common._ import org.apache.nlpcraft.common.nlp.NCNlpSentenceToken import org.apache.nlpcraft.model._ -import org.apache.nlpcraft.probe.mgrs.NCModelDecorator import scala.collection.JavaConverters._ import scala.collection.{Seq, mutable} @@ -99,7 +98,7 @@ private[nlpcraft] class NCTokenImpl( } private[nlpcraft] object NCTokenImpl { - def apply(mdl: NCModelDecorator, srvReqId: String, tok: NCNlpSentenceToken): NCTokenImpl = { + def apply(mdl: NCModelWrapper, srvReqId: String, tok: NCNlpSentenceToken): NCTokenImpl = { // nlpcraft:nlp and some optional (after collapsing). require(tok.size <= 2, s"Unexpected token [size=${tok.size}, token=$tok]") @@ -142,7 +141,7 @@ private[nlpcraft] object NCTokenImpl { elm.getMetadata.asScala.foreach { case (k, v) ⇒ md.put(k, v.asInstanceOf[java.io.Serializable]) } new NCTokenImpl( - mdl.wrapper, + mdl, srvReqId = srvReqId, id = elm.getId, grps = elm.getGroups.asScala, @@ -165,7 +164,7 @@ private[nlpcraft] object NCTokenImpl { md.put("nlpcraft:nlp:freeword", !isStop && note.isNlp) new NCTokenImpl( - mdl.wrapper, + mdl, srvReqId = srvReqId, id = note.noteType, // Use NLP note type as synthetic element ID. grps = Seq(note.noteType), // Use NLP note type as synthetic element group. diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCModelDecorator.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCModelDecorator.scala deleted file mode 100644 index f1a5a6f..0000000 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCModelDecorator.scala +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nlpcraft.probe.mgrs - -import java.io.Serializable -import java.util - -import org.apache.nlpcraft.common.TOK_META_ALIASES_KEY -import org.apache.nlpcraft.common.nlp.NCNlpSentence -import org.apache.nlpcraft.model.impl.{NCModelWrapper, NCTokenImpl, NCVariantImpl} -import org.apache.nlpcraft.model.{NCElement, NCVariant} - -import scala.collection.JavaConverters._ -import scala.collection.{Seq, mutable} -import scala.language.implicitConversions - -/** - * - * @param wrapper Decorated model. 
- * @param syns Fast-access synonyms map for first phase. - * @param synsDsl Fast-access synonyms map for second phase. - * @param addStopWordsStems Stemmatized additional stopwords. - * @param exclStopWordsStems Stemmatized excluded stopwords. - * @param suspWordsStems Stemmatized suspicious stopwords. - * @param elms Map of model elements. - */ -case class NCModelDecorator( - wrapper: NCModelWrapper, - syns: Map[String/*Element ID*/, Map[Int/*Synonym length*/, Seq[NCSynonym]]], // Fast access map. - synsDsl: Map[String/*Element ID*/, Map[Int/*Synonym length*/, Seq[NCSynonym]]], // Fast access map. - addStopWordsStems: Set[String], - exclStopWordsStems: Set[String], - suspWordsStems: Set[String], - elms: Map[String/*Element ID*/, NCElement] -) extends java.io.Serializable { - /** - * Makes variants for given sentences. - * - * @param srvReqId Server request ID. - * @param sens Sentences. - */ - def makeVariants(srvReqId: String, sens: Seq[NCNlpSentence]): Seq[NCVariant] = { - val seq = sens.map(_.toSeq.map(nlpTok ⇒ NCTokenImpl(this, srvReqId, nlpTok) → nlpTok)) - val toks = seq.map(_.map { case (tok, _) ⇒ tok }) - - case class Key(id: String, from: Int, to: Int) - - val keys2Toks = toks.flatten.map(t ⇒ Key(t.getId, t.getStartCharIndex, t.getEndCharIndex) → t).toMap - val partsKeys = mutable.HashSet.empty[Key] - - seq.flatten.foreach { case (tok, tokNlp) ⇒ - if (tokNlp.isUser) { - val userNotes = tokNlp.filter(_.isUser) - - require(userNotes.size == 1) - - val optList: Option[util.List[util.HashMap[String, Serializable]]] = userNotes.head.dataOpt("parts") - - optList match { - case Some(list) ⇒ - val keys = - list.asScala.map(m ⇒ - Key( - m.get("id").asInstanceOf[String], - m.get("startcharindex").asInstanceOf[Integer], - m.get("endcharindex").asInstanceOf[Integer] - ) - ) - val parts = keys.map(keys2Toks) - - parts.zip(list.asScala).foreach { case (part, map) ⇒ - map.get(TOK_META_ALIASES_KEY) match { - case null ⇒ // No-op. - case aliases ⇒ part.getMetadata.put(TOK_META_ALIASES_KEY, aliases.asInstanceOf[Object]) - } - } - - tok.setParts(parts) - partsKeys ++= keys - - case None ⇒ // No-op. - } - } - } - - // We can't collapse parts earlier, because we need them here (setParts method, few lines above.) 
- toks.filter(sen ⇒ - !sen.exists(t ⇒ - t.getId != "nlpcraft:nlp" && - partsKeys.contains(Key(t.getId, t.getStartCharIndex, t.getEndCharIndex)) - ) - ).map(p ⇒ new NCVariantImpl(p.asJava)) - } - - override def toString: String = { - s"Probe model decorator [" + - s"id=${wrapper.getId}, " + - s"name=${wrapper.getName}, " + - s"version=${wrapper.getVersion}" + - s"]" - } -} diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala index dafaf5f..ab24173 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala @@ -35,7 +35,6 @@ import org.apache.nlpcraft.probe.mgrs.NCProbeMessage import org.apache.nlpcraft.probe.mgrs.cmd.NCCommandManager import org.apache.nlpcraft.probe.mgrs.model.NCModelManager -import scala.collection.JavaConverters._ import scala.collection.mutable /** @@ -228,8 +227,7 @@ object NCConnectionManager extends NCService { "PROBE_HOST_ADDR" → localHost.getHostAddress, "PROBE_HW_ADDR" → hwAddrs, "PROBE_MODELS" → - NCModelManager.getAllModels().map(m ⇒ { - val mdl = m.wrapper + NCModelManager.getAllModels().map(mdl ⇒ { // Model already validated. diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala index 3aca836..8c10c1d 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala @@ -18,27 +18,35 @@ package org.apache.nlpcraft.probe.mgrs.deploy import java.io._ -import java.util.jar.{JarInputStream ⇒ JIS} +import java.util.jar.{JarInputStream => JIS} +import java.util.regex.{Pattern, PatternSyntaxException} import io.opencensus.trace.Span import org.apache.nlpcraft.common._ import org.apache.nlpcraft.common.config.NCConfigurable +import org.apache.nlpcraft.common.makro.NCMacroParser +import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager +import org.apache.nlpcraft.common.util.NCUtils.{DSL_FIX, REGEX_FIX} import org.apache.nlpcraft.model._ import org.apache.nlpcraft.model.factories.basic.NCBasicModelFactory import org.apache.nlpcraft.model.impl.NCModelWrapper import org.apache.nlpcraft.model.intent.impl.{NCIntentScanner, NCIntentSolver} +import org.apache.nlpcraft.probe.mgrs.NCSynonymChunkKind.{DSL, REGEX, TEXT} +import org.apache.nlpcraft.probe.mgrs.{NCSynonym, NCSynonymChunk} +import org.apache.nlpcraft.probe.mgrs.model.NCModelSynonymDslCompiler import resource.managed import scala.collection.JavaConverters._ import scala.collection.convert.DecorateAsScala import scala.collection.{Seq, mutable} -import scala.collection.mutable.ArrayBuffer +import scala.collection.mutable.{ArrayBuffer, ListBuffer} import scala.util.control.Exception._ /** * Model deployment manager. */ object NCDeployManager extends NCService with DecorateAsScala { + private final val TOKENS_PROVIDERS_PREFIXES = Set("nlpcraft:", "google:", "stanford:", "opennlp:", "spacy:") private final val ID_REGEX = "^[_a-zA-Z]+[a-zA-Z0-9:-_]*$" @volatile private var models: ArrayBuffer[NCModelWrapper] = _ @@ -55,8 +63,18 @@ object NCDeployManager extends NCService with DecorateAsScala { } /** + * + * @param elementId Element ID. + * @param synonym Element synonym. 
+ */ + case class SynonymHolder( + elementId: String, + synonym: NCSynonym + ) + + /** * Gives a list of JAR files at given path. - * + * * @param path Path to scan. * @return */ @@ -68,7 +86,7 @@ object NCDeployManager extends NCService with DecorateAsScala { if (jars == null) Seq.empty else jars.toSeq } - + /** * * @param mdl @@ -90,26 +108,284 @@ object NCDeployManager extends NCService with DecorateAsScala { val mdlId = mdl.getId + val parser = new NCMacroParser + + // Initialize macro parser. + mdl.getMacros.asScala.foreach(t ⇒ parser.addMacro(t._1, t._2)) + + var solver: NCIntentSolver = null + if (intents.nonEmpty) { // Check the uniqueness of intent IDs. U.getDups(intents.keys.toSeq.map(_.id)) match { case ids if ids.nonEmpty ⇒ throw new NCE(s"Duplicate intent IDs found for '$mdlId' model: ${ids.mkString(",")}") case _ ⇒ () } - + logger.info(s"Intents found in the model: $mdlId") - val solver = new NCIntentSolver( + solver = new NCIntentSolver( intents.toList.map(x ⇒ (x._1, (z: NCIntentMatch) ⇒ x._2.apply(z))) ) - - new NCModelWrapper(mdl, solver) } - else { + else logger.warn(s"Model has no intents: $mdlId") - new NCModelWrapper(mdl, null) + checkModelConfig(mdl) + + for (elm ← mdl.getElements.asScala) + checkElement(mdl, elm) + + checkElementIdsDups(mdl) + checkCyclicDependencies(mdl) + + val addStopWords = checkAndStemmatize(mdl.getAdditionalStopWords, "Additional stopword") + val exclStopWords = checkAndStemmatize(mdl.getExcludedStopWords, "Excluded stopword") + val suspWords = checkAndStemmatize(mdl.getSuspiciousWords, "Suspicious word") + + checkStopwordsDups(addStopWords, exclStopWords) + + val syns = mutable.HashSet.empty[SynonymHolder] + + var cnt = 0 + val maxCnt = mdl.getMaxTotalSynonyms + + // Process and check elements. + for (elm ← mdl.getElements.asScala) { + val elmId = elm.getId + + def addSynonym( + isElementId: Boolean, + isValueName: Boolean, + value: String, + chunks: Seq[NCSynonymChunk]): Unit = { + def add(chunks: Seq[NCSynonymChunk], isDirect: Boolean): Unit = { + val holder = SynonymHolder( + elementId = elmId, + synonym = NCSynonym(isElementId, isValueName, isDirect, value, chunks) + ) + + if (syns.add(holder)) { + cnt += 1 + + if (cnt > maxCnt) + throw new NCE(s"Too many synonyms detected [" + + s"model=${mdl.getId}, " + + s"max=$maxCnt" + + s"]") + + if (value == null) + logger.trace(s"Synonym #${syns.size} added [" + + s"model=${mdl.getId}, " + + s"elementId=$elmId, " + + s"synonym=${chunks.mkString(" ")}" + + s"]") + else + logger.trace(s"Synonym #${syns.size} added [" + + s"model=${mdl.getId}, " + + s"elementId=$elmId, " + + s"synonym=${chunks.mkString(" ")}, " + + s"value=$value" + + s"]") + } + else + logger.trace( + s"Synonym already added (ignoring) [" + + s"model=${mdl.getId}, " + + s"elementId=$elmId, " + + s"synonym=${chunks.mkString(" ")}, " + + s"value=$value" + + s"]" + ) + } + + if (mdl.isPermutateSynonyms && !isElementId && chunks.forall(_.wordStem != null)) + simplePermute(chunks).map(p ⇒ p.map(_.wordStem) → p).toMap.values.foreach(p ⇒ add(p, p == chunks)) + else + add(chunks, isDirect = true) + } + + /** + * + * @param id + * @return + */ + def chunkIdSplit(id: String): Seq[NCSynonymChunk] = { + val chunks = chunkSplit(NCNlpCoreManager.tokenize(id).map(_.token).mkString(" ")) + + // IDs can only be simple strings. + if (chunks.exists(_.kind != TEXT)) + throw new NCE(s"Invalid ID: $id") + + chunks + } + + // Add element ID as a synonyms (dups ignored). 
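        // Editor's illustration (hypothetical ID): for an element "x:time" this makes the
        // element matchable by typing its ID verbatim, i.e. the ID itself is registered
        // as a TEXT synonym via addSynonym(isElementId = true, ...) below.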
+ val idChunks = Seq(chunkIdSplit(elmId)) + + idChunks.distinct.foreach(ch ⇒ addSynonym(isElementId = true, isValueName = false, null, ch)) + + // Add straight element synonyms (dups printed as warnings). + val synsChunks = for (syn ← elm.getSynonyms.asScala.flatMap(parser.expand)) yield chunkSplit(syn) + + if (U.containsDups(synsChunks.flatten)) + logger.trace(s"Element synonyms duplicate (ignoring) [" + + s"model=${mdl.getId}, " + + s"elementId=$elmId, " + + s"synonym=${synsChunks.diff(synsChunks.distinct).distinct.map(_.mkString(",")).mkString(";")}" + + s"]" + ) + + synsChunks.distinct.foreach(ch ⇒ addSynonym(isElementId = false, isValueName = false, null, ch)) + + val vals = + (if (elm.getValues != null) elm.getValues.asScala else Seq.empty) ++ + (if (elm.getValueLoader != null) elm.getValueLoader.load(elm).asScala else Seq.empty) + + // Add value synonyms. + val valNames = vals.map(_.getName) + + if (U.containsDups(valNames)) + logger.trace(s"Element values names duplicate (ignoring) [" + + s"model=${mdl.getId}, " + + s"elementId=$elmId, " + + s"names=${valNames.diff(valNames.distinct).distinct.mkString(",")}" + + s"]" + ) + + for (v ← vals.map(p ⇒ p.getName → p).toMap.values) { + val valId = v.getName + val valSyns = v.getSynonyms.asScala + + val idChunks = Seq(chunkIdSplit(valId)) + + // Add value name as a synonyms (dups ignored) + idChunks.distinct.foreach(ch ⇒ addSynonym(isElementId = false, isValueName = true, valId, ch)) + + // Add straight value synonyms (dups printed as warnings) + var skippedOneLikeName = false + + val chunks = + valSyns.flatMap(parser.expand).flatMap(valSyn ⇒ { + val valSyns = chunkSplit(valSyn) + + if (idChunks.contains(valSyns) && !skippedOneLikeName) { + skippedOneLikeName = true + + None + } + else + Some(valSyns) + }) + + if (U.containsDups(chunks.toList)) + logger.trace(s"Element synonyms duplicate (ignoring) [" + + s"model=${mdl.getId}, " + + s"elementId=$elmId, " + + s"value=$valId, " + + s"synonym=${chunks.diff(chunks.distinct).distinct.map(_.mkString(",")).mkString(";")}" + + s"]" + ) + + chunks.distinct.foreach(ch ⇒ addSynonym(isElementId = false, isValueName = false, valId, ch)) + } + } + + val valLdrs = mutable.HashSet.empty[NCValueLoader] + + for (elm ← mdl.getElements.asScala) { + val ldr = elm.getValueLoader + + if (ldr != null) + valLdrs += ldr } + + // Discard value loaders, if any. + for (ldr ← valLdrs) + ldr.onDiscard() + + var foundDups = false + + val allAliases = + syns + .flatMap(_.synonym) + .groupBy(_.origText) + .map(x ⇒ (x._1, x._2.map(_.alias).filter(_ != null))) + .values + .flatten + .toList + + // Check for DSl alias uniqueness. + if (U.containsDups(allAliases)) { + for (dupAlias ← allAliases.diff(allAliases.distinct)) + logger.warn(s"Duplicate DSL alias '$dupAlias' found for model: ${mdl.getId}") + + throw new NCE(s"Duplicate DSL aliases found for model '${mdl.getId}'- check log messages.") + } + + val idAliasDups = + mdl + .getElements.asScala + .map(_.getId) + .intersect(allAliases.toSet) + + // Check that DSL aliases don't intersect with element IDs. + if (idAliasDups.nonEmpty) { + for (dup ← idAliasDups) + logger.warn(s"Duplicate element ID and DSL alias '$dup' found for model: ${mdl.getId}") + + throw new NCE(s"Duplicate element ID and DSL aliases found for model '${mdl.getId}'- check log messages.") + } + + // Check for synonym dups across all elements. 
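        // Editor's illustration (hypothetical IDs): if elements "x:a" and "x:b" both
        // declare the synonym "big apple", their holders share the grouping key
        // ("big apple", isDirect = true) and are reported below; deployment then fails
        // unless the model sets isDupSynonymsAllowed.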
+ for ( + ((syn, isDirect), holders) ← + syns.groupBy(p ⇒ (p.synonym.mkString(" "), p.synonym.isDirect)) if holders.size > 1 && isDirect + ) { + logger.trace(s"Duplicate synonym detected (ignoring) [" + + s"model=${mdl.getId}, " + + s"element=${ + holders.map( + p ⇒ s"id=${p.elementId}${if (p.synonym.value == null) "" else s", value=${p.synonym.value}"}" + ).mkString("(", ",", ")") + }, " + + s"synonym=$syn" + + s"]" + ) + + foundDups = true + } + + if (foundDups) { + if (!mdl.isDupSynonymsAllowed) + throw new NCE(s"Duplicated synonyms are not allowed for model '${mdl.getId}' - check trace messages.") + + logger.warn(s"Found duplicate synonyms - check trace logging for model: ${mdl.getId}") + logger.warn(s"Duplicates are allowed by '${mdl.getId}' model but large number may degrade the performance.") + } + + mdl.getMetadata.put(MDL_META_ALL_ALIASES_KEY, allAliases.toSet) + mdl.getMetadata.put(MDL_META_ALL_ELM_IDS_KEY, + mdl.getElements.asScala.map(_.getId).toSet ++ + Set("nlpcraft:nlp") ++ + mdl.getEnabledBuiltInTokens.asScala + ) + mdl.getMetadata.put(MDL_META_ALL_GRP_IDS_KEY, + mdl.getElements.asScala.flatMap(_.getGroups.asScala).toSet ++ + Set("nlpcraft:nlp") ++ + mdl.getEnabledBuiltInTokens.asScala + ) + + NCModelWrapper( + proxy = mdl, + solver = solver, + syns = mkFastAccessMap(filter(syns, dsl = false)), + synsDsl = mkFastAccessMap(filter(syns, dsl = true)), + addStopWordsStems = addStopWords, + exclStopWordsStems = exclStopWords, + suspWordsStems = suspWords, + elms = mdl.getElements.asScala.map(elm ⇒ (elm.getId, elm)).toMap + ) } /** @@ -156,7 +432,28 @@ object NCDeployManager extends NCService with DecorateAsScala { } /** - * + * + * @param set + * @return + */ + private def mkFastAccessMap(set: Set[SynonymHolder]): Map[String /*Element ID*/ , Map[Int /*Synonym length*/ , Seq[NCSynonym]]] = + set + .groupBy(_.elementId) + .map { + case (elmId, holders) ⇒ ( + elmId, + holders + .map(_.synonym) + .groupBy(_.size) + .map { + // Sort synonyms from most important to least important. + case (k, v) ⇒ (k, v.toSeq.sorted.reverse) + } + ) + } + + /** + * * @param cls Model class. * @param src Model class source. */ @@ -168,19 +465,19 @@ object NCDeployManager extends NCService with DecorateAsScala { s"class=${cls.getName}, " + s"factory=${modelFactory.getClass.getName}, " + s"source=$src" + - "]", e) + "]", e) case Right(model) ⇒ model } - + /** - * + * * @param jarFile JAR file to extract from. */ @throws[NCE] private def extractModels(jarFile: File): Seq[NCModelWrapper] = { val clsLdr = Thread.currentThread().getContextClassLoader - + val classes = mutable.ArrayBuffer.empty[Class[_ <: NCModel]] managed(new JIS(new BufferedInputStream(new FileInputStream(jarFile)))) acquireAndGet { in ⇒ @@ -199,7 +496,7 @@ object NCDeployManager extends NCService with DecorateAsScala { catch { // Errors are possible for JARs like log4j etc, which have runtime dependencies. // We don't need these messages in log beside trace, so ignore... 
- case _: ClassNotFoundException ⇒ () + case _: ClassNotFoundException ⇒ () case _: NoClassDefFoundError ⇒ () } } @@ -207,14 +504,14 @@ object NCDeployManager extends NCService with DecorateAsScala { entry = in.getNextJarEntry } } - + classes.map(cls ⇒ wrap( makeModelFromSource(cls, jarFile.getPath) ) ) } - + @throws[NCE] override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ ⇒ modelFactory = new NCBasicModelFactory @@ -224,29 +521,29 @@ object NCDeployManager extends NCService with DecorateAsScala { Config.modelFactoryType match { case Some(mft) ⇒ modelFactory = makeModelFactory(mft) - + modelFactory.initialize(Config.modelFactoryProps.getOrElse(Map.empty[String, String]).asJava) - + case None ⇒ // No-op. } - + models ++= Config.models.map(makeModel) - + Config.jarsFolder match { case Some(jarsFolder) ⇒ val jarsFile = new File(jarsFolder) - + if (!jarsFile.exists()) throw new NCE(s"JAR folder path '$jarsFolder' does not exist.") if (!jarsFile.isDirectory) throw new NCE(s"JAR folder path '$jarsFolder' is not a directory.") - + val src = this.getClass.getProtectionDomain.getCodeSource val locJar = if (src == null) null else new File(src.getLocation.getPath) - + for (jar ← scanJars(jarsFile) if jar != locJar) models ++= extractModels(jar) - + case None ⇒ // No-op. } @@ -265,7 +562,7 @@ object NCDeployManager extends NCService with DecorateAsScala { if (mdlName != null && mdlName.isEmpty) throw new NCE(s"Model name cannot be empty string: $mdlId") if (mdlId != null && mdlId.isEmpty) - throw new NCE( s"Model ID cannot be empty string: $mdlId") + throw new NCE(s"Model ID cannot be empty string: $mdlId") if (mdlVer != null && mdlVer.length > 16) throw new NCE(s"Model version cannot be empty string: $mdlId") if (mdlName != null && mdlName.length > 64) @@ -274,7 +571,7 @@ object NCDeployManager extends NCService with DecorateAsScala { throw new NCE(s"Model ID is too long (32 max): $mdlId") if (mdlVer != null && mdlVer.length > 16) throw new NCE(s"Model version is too long (16 max): $mdlId") - + for (elm ← mdl.getElements.asScala) if (!elm.getId.matches(ID_REGEX)) throw new NCE(s"Model element ID '${elm.getId}' does not match '$ID_REGEX' regex in: $mdlId") @@ -282,18 +579,18 @@ object NCDeployManager extends NCService with DecorateAsScala { if (U.containsDups(models.map(_.getId).toList)) throw new NCE("Duplicate model IDs detected.") - + super.start() } @throws[NCE] override def stop(parent: Span = null): Unit = startScopedSpan("stop", parent) { _ ⇒ if (modelFactory != null) - modelFactory.terminate() + modelFactory.terminate() if (models != null) models.clear() - + super.stop() } @@ -302,4 +599,282 @@ object NCDeployManager extends NCService with DecorateAsScala { * @return */ def getModels: Seq[NCModelWrapper] = models + + /** + * Permutes and drops duplicated. + * For a given multi-word synonym we allow a single word move left or right only one position per permutation + * (i.e. only one word jiggles per permutation). + * E.g. for "A B C D" synonym we'll have only the following permutations: + * "A, B, C, D" + * "A, B, D, C" + * "A, C, B, D" + * "B, A, C, D" + * + * @param seq Initial sequence. + * @return Permutations. 
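     * For example (editor's illustration of the rules above):
     * {{{
     *     simplePermute(Seq("A", "B", "C"))
     *     // ⇒ Seq(Seq("A", "B", "C"), Seq("B", "A", "C"), Seq("A", "C", "B"))
     * }}}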
+ */ + private def simplePermute[T](seq: Seq[T]): Seq[Seq[T]] = + seq.length match { + case 0 ⇒ Seq.empty + case 1 ⇒ Seq(seq) + case n ⇒ + def permute(idx1: Int, idx2: Int): Seq[T] = + seq.zipWithIndex.map { case (t, idx) ⇒ + if (idx == idx1) + seq(idx2) + else if (idx == idx2) + seq(idx1) + else + t + } + + Seq(seq) ++ + seq.zipWithIndex.flatMap { case (_, idx) ⇒ + if (idx == 0) + Seq(permute(0, 1)) + else if (idx == n - 1) + Seq(permute(n - 2, n - 1)) + else + Seq(permute(idx - 1, idx), permute(idx, idx + 1)) + }.distinct + } + + /** + * + * @param jc + * @param name + * @return + */ + private def checkAndStemmatize(jc: java.util.Set[String], name: String): Set[String] = + for (word: String ← jc.asScala.toSet) yield + if (hasWhitespace(word)) + throw new NCE(s"$name cannot have whitespace: '$word'") + else + NCNlpCoreManager.stem(word) + + /** + * Checks cyclic child-parent dependencies. + * + * @param mdl Model. + */ + @throws[NCE] + private def checkCyclicDependencies(mdl: NCModel): Unit = + for (elm ← mdl.getElements.asScala) { + if (elm.getParentId != null) { + val seen = mutable.ArrayBuffer.empty[String] + + var parentId: String = null + var x = elm + + do { + parentId = x.getParentId + + if (parentId != null) { + if (seen.contains(parentId)) + throw new NCE(s"Cyclic parent dependency starting at model element '${x.getId}'.") + else { + seen += parentId + + x = mdl.getElements.asScala.find(_.getId == parentId) getOrElse { + throw new NCE(s"Unknown parent ID '$parentId' for model element '${x.getId}'.") + + null + } + } + } + } + while (parentId != null) + } + } + + /** + * + * @param mdl Model. + */ + @throws[NCE] + private def checkElementIdsDups(mdl: NCModel): Unit = { + val ids = mutable.HashSet.empty[String] + + for (id ← mdl.getElements.asScala.map(_.getId)) + if (ids.contains(id)) + throw new NCE(s"Duplicate model element ID '$id'.") + else + ids += id + } + + /** + * Verifies model element in isolation. + * + * @param mdl Model. + * @param elm Element to verify. + */ + @throws[NCE] + private def checkElement(mdl: NCModel, elm: NCElement): Unit = + if (elm.getId == null) + throw new NCE(s"Model element ID is not provided.'") + else if (elm.getId.length == 0) + throw new NCE(s"Model element ID cannot be empty.'") + else { + val elmId = elm.getId + + if (elmId.toLowerCase.startsWith("nlpcraft:")) + throw new NCE(s"Model element '$elmId' type cannot start with 'nlpcraft:'.") + + if (hasWhitespace(elmId)) + throw new NCE(s"Model element ID '$elmId' cannot have whitespaces.") + } + + /** + * + * @param mdl Model. + */ + private def checkModelConfig(mdl: NCModel): Unit = { + def checkInt(v: Int, name: String, min: Int = 0, max: Int = Integer.MAX_VALUE): Unit = + if (v < min) + throw new NCE(s"Invalid model configuration value '$name' [value=$v, min=$min]") + else if (v > max) + throw new NCE(s"Invalid model configuration value '$name' [value=$v, max=$min]") + + checkInt(mdl.getMaxUnknownWords, "maxUnknownWords") + checkInt(mdl.getMaxFreeWords, "maxFreeWords") + checkInt(mdl.getMaxSuspiciousWords, "maxSuspiciousWords") + checkInt(mdl.getMinWords, "minWords", min = 1) + checkInt(mdl.getMinNonStopwords, "minNonStopwords") + checkInt(mdl.getMinTokens, "minTokens") + checkInt(mdl.getMaxTokens, "maxTokens", max = 100) + checkInt(mdl.getMaxWords, "maxWords", min = 1, max = 100) + checkInt(mdl.getJiggleFactor, "jiggleFactor", max = 4) + + val unsToks = + mdl.getEnabledBuiltInTokens.asScala.filter(t ⇒ + // 'stanford', 'google', 'opennlp', 'spacy' - any names, not validated. 
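                // Editor's illustration (hypothetical token IDs): "google:city" passes
                // unchanged since non-'nlpcraft:' provider names are not validated, while
                // "nlpcraft:foobar" is flagged unless it is listed in
                // NCModelView.DFLT_ENABLED_BUILTIN_TOKENS.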
+ t == null || + !TOKENS_PROVIDERS_PREFIXES.exists(typ ⇒ t.startsWith(typ)) || + // 'nlpcraft' names validated. + (t.startsWith("nlpcraft:") && !NCModelView.DFLT_ENABLED_BUILTIN_TOKENS.contains(t)) + ) + + if (unsToks.nonEmpty) + throw new NCE(s"Invalid model 'enabledBuiltInTokens' token IDs: ${unsToks.mkString(", ")}") + } + + /** + * Checks whether or not given string has any whitespaces. + * + * @param s String to check. + * @return + */ + private def hasWhitespace(s: String): Boolean = s.exists(_.isWhitespace) + + private def filter(set: mutable.HashSet[SynonymHolder], dsl: Boolean): Set[SynonymHolder] = + set.toSet.filter(s ⇒ { + val b = s.synonym.exists(_.kind == DSL) + + if (dsl) b else !b + }) + + /** + * + * @param chunk Synonym chunk. + * @return + */ + @throws[NCE] + private def mkChunk(chunk: String): NCSynonymChunk = { + def stripSuffix(fix: String, s: String): String = s.slice(fix.length, s.length - fix.length) + + // Regex synonym. + if (startsAndEnds(REGEX_FIX, chunk)) { + val ptrn = stripSuffix(REGEX_FIX, chunk) + + if (ptrn.length > 0) + try + NCSynonymChunk(kind = REGEX, origText = chunk, regex = Pattern.compile(ptrn)) + catch { + case e: PatternSyntaxException ⇒ throw new NCE(s"Invalid regex syntax in: $chunk", e) + } + else + throw new NCE(s"Empty regex synonym detected: $chunk") + } + // DSL-based synonym. + else if (startsAndEnds(DSL_FIX, chunk)) { + val dsl = stripSuffix(DSL_FIX, chunk) + val compUnit = NCModelSynonymDslCompiler.parse(dsl) + + val x = NCSynonymChunk(alias = compUnit.alias, kind = DSL, origText = chunk, dslPred = compUnit.predicate) + + x + } + // Regular word. + else + NCSynonymChunk(kind = TEXT, origText = chunk, wordStem = NCNlpCoreManager.stem(chunk)) + } + + /** + * + * @param adds Additional stopword stems. + * @param excls Excluded stopword stems. + */ + @throws[NCE] + private def checkStopwordsDups(adds: Set[String], excls: Set[String]): Unit = { + val cross = adds.intersect(excls) + + if (cross.nonEmpty) + throw new NCE(s"Duplicate stems in additional and excluded stopwords: '${cross.mkString(",")}'") + } + + /** + * + * @param fix Prefix and suffix. + * @param s String to search prefix and suffix in. + * @return + */ + private def startsAndEnds(fix: String, s: String): Boolean = + s.startsWith(fix) && s.endsWith(fix) + + /** + * + * @param s + * @return + */ + @throws[NCE] + private def chunkSplit(s: String): Seq[NCSynonymChunk] = { + val x = s.trim() + + val chunks = ListBuffer.empty[String] + + var start = 0 + var curr = 0 + val len = x.length - (2 + 2) // 2 is a prefix/suffix length. Hack... 
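        // Editor's sketch of the scan below, assuming REGEX_FIX == "//" and DSL_FIX == "^^"
        // (two 2-char markers, hence the (2 + 2) above). A synonym such as (hypothetical):
        //     "flight to //[A-Z]{3}//"
        // splits into TEXT("flight"), TEXT("to") and one REGEX chunk for //[A-Z]{3}//.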
+ + def splitUp(s: String): Seq[String] = s.split(" ").map(_.trim).filter(_.nonEmpty).toSeq + + def processChunk(fix: String): Unit = { + chunks ++= splitUp(x.substring(start, curr)) + + x.indexOf(fix, curr + fix.length) match { + case -1 ⇒ throw new NCE(s"Invalid synonym definition in: $x") + case n ⇒ + chunks += x.substring(curr, n + fix.length) + start = n + fix.length + curr = start + } + } + + def isFix(fix: String): Boolean = + x.charAt(curr) == fix.charAt(0) && + x.charAt(curr + 1) == fix.charAt(1) + + while (curr < len) { + if (isFix(REGEX_FIX)) + processChunk(REGEX_FIX) + else if (isFix(DSL_FIX)) + processChunk(DSL_FIX) + else + curr += 1 + } + + chunks ++= splitUp(x.substring(start)) + + chunks.map(mkChunk) + } } diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/inspections/inspectors/NCProbeInspection.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/inspections/inspectors/NCProbeInspection.scala index ca4d0c4..52b0767 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/inspections/inspectors/NCProbeInspection.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/inspections/inspectors/NCProbeInspection.scala @@ -50,7 +50,7 @@ trait NCProbeInspection extends NCInspectionService { val suggs = mutable.Buffer.empty[String] NCModelManager.getModel(mdlId) match { - case Some(x) ⇒ body(x.wrapper, args, suggs, warns, errs) + case Some(x) ⇒ body(x, args, suggs, warns, errs) case None ⇒ errs += s"Model not found: $mdlId" } diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala index 646bf44..40760e1 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala @@ -18,125 +18,60 @@ package org.apache.nlpcraft.probe.mgrs.model import java.util -import java.util.regex.{Pattern, PatternSyntaxException} import io.opencensus.trace.Span import org.apache.nlpcraft.common._ import org.apache.nlpcraft.common.ascii.NCAsciiTable -import org.apache.nlpcraft.common.makro.NCMacroParser -import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager -import org.apache.nlpcraft.common.util.NCUtils._ import org.apache.nlpcraft.model._ import org.apache.nlpcraft.model.impl.NCModelWrapper import org.apache.nlpcraft.model.intent.impl.NCIntentScanner -import org.apache.nlpcraft.probe.mgrs.NCSynonymChunkKind._ import org.apache.nlpcraft.probe.mgrs.deploy._ -import org.apache.nlpcraft.probe.mgrs.inspections.NCInspectionManager -import org.apache.nlpcraft.probe.mgrs.{NCModelDecorator, NCSynonym, NCSynonymChunk} import scala.collection.JavaConverters._ import scala.collection.convert.DecorateAsScala -import scala.collection.convert.ImplicitConversions._ -import scala.collection.mutable -import scala.collection.mutable.ListBuffer -import scala.concurrent.ExecutionContext.Implicits.global -import scala.util.{Failure, Success} import scala.util.control.Exception._ /** * Model manager. */ object NCModelManager extends NCService with DecorateAsScala { - private final val TOKENS_PROVIDERS_PREFIXES = Set("nlpcraft:", "google:", "stanford:", "opennlp:", "spacy:") - // Deployed models keyed by their IDs. - @volatile private var models: mutable.Map[String, NCModelDecorator] = _ + @volatile private var models: Map[String, NCModelWrapper] = _ // Access mutex. 
private final val mux = new Object() - private final val DFLT_INSPECTIONS = Seq("macros", "intents", "synonyms") - - /** - * - * @param elementId Element ID. - * @param synonym Element synonym. - */ - case class SynonymHolder( - elementId: String, - synonym: NCSynonym - ) - - /** - * @param mdl Model. - */ - private def addNewModel(mdl: NCModelWrapper): Unit = { - require(Thread.holdsLock(mux)) - - checkModelConfig(mdl) - - val parser = new NCMacroParser - - // Initialize macro parser. - mdl.getMacros.asScala.foreach(t ⇒ parser.addMacro(t._1, t._2)) - - models += mdl.getId → verifyAndDecorate(mdl, parser) - - // Init callback on the model. - mdl.onInit() - } @throws[NCE] override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { span ⇒ - models = mutable.HashMap.empty[String, NCModelDecorator] + val tbl = NCAsciiTable("Model ID", "Name", "Ver.", "Elements", "Synonyms") mux.synchronized { - NCDeployManager.getModels.foreach(addNewModel) - - if (models.isEmpty) - throw new NCException("No models to deploy. Probe requires at least one data model to start.") + models = NCDeployManager.getModels.map(mdl ⇒ { + mdl.onInit() - val tbl = NCAsciiTable("Model ID", "Name", "Ver.", "Elements", "Synonyms") + mdl.proxy.getId → mdl + }).toMap models.values.foreach(mdl ⇒ { val synCnt = mdl.syns.values.flatMap(_.values).flatten.size tbl += ( - mdl.wrapper.getId, - mdl.wrapper.getName, - mdl.wrapper.getVersion, + mdl.getId, + mdl.getName, + mdl.getVersion, mdl.elms.keySet.size, synCnt ) - }) + } - tbl.info(logger, Some(s"Models deployed: ${models.size}\n")) - - for (mdl ← models.values; insId ← DFLT_INSPECTIONS) { - val mdlId = mdl.wrapper.getId - - NCInspectionManager.inspect(mdlId, insId, null, parent).onComplete{ - case Success(res) ⇒ - res.errors().asScala.foreach( - p ⇒ logger.error(s"Validation error [model=$mdlId, inspection=$insId, text=$p") - ) - res.warnings().asScala.foreach( - p ⇒ logger.warn(s"Validation warning [model=$mdlId, inspection=$insId, text=$p") - ) - res.suggestions().asScala.foreach( - p ⇒ logger.info(s"Validation suggestion [model=$mdlId, inspection=$insId, text=$p") - ) + tbl.info(logger, Some(s"Models deployed: ${models.size}\n")) - case Failure(e) ⇒ logger.error(s"Error processing inspections: $mdlId", e) - } - } - - addTags( - span, - "deployedModels" → models.values.map(_.wrapper.getId).mkString(",") - ) - } + addTags( + span, + "deployedModels" → models.values.map(_.getId).mkString(",") + ) super.start() } @@ -162,579 +97,18 @@ object NCModelManager extends NCService with DecorateAsScala { override def stop(parent: Span = null): Unit = startScopedSpan("stop", parent) { _ ⇒ mux.synchronized { if (models != null) - models.values.foreach(m ⇒ discardModel(m.wrapper)) + models.values.foreach(m ⇒ discardModel(m)) } super.stop() } - /** - * - * @param fix Prefix and suffix. - * @param s String to search prefix and suffix in. - * @return - */ - private def startsAndEnds(fix: String, s: String): Boolean = - s.startsWith(fix) && s.endsWith(fix) - - /** - * - * @param s - * @return - */ - @throws[NCE] - private def chunkSplit(s: String): Seq[NCSynonymChunk] = { - val x = s.trim() - - val chunks = ListBuffer.empty[String] - - var start = 0 - var curr = 0 - val len = x.length - (2 + 2) // 2 is a prefix/suffix length. Hack... 
- - def splitUp(s: String): Seq[String] = s.split(" ").map(_.trim).filter(_.nonEmpty).toSeq - - def processChunk(fix: String): Unit = { - chunks ++= splitUp(x.substring(start, curr)) - - x.indexOf(fix, curr + fix.length) match { - case -1 ⇒ throw new NCE(s"Invalid synonym definition in: $x") - case n ⇒ - chunks += x.substring(curr, n + fix.length) - start = n + fix.length - curr = start - } - } - - def isFix(fix: String): Boolean = - x.charAt(curr) == fix.charAt(0) && - x.charAt(curr + 1) == fix.charAt(1) - - while (curr < len) { - if (isFix(REGEX_FIX)) - processChunk(REGEX_FIX) - else if (isFix(DSL_FIX)) - processChunk(DSL_FIX) - else - curr += 1 - } - - chunks ++= splitUp(x.substring(start)) - - chunks.map(mkChunk) - } - - /** - * - * @param chunk Synonym chunk. - * @return - */ - @throws[NCE] - private def mkChunk(chunk: String): NCSynonymChunk = { - def stripSuffix(fix: String, s: String): String = s.slice(fix.length, s.length - fix.length) - - // Regex synonym. - if (startsAndEnds(REGEX_FIX, chunk)) { - val ptrn = stripSuffix(REGEX_FIX, chunk) - - if (ptrn.length > 0) - try - NCSynonymChunk(kind = REGEX, origText = chunk, regex = Pattern.compile(ptrn)) - catch { - case e: PatternSyntaxException ⇒ throw new NCE(s"Invalid regex syntax in: $chunk", e) - } - else - throw new NCE(s"Empty regex synonym detected: $chunk") - } - // DSL-based synonym. - else if (startsAndEnds(DSL_FIX, chunk)) { - val dsl = stripSuffix(DSL_FIX, chunk) - val compUnit = NCModelSynonymDslCompiler.parse(dsl) - - val x = NCSynonymChunk(alias = compUnit.alias, kind = DSL, origText = chunk, dslPred = compUnit.predicate) - - x - } - // Regular word. - else - NCSynonymChunk(kind = TEXT, origText = chunk, wordStem = NCNlpCoreManager.stem(chunk)) - } - - /** - * - * @param adds Additional stopword stems. - * @param excls Excluded stopword stems. - */ - @throws[NCE] - private def checkStopwordsDups(adds: Set[String], excls: Set[String]): Unit = { - val cross = adds.intersect(excls) - - if (cross.nonEmpty) - throw new NCE(s"Duplicate stems in additional and excluded stopwords: '${cross.mkString(",")}'") - } - - /** - * Verifies given model and makes a decorator optimized for model enricher. - * - * @param mdl Model to verify and decorate. - * @param parser Initialized macro parser. - * @return Model decorator. - */ - @throws[NCE] - private def verifyAndDecorate(mdl: NCModelWrapper, parser: NCMacroParser): NCModelDecorator = { - for (elm ← mdl.getElements) - checkElement(mdl, elm) - - checkElementIdsDups(mdl) - checkCyclicDependencies(mdl) - - val addStopWords = checkAndStemmatize(mdl.getAdditionalStopWords, "Additional stopword") - val exclStopWords = checkAndStemmatize(mdl.getExcludedStopWords, "Excluded stopword") - val suspWords = checkAndStemmatize(mdl.getSuspiciousWords, "Suspicious word") - - checkStopwordsDups(addStopWords, exclStopWords) - - val syns = mutable.HashSet.empty[SynonymHolder] - - var cnt = 0 - val maxCnt = mdl.getMaxTotalSynonyms - - // Process and check elements. 
- for (elm ← mdl.getElements) { - val elmId = elm.getId - - def addSynonym( - isElementId: Boolean, - isValueName: Boolean, - value: String, - chunks: Seq[NCSynonymChunk]): Unit = { - def add(chunks: Seq[NCSynonymChunk], isDirect: Boolean): Unit = { - val holder = SynonymHolder( - elementId = elmId, - synonym = NCSynonym(isElementId, isValueName, isDirect, value, chunks) - ) - - if (syns.add(holder)) { - cnt += 1 - - if (cnt > maxCnt) - throw new NCE(s"Too many synonyms detected [" + - s"model=${mdl.getId}, " + - s"max=$maxCnt" + - s"]") - - if (value == null) - logger.trace(s"Synonym #${syns.size} added [" + - s"model=${mdl.getId}, " + - s"elementId=$elmId, " + - s"synonym=${chunks.mkString(" ")}" + - s"]") - else - logger.trace(s"Synonym #${syns.size} added [" + - s"model=${mdl.getId}, " + - s"elementId=$elmId, " + - s"synonym=${chunks.mkString(" ")}, " + - s"value=$value" + - s"]") - } - else - logger.trace( - s"Synonym already added (ignoring) [" + - s"model=${mdl.getId}, " + - s"elementId=$elmId, " + - s"synonym=${chunks.mkString(" ")}, " + - s"value=$value" + - s"]" - ) - } - - if (mdl.isPermutateSynonyms && !isElementId && chunks.forall(_.wordStem != null)) - simplePermute(chunks).map(p ⇒ p.map(_.wordStem) → p).toMap.values.foreach(p ⇒ add(p, p == chunks)) - else - add(chunks, isDirect = true) - } - - /** - * - * @param id - * @return - */ - def chunkIdSplit(id: String): Seq[NCSynonymChunk] = { - val chunks = chunkSplit(NCNlpCoreManager.tokenize(id).map(_.token).mkString(" ")) - - // IDs can only be simple strings. - if (chunks.exists(_.kind != TEXT)) - throw new NCE(s"Invalid ID: $id") - - chunks - } - - // Add element ID as a synonyms (dups ignored). - val idChunks = Seq(chunkIdSplit(elmId)) - - idChunks.distinct.foreach(ch ⇒ addSynonym(isElementId = true, isValueName = false, null, ch)) - - // Add straight element synonyms (dups printed as warnings). - val synsChunks = for (syn ← elm.getSynonyms.flatMap(parser.expand)) yield chunkSplit(syn) - - if (U.containsDups(synsChunks.flatten)) - logger.trace(s"Element synonyms duplicate (ignoring) [" + - s"model=${mdl.getId}, " + - s"elementId=$elmId, " + - s"synonym=${synsChunks.diff(synsChunks.distinct).distinct.map(_.mkString(",")).mkString(";")}" + - s"]" - ) - - synsChunks.distinct.foreach(ch ⇒ addSynonym(isElementId = false, isValueName = false, null, ch)) - - val vals = - (if (elm.getValues != null) elm.getValues.asScala else Seq.empty) ++ - (if (elm.getValueLoader != null) elm.getValueLoader.load(elm).asScala else Seq.empty) - - // Add value synonyms. 
- val valNames = vals.map(_.getName) - - if (U.containsDups(valNames)) - logger.trace(s"Element values names duplicate (ignoring) [" + - s"model=${mdl.getId}, " + - s"elementId=$elmId, " + - s"names=${valNames.diff(valNames.distinct).distinct.mkString(",")}" + - s"]" - ) - - for (v ← vals.map(p ⇒ p.getName → p).toMap.values) { - val valId = v.getName - val valSyns = v.getSynonyms.asScala - - val idChunks = Seq(chunkIdSplit(valId)) - - // Add value name as a synonyms (dups ignored) - idChunks.distinct.foreach(ch ⇒ addSynonym(isElementId = false, isValueName = true, valId, ch)) - - // Add straight value synonyms (dups printed as warnings) - var skippedOneLikeName = false - - val chunks = - valSyns.flatMap(parser.expand).flatMap(valSyn ⇒ { - val valSyns = chunkSplit(valSyn) - - if (idChunks.contains(valSyns) && !skippedOneLikeName) { - skippedOneLikeName = true - - None - } - else - Some(valSyns) - }) - - if (U.containsDups(chunks.toList)) - logger.trace(s"Element synonyms duplicate (ignoring) [" + - s"model=${mdl.getId}, " + - s"elementId=$elmId, " + - s"value=$valId, " + - s"synonym=${chunks.diff(chunks.distinct).distinct.map(_.mkString(",")).mkString(";")}" + - s"]" - ) - - chunks.distinct.foreach(ch ⇒ addSynonym(isElementId = false, isValueName = false, valId, ch)) - } - } - - val valLdrs = mutable.HashSet.empty[NCValueLoader] - - for (elm ← mdl.getElements) { - val ldr = elm.getValueLoader - - if (ldr != null) - valLdrs += ldr - } - - // Discard value loaders, if any. - for (ldr ← valLdrs) - ldr.onDiscard() - - var foundDups = false - - val allAliases = - syns - .flatMap(_.synonym) - .groupBy(_.origText) - .map(x ⇒ (x._1, x._2.map(_.alias).filter(_ != null))) - .values - .flatten - .toList - - // Check for DSl alias uniqueness. - if (U.containsDups(allAliases)) { - for (dupAlias ← allAliases.diff(allAliases.distinct)) - logger.warn(s"Duplicate DSL alias '$dupAlias' found for model: ${mdl.getId}") - - throw new NCE(s"Duplicate DSL aliases found for model '${mdl.getId}'- check log messages.") - } - - val idAliasDups = - mdl - .getElements - .map(_.getId) - .intersect(allAliases.toSet) - - // Check that DSL aliases don't intersect with element IDs. - if (idAliasDups.nonEmpty) { - for (dup ← idAliasDups) - logger.warn(s"Duplicate element ID and DSL alias '$dup' found for model: ${mdl.getId}") - - throw new NCE(s"Duplicate element ID and DSL aliases found for model '${mdl.getId}'- check log messages.") - } - - // Check for synonym dups across all elements. 
- for ( - ((syn, isDirect), holders) ← - syns.groupBy(p ⇒ (p.synonym.mkString(" "), p.synonym.isDirect)) if holders.size > 1 && isDirect - ) { - logger.trace(s"Duplicate synonym detected (ignoring) [" + - s"model=${mdl.getId}, " + - s"element=${holders.map( - p ⇒ s"id=${p.elementId}${if (p.synonym.value == null) "" else s", value=${p.synonym.value}"}" - ).mkString("(", ",", ")")}, " + - s"synonym=$syn" + - s"]" - ) - - foundDups = true - } - - if (foundDups) { - if (!mdl.isDupSynonymsAllowed) - throw new NCE(s"Duplicated synonyms are not allowed for model '${mdl.getId}' - check trace messages.") - - logger.warn(s"Found duplicate synonyms - check trace logging for model: ${mdl.getId}") - logger.warn(s"Duplicates are allowed by '${mdl.getId}' model but large number may degrade the performance.") - } - - mdl.getMetadata.put(MDL_META_ALL_ALIASES_KEY, allAliases.toSet) - mdl.getMetadata.put(MDL_META_ALL_ELM_IDS_KEY, - mdl.getElements.map(_.getId).toSet ++ - Set("nlpcraft:nlp") ++ - mdl.getEnabledBuiltInTokens - ) - mdl.getMetadata.put(MDL_META_ALL_GRP_IDS_KEY, - mdl.getElements.flatMap(_.getGroups.asScala).toSet ++ - Set("nlpcraft:nlp") ++ - mdl.getEnabledBuiltInTokens - ) - - /** - * - * @param set - * @return - */ - def mkFastAccessMap(set: Set[SynonymHolder]): Map[String/*Element ID*/, Map[Int/*Synonym length*/, Seq[NCSynonym]]] = - set - .groupBy(_.elementId) - .map { - case (elmId, holders) ⇒ ( - elmId, - holders - .map(_.synonym) - .groupBy(_.size) - .map { - // Sort synonyms from most important to least important. - case (k, v) ⇒ (k, v.toSeq.sorted.reverse) - } - ) - } - - def filter(set: mutable.HashSet[SynonymHolder], dsl: Boolean): Set[SynonymHolder] = - set.toSet.filter(s ⇒ { - val b = s.synonym.exists(_.kind == DSL) - - if (dsl) b else !b - }) - - NCModelDecorator( - wrapper = mdl, - syns = mkFastAccessMap(filter(syns, dsl = false)), - synsDsl = mkFastAccessMap(filter(syns, dsl = true)), - addStopWordsStems = addStopWords, - exclStopWordsStems = exclStopWords, - suspWordsStems = suspWords, - elms = mdl.getElements.map(elm ⇒ (elm.getId, elm)).toMap - ) - } - - /** - * Permutes and drops duplicated. - * For a given multi-word synonym we allow a single word move left or right only one position per permutation - * (i.e. only one word jiggles per permutation). - * E.g. for "A B C D" synonym we'll have only the following permutations: - * "A, B, C, D" - * "A, B, D, C" - * "A, C, B, D" - * "B, A, C, D" - * - * @param seq Initial sequence. - * @return Permutations. - */ - private def simplePermute[T](seq: Seq[T]): Seq[Seq[T]] = - seq.length match { - case 0 ⇒ Seq.empty - case 1 ⇒ Seq(seq) - case n ⇒ - def permute(idx1: Int, idx2: Int): Seq[T] = - seq.zipWithIndex.map { case (t, idx) ⇒ - if (idx == idx1) - seq(idx2) - else if (idx == idx2) - seq(idx1) - else - t - } - - Seq(seq)++ - seq.zipWithIndex.flatMap { case (_, idx) ⇒ - if (idx == 0) - Seq(permute(0, 1)) - else if (idx == n - 1) - Seq(permute(n - 2, n - 1)) - else - Seq(permute(idx - 1, idx), permute(idx, idx + 1)) - }.distinct - } - - /** - * - * @param jc - * @param name - * @return - */ - private def checkAndStemmatize(jc: java.util.Set[String], name: String): Set[String] = - for (word: String ← jc.asScala.toSet) yield - if (hasWhitespace(word)) - throw new NCE(s"$name cannot have whitespace: '$word'") - else - NCNlpCoreManager.stem(word) - - /** - * Checks cyclic child-parent dependencies. - * - * @param mdl Model. 
- */ - @throws[NCE] - private def checkCyclicDependencies(mdl: NCModel): Unit = - for (elm ← mdl.getElements) { - if (elm.getParentId != null) { - val seen = mutable.ArrayBuffer.empty[String] - - var parentId: String = null - var x = elm - - do { - parentId = x.getParentId - - if (parentId != null) { - if (seen.contains(parentId)) - throw new NCE(s"Cyclic parent dependency starting at model element '${x.getId}'.") - else { - seen += parentId - - x = mdl.getElements.find(_.getId == parentId) getOrElse { - throw new NCE(s"Unknown parent ID '$parentId' for model element '${x.getId}'.") - - null - } - } - } - } - while (parentId != null) - } - } - - /** - * - * @param mdl Model. - */ - @throws[NCE] - private def checkElementIdsDups(mdl: NCModel): Unit = { - val ids = mutable.HashSet.empty[String] - - for (id ← mdl.getElements.toList.map(_.getId)) - if (ids.contains(id)) - throw new NCE(s"Duplicate model element ID '$id'.") - else - ids += id - } - - /** - * Verifies model element in isolation. - * - * @param mdl Model. - * @param elm Element to verify. - */ - @throws[NCE] - private def checkElement(mdl: NCModel, elm: NCElement): Unit = { - if (elm.getId == null) - throw new NCE(s"Model element ID is not provided.'") - else if (elm.getId.length == 0) - throw new NCE(s"Model element ID cannot be empty.'") - else { - val elmId = elm.getId - - if (elmId.toLowerCase.startsWith("nlpcraft:")) - throw new NCE(s"Model element '$elmId' type cannot start with 'nlpcraft:'.") - - if (hasWhitespace(elmId)) - throw new NCE(s"Model element ID '$elmId' cannot have whitespaces.") - } - } - - /** - * Checks whether or not given string has any whitespaces. - * - * @param s String to check. - * @return - */ - private def hasWhitespace(s: String): Boolean = - s.exists(_.isWhitespace) - - /** - * - * @param mdl Model. - */ - private def checkModelConfig(mdl: NCModel): Unit = { - def checkInt(v: Int, name: String, min: Int = 0, max: Int = Integer.MAX_VALUE): Unit = - if (v < min) - throw new NCE(s"Invalid model configuration value '$name' [value=$v, min=$min]") - else if (v > max) - throw new NCE(s"Invalid model configuration value '$name' [value=$v, max=$min]") - - checkInt(mdl.getMaxUnknownWords, "maxUnknownWords") - checkInt(mdl.getMaxFreeWords, "maxFreeWords") - checkInt(mdl.getMaxSuspiciousWords, "maxSuspiciousWords") - checkInt(mdl.getMinWords, "minWords", min = 1) - checkInt(mdl.getMinNonStopwords, "minNonStopwords") - checkInt(mdl.getMinTokens, "minTokens") - checkInt(mdl.getMaxTokens, "maxTokens", max = 100) - checkInt(mdl.getMaxWords, "maxWords", min = 1, max = 100) - checkInt(mdl.getJiggleFactor, "jiggleFactor", max = 4) - - val unsToks = - mdl.getEnabledBuiltInTokens.filter(t ⇒ - // 'stanford', 'google', 'opennlp', 'spacy' - any names, not validated. - t == null || - !TOKENS_PROVIDERS_PREFIXES.exists(typ ⇒ t.startsWith(typ)) || - // 'nlpcraft' names validated. - (t.startsWith("nlpcraft:") && !NCModelView.DFLT_ENABLED_BUILTIN_TOKENS.contains(t)) - ) - - if (unsToks.nonEmpty) - throw new NCE(s"Invalid model 'enabledBuiltInTokens' token IDs: ${unsToks.mkString(", ")}") - } /** * * @return */ - def getAllModels(parent: Span = null): List[NCModelDecorator] = + def getAllModels(parent: Span = null): List[NCModelWrapper] = startScopedSpan("getAllModels", parent) { _ ⇒ mux.synchronized { models.values.toList @@ -746,7 +120,7 @@ object NCModelManager extends NCService with DecorateAsScala { * @param mdlId Model ID. 
    /**
      *
      * @return
      */
-    def getAllModels(parent: Span = null): List[NCModelDecorator] =
+    def getAllModels(parent: Span = null): List[NCModelWrapper] =
        startScopedSpan("getAllModels", parent) { _ ⇒
            mux.synchronized {
                models.values.toList
@@ -746,7 +120,7 @@ object NCModelManager extends NCService with DecorateAsScala {
      * @param mdlId Model ID.
      * @return
      */
-    def getModel(mdlId: String, parent: Span = null): Option[NCModelDecorator] =
+    def getModel(mdlId: String, parent: Span = null): Option[NCModelWrapper] =
        startScopedSpan("getModel", parent, "modelId" → mdlId) { _ ⇒
            mux.synchronized {
                models.get(mdlId)
@@ -754,6 +128,7 @@ object NCModelManager extends NCService with DecorateAsScala {
    }

    /**
+     * TODO:
      * Gets model data which can be transferred between probe and server.
      *
      * @param mdlId Model ID.
@@ -762,8 +137,7 @@ object NCModelManager extends NCService with DecorateAsScala {
      */
    def getModelInfo(mdlId: String, parent: Span = null): java.util.Map[String, Any] =
        startScopedSpan("getModel", parent, "mdlId" → mdlId) { _ ⇒
-            val mdl = mux.synchronized { models.get(mdlId) }.
-                getOrElse(throw new NCE(s"Model not found: '$mdlId'")).wrapper
+            val mdl = mux.synchronized { models.get(mdlId) }.getOrElse(throw new NCE(s"Model not found: '$mdlId'"))

            val data = new util.HashMap[String, Any]()

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala
index 15acb12..cfba3e7 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala
@@ -23,7 +23,7 @@ import com.typesafe.scalalogging.LazyLogging
import io.opencensus.trace.Span
import org.apache.nlpcraft.common.nlp._
import org.apache.nlpcraft.common.{NCService, _}
-import org.apache.nlpcraft.probe.mgrs.NCModelDecorator
+import org.apache.nlpcraft.model.impl.NCModelWrapper

import scala.collection.Map
import scala.language.implicitConversions
@@ -42,5 +42,5 @@ abstract class NCProbeEnricher extends NCService with LazyLogging {
      * @param parent Span parent.
      */
    @throws[NCE]
-    def enrich(mdl: NCModelDecorator, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span): Unit
+    def enrich(mdl: NCModelWrapper, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span): Unit
}
\ No newline at end of file
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
index acea56c..850b7a8 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
@@ -30,7 +30,7 @@ import org.apache.nlpcraft.common.config.NCConfigurable
import org.apache.nlpcraft.common.debug.NCLogHolder
import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote}
import org.apache.nlpcraft.model._
-import org.apache.nlpcraft.model.impl.{NCModelWrapper, NCTokenLogger}
+import org.apache.nlpcraft.model.impl.NCTokenLogger
import org.apache.nlpcraft.model.intent.impl.NCIntentSolverInput
import org.apache.nlpcraft.model.opencensus.stats.NCOpenCensusModelStats
import org.apache.nlpcraft.model.tools.embedded.NCEmbeddedResult
@@ -315,7 +315,7 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
            logger.info(s"REJECT response $msgName sent [srvReqId=$srvReqId, response=${errMsg.get}]")
        }

-        val mdlDec = NCModelManager
+        val mdl = NCModelManager
            .getModel(mdlId, span)
            .getOrElse(throw new NCE(s"Model not found: $mdlId"))

@@ -324,7 +324,7 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
        val validNlpSens =
            nlpSens.flatMap(nlpSen ⇒
                try {
-                    NCValidateManager.preValidate(mdlDec, nlpSen, span)
+                    NCValidateManager.preValidate(mdl, nlpSen, span)

                    Some(nlpSen)
                }
@@ -361,14 +361,14 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
        val sensSeq = validNlpSens.flatMap(nlpSen ⇒ {
            // Independent of references.
-            NCDictionaryEnricher.enrich(mdlDec, nlpSen, senMeta, span)
-            NCSuspiciousNounsEnricher.enrich(mdlDec, nlpSen, senMeta, span)
-            NCStopWordEnricher.enrich(mdlDec, nlpSen, senMeta, span)
+            NCDictionaryEnricher.enrich(mdl, nlpSen, senMeta, span)
+            NCSuspiciousNounsEnricher.enrich(mdl, nlpSen, senMeta, span)
+            NCStopWordEnricher.enrich(mdl, nlpSen, senMeta, span)

            case class Holder(enricher: NCProbeEnricher, getNotes: () ⇒ Seq[NCNlpSentenceNote])

            def get(name: String, e: NCProbeEnricher): Option[Holder] =
-                if (mdlDec.wrapper.getEnabledBuiltInTokens.contains(name))
+                if (mdl.getEnabledBuiltInTokens.contains(name))
                    Some(Holder(e, () ⇒ nlpSen.flatten.filter(_.noteType == name)))
                else
                    None
@@ -394,7 +394,7 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
                    def get(): Seq[NCNlpSentenceNote] = h.getNotes().sortBy(p ⇒ (p.tokenIndexes.head, p.noteType))

                    val notes1 = get()

-                    h → h.enricher.enrich(mdlDec, nlpSen, senMeta, span)
+                    h → h.enricher.enrich(mdl, nlpSen, senMeta, span)

                    val notes2 = get()
@@ -434,7 +434,7 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
                }).toMap

                // The loop makes sense only if the model is complex (i.e. has user-defined parsers or DSL-based synonyms).
-                continue = NCModelEnricher.isComplex(mdlDec) && res.exists { case (_, same) ⇒ !same }
+                continue = NCModelEnricher.isComplex(mdl) && res.exists { case (_, same) ⇒ !same }

                if (DEEP_DEBUG)
                    if (continue) {
@@ -464,7 +464,7 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
        // Final validation before execution.
        try
-            sensSeq.foreach(NCValidateManager.postValidate(mdlDec, _, span))
+            sensSeq.foreach(NCValidateManager.postValidate(mdl, _, span))
        catch {
            case e: NCValidateException ⇒
                val (errMsg, errCode) = getError(e.code)
@@ -487,13 +487,13 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
        val meta = mutable.HashMap.empty[String, Any] ++ senMeta
        val req = NCRequestImpl(meta, srvReqId)

-        var senVars = mdlDec.makeVariants(srvReqId, sensSeq)
+        var senVars = mdl.makeVariants(srvReqId, sensSeq)

        // Sentence variants can be filtered by model.
        val fltSenVars: Seq[(NCVariant, Int)] =
            senVars.
                zipWithIndex.
-                flatMap { case (variant, i) ⇒ if (mdlDec.wrapper.onParsedVariant(variant)) Some(variant, i) else None }
+                flatMap { case (variant, i) ⇒ if (mdl.onParsedVariant(variant)) Some(variant, i) else None }

        senVars = fltSenVars.map(_._1)
        val allVars = senVars.flatMap(_.asScala)
@@ -528,7 +528,7 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
        // Create model query context.
        val ctx: NCContext = new NCContext {
            override lazy val getRequest: NCRequest = req
-            override lazy val getModel: NCModel = mdlDec.wrapper
+            override lazy val getModel: NCModel = mdl
            override lazy val getServerRequestId: String = srvReqId

            override lazy val getConversation: NCConversation = new NCConversation {
@@ -546,7 +546,7 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {

                logKey = U.mkLogHolderKey(srvReqId)

-                val meta = mdlDec.wrapper.getMetadata
+                val meta = mdl.getMetadata

                meta.synchronized {
                    meta.put(logKey, logHldr)
@@ -572,19 +572,17 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {

        def onFinish(): Unit = {
            if (logKey != null)
-                mdlDec.wrapper.getMetadata.remove(logKey)
+                mdl.getMetadata.remove(logKey)

            span.end()
        }

-        val mdl: NCModelWrapper = mdlDec.wrapper
-
        val solverIn = new NCIntentSolverInput(ctx)

        // Execute model query asynchronously.
        U.asFuture(
            _ ⇒ {
-                var res = mdlDec.wrapper.onContext(ctx)
+                var res = mdl.onContext(ctx)

                start = System.currentTimeMillis()
@@ -627,7 +625,7 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
                    if (e.getCause != null)
                        logger.info(s"Rejection cause:", e.getCause)

-                    val res = mdlDec.wrapper.onRejection(solverIn.intentMatch, e)
+                    val res = mdl.onRejection(solverIn.intentMatch, e)

                    if (res != null)
                        respondWithResult(res, None)
@@ -656,7 +654,7 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {

                logger.error(s"Unexpected error for server request ID: $srvReqId", e)

-                val res = mdlDec.wrapper.onError(ctx, e)
+                val res = mdl.onError(ctx, e)

                if (res != null)
                    respondWithResult(res, None)
@@ -682,7 +680,7 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
                    "resBody" → res.getBody
                )

-                val res0 = mdlDec.wrapper.onResult(solverIn.intentMatch, res)
+                val res0 = mdl.onResult(solverIn.intentMatch, res)

                respondWithResult(if (res0 != null) res0 else res, if (logHldr != null) Some(logHldr.toJson) else None)
            }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/dictionary/NCDictionaryEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/dictionary/NCDictionaryEnricher.scala
index b1cd2fa..4905273 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/dictionary/NCDictionaryEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/dictionary/NCDictionaryEnricher.scala
@@ -24,7 +24,7 @@ import org.apache.nlpcraft.common.nlp._
import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager
import org.apache.nlpcraft.common.nlp.dict._
import org.apache.nlpcraft.common.{NCService, _}
-import org.apache.nlpcraft.probe.mgrs.NCModelDecorator
+import org.apache.nlpcraft.model.impl.NCModelWrapper
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher

import scala.collection.Map
@@ -54,10 +54,10 @@ object NCDictionaryEnricher extends NCProbeEnricher {
    }

    @throws[NCE]
-    override def enrich(mdl: NCModelDecorator, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit =
+    override def enrich(mdl: NCModelWrapper, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit =
        startScopedSpan("enrich", parent,
            "srvReqId" → ns.srvReqId,
-            "modelId" → mdl.wrapper.getId,
+            "modelId" → mdl.getId,
            "txt" → ns.text) { _ ⇒
            ns.foreach(t ⇒ {
                // Dictionary.
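The same mdlDec → mdl simplification repeats for every enricher below. For context on the callback chain above (onContext, then the intent solver, then onResult/onRejection/onError), this is roughly how a model hooks it; a minimal sketch assuming the NCModelAdapter convenience base class (illustration only, not part of this commit):

import org.apache.nlpcraft.model._

class MyModel extends NCModelAdapter("my.model.id", "My Model", "1.0") {
    // Returning null hands the query over to the intent solver,
    // mirroring the mdl.onContext(ctx) call in the manager above.
    override def onContext(ctx: NCContext): NCResult = null

    // A non-null result here would override the rejection, as seen in the
    // 'if (res != null) respondWithResult(res, None)' branch above.
    override def onRejection(ctx: NCIntentMatch, e: NCRejection): NCResult = null

    // Pass the solver result through unchanged.
    override def onResult(ctx: NCIntentMatch, res: NCResult): NCResult = res
}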
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
index 7b583d6..4286b34 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
@@ -25,7 +25,7 @@ import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager
import org.apache.nlpcraft.common.nlp.numeric.{NCNumeric, NCNumericManager}
import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken}
import org.apache.nlpcraft.common.{NCE, NCService}
-import org.apache.nlpcraft.probe.mgrs.NCModelDecorator
+import org.apache.nlpcraft.model.impl.NCModelWrapper
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher

import scala.collection.JavaConverters._
@@ -236,10 +236,10 @@ object NCLimitEnricher extends NCProbeEnricher {
    }

    @throws[NCE]
-    override def enrich(mdl: NCModelDecorator, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit =
+    override def enrich(mdl: NCModelWrapper, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit =
        startScopedSpan("enrich", parent,
            "srvReqId" → ns.srvReqId,
-            "modelId" → mdl.wrapper.getId,
+            "modelId" → mdl.getId,
            "txt" → ns.text) { _ ⇒
            val notes = mutable.HashSet.empty[NCNlpSentenceNote]
            val numsMap = NCNumericManager.find(ns).filter(_.unit.isEmpty).map(p ⇒ p.tokens → p).toMap
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index dcd71a2..c228c97 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -24,9 +24,10 @@ import io.opencensus.trace.Span
import org.apache.nlpcraft.common._
import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken, _}
import org.apache.nlpcraft.model._
+import org.apache.nlpcraft.model.impl.NCModelWrapper
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl
-import org.apache.nlpcraft.probe.mgrs.{NCModelDecorator, NCSynonym}
+import org.apache.nlpcraft.probe.mgrs.NCSynonym

import scala.collection.JavaConverters._
import scala.collection.convert.DecorateAsScala
@@ -297,15 +298,15 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
      */
    private def alreadyMarked(toks: Seq[NCNlpSentenceToken], elemId: String): Boolean = toks.forall(_.isTypeOf(elemId))

-    def isComplex(mdl: NCModelDecorator): Boolean = mdl.synsDsl.nonEmpty || !mdl.wrapper.getParsers.isEmpty
+    def isComplex(mdl: NCModelWrapper): Boolean = mdl.synsDsl.nonEmpty || !mdl.getParsers.isEmpty

    @throws[NCE]
-    override def enrich(mdl: NCModelDecorator, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit =
+    override def enrich(mdl: NCModelWrapper, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit =
        startScopedSpan("enrich", parent,
            "srvReqId" → ns.srvReqId,
-            "modelId" → mdl.wrapper.getId,
+            "modelId" → mdl.getId,
            "txt" → ns.text) { span ⇒
-            val jiggleFactor = mdl.wrapper.getJiggleFactor
+            val jiggleFactor = mdl.getJiggleFactor
            val cache = mutable.HashSet.empty[Seq[Int]]
            val matches = ArrayBuffer.empty[ElementMatch]
@@ -392,7 +393,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {

            startScopedSpan("jiggleProc", span,
                "srvReqId" → ns.srvReqId,
-                "modelId" → mdl.wrapper.getId,
+                "modelId" → mdl.getId,
                "txt" → ns.text) { _ ⇒
                // Iterate over depth-limited permutations of the original sentence with and without stopwords.
                jiggle(ns, jiggleFactor).foreach(procPerm)
@@ -413,7 +414,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
            for ((m, idx) ← matches.zipWithIndex) {
                if (DEEP_DEBUG)
                    logger.trace(
-                        s"Model '${mdl.wrapper.getId}' element found (${idx + 1} of $matchCnt) [" +
+                        s"Model '${mdl.getId}' element found (${idx + 1} of $matchCnt) [" +
                            s"elementId=${m.element.getId}, " +
                            s"synonym=${m.synonym}, " +
                            s"tokens=${tokString(m.tokens)}" +
@@ -429,14 +430,14 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                mark(ns, elem = elm, toks = m.tokens, direct = direct, syn = Some(syn), metaOpt = None, parts = m.parts)
            }

-            val parsers = mdl.wrapper.getParsers
+            val parsers = mdl.getParsers

            for (parser ← parsers.asScala) {
                parser.onInit()

                startScopedSpan("customParser", span,
                    "srvReqId" → ns.srvReqId,
-                    "modelId" → mdl.wrapper.getId,
+                    "modelId" → mdl.getId,
                    "txt" → ns.text) { _ ⇒
                    def to(t: NCNlpSentenceToken): NCCustomWord =
                        new NCCustomWord {
@@ -458,7 +459,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {

                    val res = parser.parse(
                        NCRequestImpl(senMeta, ns.srvReqId),
-                        mdl.wrapper,
+                        mdl,
                        ns.map(to).asJava,
                        ns.flatten.distinct.filter(!_.isNlp).map(n ⇒ {
                            val noteId = n.noteType
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
index da439d4..d223a01 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
@@ -24,7 +24,7 @@ import org.apache.nlpcraft.common.makro.NCMacroParser
import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager
import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken}
import org.apache.nlpcraft.common.{NCE, NCService}
-import org.apache.nlpcraft.probe.mgrs.NCModelDecorator
+import org.apache.nlpcraft.model.impl.NCModelWrapper
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher

import scala.collection.JavaConverters._
@@ -138,10 +138,10 @@ object NCRelationEnricher extends NCProbeEnricher {
    }

    @throws[NCE]
-    override def enrich(mdl: NCModelDecorator, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit =
+    override def enrich(mdl: NCModelWrapper, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit =
        startScopedSpan("enrich", parent,
            "srvReqId" → ns.srvReqId,
-            "modelId" → mdl.wrapper.getId,
+            "modelId" → mdl.getId,
            "txt" → ns.text) { _ ⇒
            // Tries to grab tokens in a direct way.
            // Example: A, B, C ⇒ ABC, AB, BC .. (AB will be processed first)
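The span ordering promised in the comment above (ABC, AB, BC, with AB before BC) amounts to enumerating contiguous token spans longest-first. A standalone sketch with strings standing in for tokens (illustration only, not the enricher's actual implementation):

object SpanOrderDemo extends App {
    // All contiguous spans, longest first; within a length, left to right.
    def spans[T](toks: Seq[T]): Seq[Seq[T]] =
        (toks.length to 1 by -1).flatMap(len ⇒ toks.sliding(len).toSeq)

    // Prints: ABC, AB, BC, A, B, C
    spans(Seq("A", "B", "C")).map(_.mkString).foreach(println)
}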
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index 33d4c4c..67e4ec5 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -24,7 +24,7 @@ import org.apache.nlpcraft.common.NCService
import org.apache.nlpcraft.common.makro.NCMacroParser
import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager
import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken}
-import org.apache.nlpcraft.probe.mgrs.NCModelDecorator
+import org.apache.nlpcraft.model.impl.NCModelWrapper
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher

import scala.collection.JavaConverters._
@@ -415,10 +415,10 @@ object NCSortEnricher extends NCProbeEnricher {
        toks.length == toks2.length || toks.count(isImportant) == toks2.count(isImportant)
    }

-    override def enrich(mdl: NCModelDecorator, ns: NCNlpSentence, meta: Map[String, Serializable], parent: Span): Unit =
+    override def enrich(mdl: NCModelWrapper, ns: NCNlpSentence, meta: Map[String, Serializable], parent: Span): Unit =
        startScopedSpan("enrich", parent,
            "srvReqId" → ns.srvReqId,
-            "modelId" → mdl.wrapper.getId,
+            "modelId" → mdl.getId,
            "txt" → ns.text) { _ ⇒
            val notes = mutable.HashSet.empty[NCNlpSentenceNote]
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/stopword/NCStopWordEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/stopword/NCStopWordEnricher.scala
index 089b5ff..8d52564 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/stopword/NCStopWordEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/stopword/NCStopWordEnricher.scala
@@ -23,7 +23,7 @@ import io.opencensus.trace.Span
import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager
import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceToken}
import org.apache.nlpcraft.common.{NCE, NCService, U}
-import org.apache.nlpcraft.probe.mgrs.NCModelDecorator
+import org.apache.nlpcraft.model.impl.NCModelWrapper
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher

import scala.annotation.tailrec
@@ -176,12 +176,12 @@ object NCStopWordEnricher extends NCProbeEnricher {
    /**
      * Marks as stopwords the words whose POS is in the configured list and that are also placed before other stop words.
      */
-    private def processCommonStops(mdl: NCModelDecorator, ns: NCNlpSentence): Unit = {
+    private def processCommonStops(mdl: NCModelWrapper, ns: NCNlpSentence): Unit = {
        /**
          * Marks as stopwords the words whose POS is in the configured list and that are also placed before other stop words.
          */
        @tailrec
-        def processCommonStops0(mdl: NCModelDecorator, ns: NCNlpSentence): Unit = {
+        def processCommonStops0(mdl: NCModelWrapper, ns: NCNlpSentence): Unit = {
            val max = ns.size - 1
            var stop = true
@@ -206,11 +206,11 @@ object NCStopWordEnricher extends NCProbeEnricher {
    }

    @throws[NCE]
-    override def enrich(mdl: NCModelDecorator, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = {
+    override def enrich(mdl: NCModelWrapper, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = {
        def mark(stems: Set[String], f: Boolean): Unit =
            ns.filter(t ⇒ stems.contains(t.stem)).foreach(t ⇒ ns.fixNote(t.getNlpNote, "stopWord" → f))

-        startScopedSpan("enrich", parent, "srvReqId" → ns.srvReqId, "modelId" → mdl.wrapper.getId, "txt" → ns.text) { _ ⇒
+        startScopedSpan("enrich", parent, "srvReqId" → ns.srvReqId, "modelId" → mdl.getId, "txt" → ns.text) { _ ⇒
            mark(mdl.exclStopWordsStems, f = false)
            mark(mdl.addStopWordsStems, f = true)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/suspicious/NCSuspiciousNounsEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/suspicious/NCSuspiciousNounsEnricher.scala
index de7799d..e797051 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/suspicious/NCSuspiciousNounsEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/suspicious/NCSuspiciousNounsEnricher.scala
@@ -22,7 +22,7 @@ import java.io.Serializable
import io.opencensus.trace.Span
import org.apache.nlpcraft.common.{NCE, NCService}
import org.apache.nlpcraft.common.nlp._
-import org.apache.nlpcraft.probe.mgrs.NCModelDecorator
+import org.apache.nlpcraft.model.impl.NCModelWrapper
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher

import scala.collection.Map
@@ -40,10 +40,10 @@ object NCSuspiciousNounsEnricher extends NCProbeEnricher {
    }

    @throws[NCE]
-    override def enrich(mdl: NCModelDecorator, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit =
+    override def enrich(mdl: NCModelWrapper, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit =
        startScopedSpan("enrich", parent,
            "srvReqId" → ns.srvReqId,
-            "modelId" → mdl.wrapper.getId,
+            "modelId" → mdl.getId,
            "txt" → ns.text) { _ ⇒
            ns.filter(t ⇒ mdl.suspWordsStems.contains(t.stem)).foreach(t ⇒ ns.fixNote(t.getNlpNote, "suspNoun" → true))
    }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/validate/NCValidateManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/validate/NCValidateManager.scala
index 42bce81..6cde756 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/validate/NCValidateManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/validate/NCValidateManager.scala
@@ -22,7 +22,7 @@ import io.opencensus.trace.Span
import org.apache.tika.langdetect.OptimaizeLangDetector
import org.apache.nlpcraft.common.NCService
import org.apache.nlpcraft.common.nlp.NCNlpSentence
-import org.apache.nlpcraft.probe.mgrs.NCModelDecorator
+import org.apache.nlpcraft.model.impl.NCModelWrapper

/**
 * Probe pre/post enrichment validator.
@@ -51,22 +51,21 @@ object NCValidateManager extends NCService with LazyLogging {
      * @param parent Parent tracing span.
      */
    @throws[NCValidateException]
-    def preValidate(mdl: NCModelDecorator, ns: NCNlpSentence, parent: Span = null): Unit =
+    def preValidate(mdl: NCModelWrapper, ns: NCNlpSentence, parent: Span = null): Unit =
        startScopedSpan("validate", parent,
            "srvReqId" → ns.srvReqId,
            "txt" → ns.text,
-            "modelId" → mdl.wrapper.getId) { _ ⇒
-            val model = mdl.wrapper
-
-            if (!model.isNotLatinCharsetAllowed && !ns.text.matches("""[\s\w\p{Punct}]+"""))
+            "modelId" → mdl.getId) { _ ⇒
+
+            if (!mdl.isNotLatinCharsetAllowed && !ns.text.matches("""[\s\w\p{Punct}]+"""))
                throw NCValidateException("ALLOW_NON_LATIN_CHARSET")
-            if (!model.isNonEnglishAllowed && !langFinder.detect(ns.text).isLanguage("en"))
+            if (!mdl.isNonEnglishAllowed && !langFinder.detect(ns.text).isLanguage("en"))
                throw NCValidateException("ALLOW_NON_ENGLISH")
-            if (!model.isNoNounsAllowed && !ns.exists(_.pos.startsWith("n")))
+            if (!mdl.isNoNounsAllowed && !ns.exists(_.pos.startsWith("n")))
                throw NCValidateException("ALLOW_NO_NOUNS")
-            if (model.getMinWords > ns.map(_.wordLength).sum)
+            if (mdl.getMinWords > ns.map(_.wordLength).sum)
                throw NCValidateException("MIN_WORDS")
-            if (ns.size > model.getMaxTokens)
+            if (ns.size > mdl.getMaxTokens)
                throw NCValidateException("MAX_TOKENS")
        }
@@ -77,30 +76,29 @@ object NCValidateManager extends NCService with LazyLogging {
      * @param parent Optional parent span.
      */
    @throws[NCValidateException]
-    def postValidate(mdl: NCModelDecorator, ns: NCNlpSentence, parent: Span = null): Unit =
+    def postValidate(mdl: NCModelWrapper, ns: NCNlpSentence, parent: Span = null): Unit =
        startScopedSpan("validate", parent,
            "srvReqId" → ns.srvReqId,
            "txt" → ns.text,
-            "modelId" → mdl.wrapper.getId) { _ ⇒
+            "modelId" → mdl.getId) { _ ⇒
            val types = ns.flatten.filter(!_.isNlp).map(_.noteType).distinct
            val overlapNotes = ns.map(tkn ⇒ types.flatMap(tp ⇒ tkn.getNotes(tp))).filter(_.size > 1).flatten
-            val model = mdl.wrapper
-
+
            if (overlapNotes.nonEmpty)
                throw NCValidateException("OVERLAP_NOTES")
-            if (!model.isNoUserTokensAllowed && !ns.exists(_.exists(!_.noteType.startsWith("nlpcraft:"))))
+            if (!mdl.isNoUserTokensAllowed && !ns.exists(_.exists(!_.noteType.startsWith("nlpcraft:"))))
                throw NCValidateException("ALLOW_NO_USER_TOKENS")
-            if (!model.isSwearWordsAllowed && ns.exists(_.getNlpValueOpt[Boolean]("swear").getOrElse(false)))
+            if (!mdl.isSwearWordsAllowed && ns.exists(_.getNlpValueOpt[Boolean]("swear").getOrElse(false)))
                throw NCValidateException("ALLOW_SWEAR_WORDS")
-            if (model.getMinNonStopwords > ns.count(!_.isStopWord))
+            if (mdl.getMinNonStopwords > ns.count(!_.isStopWord))
                throw NCValidateException("MIN_NON_STOPWORDS")
-            if (model.getMinTokens > ns.size)
+            if (mdl.getMinTokens > ns.size)
                throw NCValidateException("MIN_TOKENS")
-            if (model.getMaxUnknownWords < ns.count(t ⇒ t.isNlp && !t.isSynthetic && !t.isKnownWord))
+            if (mdl.getMaxUnknownWords < ns.count(t ⇒ t.isNlp && !t.isSynthetic && !t.isKnownWord))
                throw NCValidateException("MAX_UNKNOWN_WORDS")
-            if (model.getMaxSuspiciousWords < ns.count(_.getNlpValueOpt[Boolean]("suspNoun").getOrElse(false)))
+            if (mdl.getMaxSuspiciousWords < ns.count(_.getNlpValueOpt[Boolean]("suspNoun").getOrElse(false)))
                throw NCValidateException("MAX_SUSPICIOUS_WORDS")
-            if (model.getMaxFreeWords < ns.count(_.isNlp))
+            if (mdl.getMaxFreeWords < ns.count(_.isNlp))
                throw NCValidateException("MAX_FREE_WORDS")
        }
}
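The pre-validation rules above reduce to simple predicates over the sentence. The sketch below mirrors a few of them on plain strings, mapped to the same error codes (illustration only; the real checks run against NCNlpSentence, and the minWords/maxTokens defaults here are made up):

object PreValidateDemo extends App {
    def preValidate(txt: String, minWords: Int = 1, maxTokens: Int = 50): Either[String, Unit] = {
        val words = txt.trim.split("\\s+").filter(_.nonEmpty)

        // Same charset regex as NCValidateManager.preValidate above.
        if (!txt.matches("""[\s\w\p{Punct}]+"""))
            Left("ALLOW_NON_LATIN_CHARSET")
        else if (words.length < minWords)
            Left("MIN_WORDS")
        else if (words.length > maxTokens)
            Left("MAX_TOKENS")
        else
            Right(())
    }

    println(preValidate("ping"))   // Right(())
    println(preValidate("привет")) // Left(ALLOW_NON_LATIN_CHARSET)
}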
