This is an automated email from the ASF dual-hosted git repository. sergeykamov pushed a commit to branch NLPCRAFT-70_NEW in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 022469eb9dc06dbd14a6166aefcf7ce091c06e7e Author: Sergey Kamov <[email protected]> AuthorDate: Wed Jun 16 20:39:47 2021 +0300 WIP. --- .../apache/nlpcraft/common/nlp/NCNlpSentence.scala | 4 ++ .../probe/mgrs/conn/NCConnectionManager.scala | 29 +++++++++++- .../nlpcraft/server/mdo/NCProbeModelMdo.scala | 9 +++- .../nlp/enrichers/NCServerEnrichmentManager.scala | 17 +++++--- .../enrichers/ctxword/ContextWordEnricher.scala | 51 ++++++++++++++++++++++ .../nlpcraft/server/probe/NCProbeManager.scala | 25 ++++++++--- .../nlpcraft/server/query/NCQueryManager.scala | 4 +- 7 files changed, 124 insertions(+), 15 deletions(-) diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala index f508745..eef05de 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala @@ -18,6 +18,7 @@ package org.apache.nlpcraft.common.nlp import org.apache.nlpcraft.common._ +import org.apache.nlpcraft.server.mdo.NCModelMLConfigMdo import java.io.{Serializable => JSerializable} import java.util.{Collections, List => JList} @@ -40,6 +41,7 @@ import org.apache.nlpcraft.common.nlp.NCNlpSentence._ * @param srvReqId Server request ID. * @param text Normalized text. * @param enabledBuiltInToks Enabled built-in tokens. + * @param mlConfig Machine learning configuration. Optional. * @param tokens Initial buffer. * @param firstProbePhase Processing phase flag. * @param deletedNotes Deleted overridden notes with their tokens. @@ -50,6 +52,7 @@ class NCNlpSentence( val srvReqId: String, val text: String, val enabledBuiltInToks: Set[String], + val mlConfig: Option[NCModelMLConfigMdo], override val tokens: mutable.ArrayBuffer[NCNlpSentenceToken] = new mutable.ArrayBuffer[NCNlpSentenceToken](32), var firstProbePhase: Boolean = true, private val deletedNotes: mutable.HashMap[NCNlpSentenceNote, Seq[NCNlpSentenceToken]] = mutable.HashMap.empty, @@ -67,6 +70,7 @@ class NCNlpSentence( srvReqId = srvReqId, text = text, enabledBuiltInToks = enabledBuiltInToks, + mlConfig = mlConfig, tokens = tokens.map(_.clone()), deletedNotes = deletedNotes.map(p => p._1.clone() -> p._2.map(_.clone())), initNlpNotes = initNlpNotes, diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala index 159ffd2..c911342 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala @@ -32,8 +32,9 @@ import java.io.{EOFException, IOException, InterruptedIOException} import java.net.{InetAddress, NetworkInterface} import java.util import java.util.concurrent.CountDownLatch -import java.util.{Properties, TimeZone} +import java.util.{Collections, Properties, TimeZone} import scala.collection.mutable +import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsJava, SeqHasAsJava, SetHasAsScala} /** * Probe down/up link connection manager. @@ -213,6 +214,28 @@ object NCConnectionManager extends NCService { NCModelManager.getAllModels().map(wrapper => { val mdl = wrapper.model + val ctxWordElems = mdl.getElements.asScala.filter(_.isContextWordSupport) + + // TODO: validate: too many values, examples. missed them. + val ( + values, + samples + ): ( + java.util.Map[String, java.util.Map[String, java.util.List[String]]], + java.util.Map[String, java.util.List[String]] + ) = + if (ctxWordElems.isEmpty) + (Collections.emptyMap(), Collections.emptyMap()) + else { + ( + ctxWordElems.map(e => + e.getId -> + e.getValues.asScala.map(p => p.getName -> p.getSynonyms).toMap.asJava + ).toMap.asJava, + wrapper.samples.map(p => p._1 -> p._2.flatMap(p => p).asJava).toMap.asJava + ) + } + // Model already validated. // util.HashSet created to avoid scala collections serialization error. @@ -221,7 +244,9 @@ object NCConnectionManager extends NCService { mdl.getId, mdl.getName, mdl.getVersion, - new util.HashSet[String](mdl.getEnabledBuiltInTokens) + new util.HashSet[String](mdl.getEnabledBuiltInTokens), + values, + samples ) }) ), cryptoKey) diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/mdo/NCProbeModelMdo.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/mdo/NCProbeModelMdo.scala index 16edd61..ad80245 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/mdo/NCProbeModelMdo.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/mdo/NCProbeModelMdo.scala @@ -19,6 +19,12 @@ package org.apache.nlpcraft.server.mdo import org.apache.nlpcraft.server.mdo.impl._ + +@NCMdoEntity(sql = false) +case class NCModelMLConfigMdo( + @NCMdoField values: Map[String /*Element ID*/, Map[/*Value*/String, /*Synonym*/Seq[String]]], + @NCMdoField samples: Map[String /*Element ID*/, Seq[String]/*Samples*/] +) /** * Probe model MDO. */ @@ -27,7 +33,8 @@ case class NCProbeModelMdo( @NCMdoField id: String, @NCMdoField name: String, @NCMdoField version: String, - @NCMdoField enabledBuiltInTokens: Set[String] + @NCMdoField enabledBuiltInTokens: Set[String], + @NCMdoField mlConfig: Option[NCModelMLConfigMdo] ) extends NCAnnotatedMdo[NCProbeModelMdo] { override def hashCode(): Int = s"$id$name".hashCode() diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala index 4f91bc2..e420676 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala @@ -26,6 +26,7 @@ import org.apache.nlpcraft.common.pool.NCThreadPoolManager import org.apache.nlpcraft.common.{NCService, _} import org.apache.nlpcraft.server.ignite.NCIgniteHelpers._ import org.apache.nlpcraft.server.ignite.NCIgniteInstance +import org.apache.nlpcraft.server.mdo.NCModelMLConfigMdo import org.apache.nlpcraft.server.nlp.core.{NCNlpNerEnricher, NCNlpServerManager} import org.apache.nlpcraft.server.nlp.enrichers.basenlp.NCBaseNlpEnricher import org.apache.nlpcraft.server.nlp.enrichers.coordinate.NCCoordinatesEnricher @@ -90,6 +91,7 @@ object NCServerEnrichmentManager extends NCService with NCIgniteInstance { * @param srvReqId Server request ID. * @param normTxt Normalized text. * @param enabledBuiltInToks Enabled built-in tokens. + * @param mlConf Machine learning configuration. * @param parent Optional parent span. * @return */ @@ -97,9 +99,11 @@ object NCServerEnrichmentManager extends NCService with NCIgniteInstance { srvReqId: String, normTxt: String, enabledBuiltInToks: Set[String], - parent: Span = null): NCNlpSentence = + mlConf: Option[NCModelMLConfigMdo], + parent: Span = null + ): NCNlpSentence = startScopedSpan("process", parent, "srvReqId" -> srvReqId, "txt" -> normTxt) { span => - val s = new NCNlpSentence(srvReqId, normTxt, enabledBuiltInToks) + val s = new NCNlpSentence(srvReqId, normTxt, enabledBuiltInToks, mlConf) // Server-side enrichment pipeline. // NOTE: order of enrichers is IMPORTANT. @@ -134,6 +138,7 @@ object NCServerEnrichmentManager extends NCService with NCIgniteInstance { * @param srvReqId Server request ID. * @param txt Input text. * @param enabledBuiltInToks Set of enabled built-in token IDs. + * @param mlConf Machine learning configuration. * @param parent Optional parent span. */ @throws[NCE] @@ -141,7 +146,9 @@ object NCServerEnrichmentManager extends NCService with NCIgniteInstance { srvReqId: String, txt: String, enabledBuiltInToks: Set[String], - parent: Span = null): NCNlpSentence = { + mlConf: Option[NCModelMLConfigMdo], + parent: Span = null + ): NCNlpSentence = { startScopedSpan("enrichPipeline", parent, "srvReqId" -> srvReqId, "txt" -> txt) { span => val normTxt = NCPreProcessManager.normalize(txt, spellCheck = true, span) @@ -159,9 +166,9 @@ object NCServerEnrichmentManager extends NCService with NCIgniteInstance { h.sentence } else - process(srvReqId, normTxt, enabledBuiltInToks, span) + process(srvReqId, normTxt, enabledBuiltInToks, mlConf, span) case None => - process(srvReqId, normTxt, enabledBuiltInToks, span) + process(srvReqId, normTxt, enabledBuiltInToks, mlConf, span) } } } diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/ContextWordEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/ContextWordEnricher.scala new file mode 100644 index 0000000..c2dd843 --- /dev/null +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/ContextWordEnricher.scala @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nlpcraft.server.nlp.enrichers.ctxword + +import io.opencensus.trace.Span +import org.apache.nlpcraft.common.NCService +import org.apache.nlpcraft.common.nlp.NCNlpSentence +import org.apache.nlpcraft.server.nlp.enrichers.NCServerEnricher + +/** + * ContextWord enricher. + */ +object ContextWordEnricher extends NCServerEnricher { + override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ => + ackStarting() + ackStarted() + } + + override def stop(parent: Span = null): Unit = startScopedSpan("stop", parent) { _ => + ackStopping() + ackStopped() + } + + override def enrich(ns: NCNlpSentence, parent: Span): Unit = { + ns.mlConfig match { + case Some(cfg) => + val nouns = ns.tokens.filter(_.pos.startsWith("N")) + + if (nouns.nonEmpty) { + nouns + } + + case None => // No-op. + } + } +} diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/probe/NCProbeManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/probe/NCProbeManager.scala index f572b9f..67acba8 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/probe/NCProbeManager.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/probe/NCProbeManager.scala @@ -31,7 +31,7 @@ import org.apache.nlpcraft.common.version.NCVersion import org.apache.nlpcraft.common.{NCService, _} import org.apache.nlpcraft.probe.mgrs.NCProbeMessage import org.apache.nlpcraft.server.company.NCCompanyManager -import org.apache.nlpcraft.server.mdo.{NCCompanyMdo, NCProbeMdo, NCProbeModelMdo, NCUserMdo} +import org.apache.nlpcraft.server.mdo.{NCCompanyMdo, NCModelMLConfigMdo, NCProbeMdo, NCProbeModelMdo, NCUserMdo} import org.apache.nlpcraft.server.nlp.enrichers.NCServerEnrichmentManager import org.apache.nlpcraft.server.proclog.NCProcessLogManager import org.apache.nlpcraft.server.query.NCQueryManager @@ -45,7 +45,7 @@ import java.util.Collections import java.util.concurrent.ConcurrentHashMap import scala.collection.mutable import scala.concurrent.{ExecutionContext, Future, Promise} -import scala.jdk.CollectionConverters.SetHasAsScala +import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsScala, SetHasAsScala} import scala.util.{Failure, Success} /** @@ -613,25 +613,40 @@ object NCProbeManager extends NCService { String, String, String, - java.util.Set[String] + java.util.Set[String], + java.util.Map[String, java.util.Map[String, java.util.List[String]]], + java.util.Map[String, java.util.List[String]] )]]("PROBE_MODELS"). map { case ( mdlId, mdlName, mdlVer, - enabledBuiltInToks + enabledBuiltInToks, + values, + samples ) => require(mdlId != null) require(mdlName != null) require(mdlVer != null) require(enabledBuiltInToks != null) + require(values.isEmpty ^ samples.isEmpty) NCProbeModelMdo( id = mdlId, name = mdlName, version = mdlVer, - enabledBuiltInTokens = enabledBuiltInToks.asScala.toSet + enabledBuiltInTokens = enabledBuiltInToks.asScala.toSet, + mlConfig = + if (!values.isEmpty) + Some( + NCModelMLConfigMdo( + values = values.asScala.map(p => p._1 -> p._2.asScala.map(p => p._1 -> p._2.asScala.toSeq).toMap).toMap, + samples = samples.asScala.map(p => p._1 -> p._2.asScala.toSeq).toMap + ) + ) + else + None ) }.toSet diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/query/NCQueryManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/query/NCQueryManager.scala index f4d2afe..32492b9 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/query/NCQueryManager.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/query/NCQueryManager.scala @@ -272,7 +272,7 @@ object NCQueryManager extends NCService with NCIgniteInstance with NCOpenCensusS logger.info(s"New request received:\n$tbl") - val enabledBuiltInToks = NCProbeManager.getModel(mdlId, span).enabledBuiltInTokens + val mdl = NCProbeManager.getModel(mdlId, span) @throws[NCE] def unzipProperties(gzipOpt: Option[String]): Option[JavaMeta] = @@ -288,7 +288,7 @@ object NCQueryManager extends NCService with NCIgniteInstance with NCOpenCensusS company, mdlId, txt0, - NCServerEnrichmentManager.enrichPipeline(srvReqId, txt0, enabledBuiltInToks), + NCServerEnrichmentManager.enrichPipeline(srvReqId, txt0, mdl.enabledBuiltInTokens, mdl.mlConfig), usrAgent, rmtAddr, data,
