This is an automated email from the ASF dual-hosted git repository. sergeykamov pushed a commit to branch NLPCRAFT-70_NEW in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 544d8a40860393698ec80b5c992e890770469c16 Author: Sergey Kamov <[email protected]> AuthorDate: Mon Jun 28 14:33:32 2021 +0300 WIP. --- nlpcraft/pom.xml | 4 ++++ .../apache/nlpcraft/common/nlp/NCNlpSentence.scala | 2 +- .../enrichers/ctxword/NCContextWordEnricher.scala | 26 ++++++++++++++++++++-- pom.xml | 7 ++++++ 4 files changed, 36 insertions(+), 3 deletions(-) diff --git a/nlpcraft/pom.xml b/nlpcraft/pom.xml index 4d8c292..62d3683 100644 --- a/nlpcraft/pom.xml +++ b/nlpcraft/pom.xml @@ -232,6 +232,10 @@ <groupId>org.jline</groupId> <artifactId>jline</artifactId> </dependency> + <dependency> + <groupId>org.jibx</groupId> + <artifactId>jibx-tools</artifactId> + </dependency> <!-- Test dependencies. --> <dependency> diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala index d237558..ed22935 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala @@ -52,7 +52,7 @@ class NCNlpSentence( val srvReqId: String, val text: String, val enabledBuiltInToks: Set[String], - val mlConfig: Option[NCModelMLConfigMdo], + val mlConfig: Option[NCModelMLConfigMdo] = None, var mlData: Map[Int, Map[String, Double]] = Map.empty, override val tokens: mutable.ArrayBuffer[NCNlpSentenceToken] = new mutable.ArrayBuffer[NCNlpSentenceToken](32), var firstProbePhase: Boolean = true, diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala index 4982e29..79c970e 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala @@ -18,12 +18,16 @@ package org.apache.nlpcraft.server.nlp.enrichers.ctxword import io.opencensus.trace.Span -import org.apache.nlpcraft.common.nlp.core.NCNlpPorterStemmer.stem +import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager +import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager.stem +import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceToken} import org.apache.nlpcraft.common.{NCE, NCService, U} import org.apache.nlpcraft.server.mdo.NCModelMLConfigMdo import org.apache.nlpcraft.server.nlp.enrichers.NCServerEnricher +import org.apache.nlpcraft.server.nlp.enrichers.basenlp.NCBaseNlpEnricher import org.apache.nlpcraft.server.sugsyn.{NCSuggestSynonymManager, NCSuggestionRequest, NCWordSuggestion} +import org.jibx.schema.codegen.extend.DefaultNameConverter import scala.collection.mutable import scala.concurrent.Await @@ -33,9 +37,14 @@ import scala.concurrent.duration.Duration * ContextWord enricher. */ object NCContextWordEnricher extends NCServerEnricher { + private final val POS_PLURALS = Set("NNS", "NNPS") + private final val POS_SINGULAR = Set("NN", "NNP") + private final val MAX_CTXWORD_SCORE = 2 private final val EXCLUSIVE_MIN_SCORE = -1.0 + private final val CONVERTER = new DefaultNameConverter + private case class ModelProbeKey(probeId: String, modelId: String) private case class ElementScore(elementId: String, averageScore: Double, senScore: Double, sampleScore: Double) @@ -171,6 +180,19 @@ object NCContextWordEnricher extends NCServerEnricher { @throws[NCE] private def askSamples(cfg: NCModelMLConfigMdo): ElementStemScore = { val sampleWords = cfg.samples.map(spaceTokenize).toSeq + + + sampleWords.map(s => { + val sampleSen = new NCNlpSentence("sampleReqId", sampleWords.mkString(" "), Set.empty) + + NCBaseNlpEnricher.enrich(sampleSen) + + sampleSen. + }) + + + + val sampleWordsStems = sampleWords.map(_.map(stem)) val recs: Map[String, Seq[NCSuggestionRequest]] = @@ -227,7 +249,7 @@ object NCContextWordEnricher extends NCServerEnricher { } } - val nounToks = ns.tokens.filter(_.pos.startsWith("N")) + val nounToks = ns.tokens.filter(t => NCPennTreebank.NOUNS_POS.contains(t.pos)) if (nounToks.nonEmpty) { val key = ModelProbeKey(cfg.probeId, cfg.modelId) diff --git a/pom.xml b/pom.xml index fd0d687..59e871a 100644 --- a/pom.xml +++ b/pom.xml @@ -154,6 +154,7 @@ <lightstep.grpc.ver>0.15.8</lightstep.grpc.ver> <junit.ver>5.5.1</junit.ver> <jsonpath.ver>2.4.0</jsonpath.ver> + <jibx.tools.ver>1.3.3</jibx.tools.ver> <!-- Force specific encoding on text resources. --> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> @@ -474,6 +475,12 @@ </dependency> <dependency> + <groupId>org.jibx</groupId> + <artifactId>jibx-tools</artifactId> + <version>${jibx.tools.ver}</version> + </dependency> + + <dependency> <groupId>edu.stanford.nlp</groupId> <artifactId>stanford-corenlp</artifactId> <version>${stanford.corenlp.ver}</version>
