This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-70_NEW
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-70_NEW by this push:
new 57a6a7f WIP.
57a6a7f is described below
commit 57a6a7fa97f5e2ac13cf4054ca36f56293759827
Author: Sergey Kamov <[email protected]>
AuthorDate: Tue Jun 29 20:41:15 2021 +0300
WIP.
---
.../apache/nlpcraft/common/nlp/NCNlpSentence.scala | 2 +-
.../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 3 +-
.../enrichers/ctxword/NCContextWordEnricher.scala | 67 ++++++++++-----
.../nlpcraft/model/ctxword/NCContextWordSpec.scala | 97 +++++++++++++---------
.../model/ctxword/NCContextWordSpec2.scala | 72 ++++++++++++++++
5 files changed, 180 insertions(+), 61 deletions(-)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
index ed22935..613a7ce 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
@@ -53,7 +53,7 @@ class NCNlpSentence(
val text: String,
val enabledBuiltInToks: Set[String],
val mlConfig: Option[NCModelMLConfigMdo] = None,
- var mlData: Map[Int, Map[String, Double]] = Map.empty,
+ var mlData: Map[Int, Map[String, java.util.List[Double]]] = Map.empty,
override val tokens: mutable.ArrayBuffer[NCNlpSentenceToken] = new
mutable.ArrayBuffer[NCNlpSentenceToken](32),
var firstProbePhase: Boolean = true,
private val deletedNotes: mutable.HashMap[NCNlpSentenceNote,
Seq[NCNlpSentenceToken]] = mutable.HashMap.empty,
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index c3f5e3d..042e86e 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -29,7 +29,6 @@ import
org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager
import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeVariants,
NCTokenPartKey, NCProbeSynonym => Synonym}
import java.io.Serializable
-import java.lang
import java.util.{List => JList}
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
@@ -455,7 +454,7 @@ object NCModelEnricher extends NCProbeEnricher {
elem = mdl.elements.find(_._1 ==
elemId).getOrElse(throw new NCE(s"Element not found: $elemId"))._2,
toks = Seq(ns.tokens(tokIdx)),
direct = true,
- metaOpt = Some(Map("score" ->
score.asInstanceOf[AnyRef]))
+ metaOpt = Some(Map("scores" -> score))
)
val req = NCRequestImpl(senMeta, ns.srvReqId)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
index 16609b1..5b1b1c7 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
@@ -31,6 +31,7 @@ import org.jibx.schema.codegen.extend.DefaultNameConverter
import scala.collection.mutable
import scala.concurrent.Await
import scala.concurrent.duration.Duration
+import scala.jdk.CollectionConverters.SeqHasAsJava
/**
* ContextWord enricher.
@@ -38,23 +39,30 @@ import scala.concurrent.duration.Duration
object NCContextWordEnricher extends NCServerEnricher {
private final val MAX_CTXWORD_SCORE = 2
private final val EXCL_MIN_SCORE = -1.0
- private final val INCL_MIN_SCORE = 0.0
private final val INCL_MAX_SCORE = 1.0
private final val CONVERTER = new DefaultNameConverter
private case class ModelProbeKey(probeId: String, modelId: String)
- private case class ElementScore(elementId: String, averageScore: Double,
senScore: Double, sampleScore: Double)
+ private case class ElementScore(elementId: String, scores: Double*) {
lazy val maxScore: Double = scores.max // TODO: logic
+ override def toString: String = s"Element [id=$elementId,
scores=${scores.sortBy(p => -p).mkString("[", ",", "]")}]"
+ }
private case class ValuesHolder(
values: Map[/** Value as is */ String, /** Element ID */ Set[String]],
valuesStems: Map[/** Value's stem */ String, /** Element ID */
Set[String]]
- )
+ ) {
+ override def toString: String = s"Values [values=$values,
stems=$valuesStems]"
+ }
case class ScoreHolder(
normal: Map[/** Normal value */ String, /** Score */ Double],
stems: Map[/** Stem */ String, /** Score */ Double],
- lemma: Map[/** Lemma */ String, /** Score */ Double],
- )
+ lemma: Map[/** Lemma */ String, /** Score */ Double]
+ ) {
+ private def sort(m: Map[String, Double]): String =
m.toSeq.sortBy(-_._2).map({ case (k, v) => s"$k=$v" }).mkString(",")
+ override def toString: String = s"Score [normal=${sort(normal)},
stems=${sort(stems)}, lemma=${sort(lemma)}]"
+ }
@volatile private var valuesStems: mutable.HashMap[ModelProbeKey,
ValuesHolder] = _
@volatile private var samples: mutable.HashMap[ModelProbeKey, Map[/**
Element ID */String, ScoreHolder]] = _
@@ -75,6 +83,7 @@ object NCContextWordEnricher extends NCServerEnricher {
startScopedSpan("stop", parent) { _ =>
ackStopping()
+ // TODO: clear model cache
parser = null
samples = null
valuesStems = null
@@ -252,7 +261,7 @@ object NCContextWordEnricher extends NCServerEnricher {
val recs: Map[/** Element ID */String, Seq[NCSuggestionRequest]] =
(
for (
- (elemId, elemValues) <- cfg.values;
+ (elemId, elemValues) <- cfg.values.toSeq;
// Uses single words synonyms only.
elemValuesSyns =
elemValues.flatMap(_._2).toSet.filter(!_.contains(' '));
suggReq <- parseSample(
@@ -268,7 +277,7 @@ object NCContextWordEnricher extends NCServerEnricher {
yield (elemId, suggReq)
).
groupBy { case (elemId, _) => elemId }.
- map { case (elemId, m) => elemId -> m.toSeq.map(_._2) }
+ map { case (elemId, m) => elemId -> m.map(_._2) }
if (recs.nonEmpty) {
val resps =
syncExec(NCSuggestSynonymManager.suggestWords(recs.flatMap(_._2).toSeq))
@@ -312,15 +321,16 @@ object NCContextWordEnricher extends NCServerEnricher {
val detected = mutable.HashMap.empty[NCNlpSentenceToken,
mutable.HashSet[ElementScore]]
def add(
- nounTok: NCNlpSentenceToken, elemId: String, averageScore:
Double, senScore: Double, sampleScore: Double
+ nounTok: NCNlpSentenceToken, elemId: String, senScore:
Double, sampleScore: Double
): Unit = {
+ val maxScore = Math.max(senScore, sampleScore)
val tokElems = detected.getOrElseUpdate(nounTok,
mutable.HashSet.empty[ElementScore])
- def mkNew(): ElementScore = ElementScore(elemId,
averageScore, senScore, sampleScore)
+ def mkNew(): ElementScore = ElementScore(elemId, senScore,
sampleScore)
tokElems.find(_.elementId == elemId) match {
case Some(saved) =>
- if (averageScore > saved.averageScore) {
+ if (maxScore > saved.maxScore) {
tokElems -= saved
tokElems += mkNew()
}
@@ -336,6 +346,8 @@ object NCContextWordEnricher extends NCServerEnricher {
// 1. Values. Direct.
val valuesData = getValuesData(cfg, key)
+ //println("valuesData="+valuesData)
+
for (
nounTok <- nounToks;
elemId <-
@@ -343,11 +355,13 @@ object NCContextWordEnricher extends NCServerEnricher {
valuesData.values.getOrElse(nounTok.normText,
Set.empty) ++
valuesData.valuesStems.getOrElse(nounTok.stem,
Set.empty)
)
- add(nounTok, elemId, INCL_MAX_SCORE, INCL_MAX_SCORE,
INCL_MAX_SCORE)
+ add(nounTok, elemId, INCL_MAX_SCORE, INCL_MAX_SCORE)
// 2. Via examples.
val mdlSamples = getSamplesData(cfg, key)
+ //println("mdlSamples="+mdlSamples.mkString("\n"))
+
for (
nounTok <- nounToks;
(elemId, suggs) <- mdlSamples;
@@ -358,19 +372,31 @@ object NCContextWordEnricher extends NCServerEnricher {
).max
if score >= cfg.levels(elemId)
)
- add(nounTok, elemId, score, score, score)
+ add(nounTok, elemId, score, score)
// 3. Ask for sentence.
val idxs = ns.tokens.flatMap(p => if
(p.pos.startsWith("N")) Some(p.index) else None).toSeq
val reqs = idxs.map(idx =>
NCSuggestionRequest(ns.tokens.map(_.origText).toSeq, idx))
+ val resps =
+ syncExec(
+
NCSuggestSynonymManager.suggestWords(reqs)).flatMap { case (req, suggs) =>
suggs.map(_ -> req)
+ }
+
+// resps.toSeq.groupBy(_._2.index).foreach { case (_, seq)
=>
+// val sorted = seq.sortBy(-_._1.score)
+//
+// println("REQ=" + sorted.head._2)
+// println("Resps=" + sorted.map(_._1))
+// println()
+// }
+
+
+
for (
// Token index (tokIdx) should be correct because
request created from original words,
// separated by space, and Suggestion Manager uses
space tokenizer.
- (sugg, req) <-
- syncExec(
-
NCSuggestSynonymManager.suggestWords(reqs)).flatMap { case (req, suggs) =>
suggs.map(_ -> req)
- };
+ (sugg, req) <- resps;
senScore = normalizeScore(sugg.score);
(elemId, mdlSamplesSuggs) <- mdlSamples;
elemScore = cfg.levels(elemId);
@@ -379,15 +405,14 @@ object NCContextWordEnricher extends NCServerEnricher {
mdlSamplesSuggs.stems.getOrElse(stem(sugg.word), EXCL_MIN_SCORE),
mdlSamplesSuggs.normal.getOrElse(sugg.word.toLowerCase, EXCL_MIN_SCORE),
mdlSamplesSuggs.lemma.getOrElse(getSuggestionLemma(req, sugg), EXCL_MIN_SCORE)
- ).max;
- averageScore = (sampleScore + senScore) / 2
- if sampleScore >= INCL_MIN_SCORE && averageScore >=
elemScore
+ ).max
+ if sampleScore >= elemScore && senScore >= elemScore
// TODO: logic
)
- add(ns.tokens(req.index), elemId, averageScore,
senScore, sampleScore)
+ add(ns.tokens(req.index), elemId, senScore,
sampleScore)
}
ns.mlData = detected.map {
- case (tok, scores) => tok.index -> scores.map(p =>
p.elementId -> p.averageScore).toMap
+ case (tok, scores) => tok.index -> scores.map(p =>
p.elementId -> p.scores.asJava).toMap
}.toMap
println("detected="+detected.map(p => p._1.lemma -> p._2))
diff --git
a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec.scala
b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec.scala
index 9afcf41..736f5be 100644
---
a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec.scala
+++
b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec.scala
@@ -17,17 +17,34 @@
package org.apache.nlpcraft.model.ctxword
+import org.apache.nlpcraft.common.ascii.NCAsciiTable
import org.apache.nlpcraft.model.{NCElement, NCIntent, NCIntentMatch,
NCIntentSample, NCIntentTerm, NCModel, NCResult, NCToken, NCValue}
import org.apache.nlpcraft.{NCTestContext, NCTestEnvironment}
import org.junit.jupiter.api.{AfterEach, BeforeEach, Test}
+import java.text.DecimalFormat
import java.util.{Collections, Optional}
import java.{lang, util}
import scala.collection.mutable.ArrayBuffer
-import scala.jdk.CollectionConverters.{SeqHasAsJava, SetHasAsJava}
+import scala.jdk.CollectionConverters.{ListHasAsScala, SeqHasAsJava,
SetHasAsJava}
object NCContextWordSpecModel {
- private final val LEVEL = 0.4
+ val tables: ArrayBuffer[(String, NCAsciiTable)] =
ArrayBuffer.empty[(String, NCAsciiTable)]
+}
+
+import org.apache.nlpcraft.model.ctxword.NCContextWordSpecModel._
+
+/**
+ * Test model.
+ */
+class NCContextWordSpecModel extends NCModel {
+ override def getId: String = this.getClass.getSimpleName
+ override def getName: String = this.getClass.getSimpleName
+ override def getVersion: String = "1.0.0"
+
+ private final val FMT = new DecimalFormat("#0.00000")
+
+ val level = 0.4
case class Value(name: String, syns: String*) extends NCValue {
override def getName: String = name
@@ -42,22 +59,9 @@ object NCContextWordSpecModel {
}
object Element {
- def apply(id: String, values: NCValue*): Element = new Element(id,
LEVEL, values: _*)
+ def apply(id: String, values: NCValue*): Element = new Element(id,
level, values: _*)
}
- var testsData: ArrayBuffer[String] = ArrayBuffer.empty[String]
-}
-
-import NCContextWordSpecModel._
-
-/**
- * Test model.
- */
-class NCContextWordSpecModel extends NCModel {
- override def getId: String = this.getClass.getSimpleName
- override def getName: String = this.getClass.getSimpleName
- override def getVersion: String = "1.0.0"
-
override def getElements: util.Set[NCElement] =
Set(
Element("class:cars", Value("BMW")),
@@ -87,12 +91,24 @@ class NCContextWordSpecModel extends NCModel {
)
@NCIntent("intent=classification term(toks)~{has(tok_groups(),
'testGroup')}*")
def onMatch(ctx: NCIntentMatch, @NCIntentTerm("toks") toks:
List[NCToken]): NCResult = {
- val txt = ctx.getContext.getRequest.getNormalizedText
- val toksStr = toks.map(t =>
- s"[text=${t.getOriginalText}, elementId=${t.getId},
score=${t.getMetadata.get(s"${t.getId}:score")}]"
- ).mkString(", ")
+ val table = NCAsciiTable()
+
+ table #= ("Token text", "Element ID", "Scores")
- testsData += s"Matched [text=$txt, tokens=$toksStr"
+ for (t <- toks)
+ table += (
+ t.getOriginalText,
+ t.getId,
+ t.getMetadata.
+ get(s"${t.getId}:scores").
+ asInstanceOf[java.util.List[Double]].
+ asScala.
+ sortBy(-_).
+ map(FMT.format).
+ mkString(", ")
+ )
+
+ tables += ctx.getContext.getRequest.getNormalizedText -> table
val elemIds = toks.map(_.getId).distinct.mkString(" ")
val words = toks.map(_.getOriginalText).mkString(" ")
@@ -114,26 +130,33 @@ class NCContextWordSpec extends NCTestContext {
}
@BeforeEach
- private[ctxword] def before(): Unit = testsData.clear()
+ private[ctxword] def before(): Unit = tables.clear()
@AfterEach
- private[ctxword] def after(): Unit = testsData.foreach(println)
+ private[ctxword] def after(): Unit = {
+ println("MATCHED:")
+
+ for ((txt, table) <- tables) {
+ println(s"Text: $txt")
+ table.render()
+ }
+
+ tables.clear()
+ }
@Test
private[ctxword] def test(): Unit = {
- //check("I want to have a dogs and foxes", "class:animal", "dogs",
"foxes")
- //check("I bought dog's meat", "class:animal", "dog meat")
- check("I bought the meat", "class:animal", "dog meat")
-
-
- //check("I bought xxx dog's", "class:animal", "dog")
-// check("I want to have a dog and fox", "class:animal", "dog", "fox")
-// check("I fed your fish", "class:animal", "fish")
-//
-// check("I like to drive my Porsche and Volkswagen", "class:cars",
"Porsche", "Volkswagen")
-// check("Peugeot added motorcycles to its range in 1901",
"class:cars", "Peugeot", "motorcycles")
-//
-// check("The frost is possible today", "class:weather", "frost")
-// check("There's a very strong wind from the east now",
"class:weather", "wind")
+ check("I want to have dogs and foxes", "class:animal", "dogs", "foxes")
+ check("I bought dog's meat", "class:animal", "dog")
+ check("I bought meat dog's", "class:animal", "dog")
+
+ check("I want to have a dog and fox", "class:animal", "dog", "fox")
+ check("I fed your fish", "class:animal", "fish")
+
+ check("I like to drive my Porsche and Volkswagen", "class:cars",
"Porsche", "Volkswagen")
+ check("Peugeot added motorcycles to its range in 1901", "class:cars",
"Peugeot", "motorcycles")
+
+ check("The frost is possible today", "class:weather", "frost")
+ check("There's a very strong wind from the east now", "class:weather",
"wind")
}
}
diff --git
a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec2.scala
b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec2.scala
new file mode 100644
index 0000000..2a0f407
--- /dev/null
+++
b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec2.scala
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.model.ctxword
+
+import org.apache.nlpcraft.model.ctxword.NCContextWordSpecModel._
+import org.apache.nlpcraft.{NCTestContext, NCTestEnvironment}
+import org.junit.jupiter.api.{AfterEach, BeforeEach, Test}
+
+/**
+ * Test model.
+ */
+class NCContextWordSpecModel2 extends NCContextWordSpecModel {
+ override val level: Double = 0
+}
+
+/**
+ * @see NCContextWordSpecModel
+ */
+@NCTestEnvironment(model = classOf[NCContextWordSpecModel2], startClient =
true)
+class NCContextWordSpec2 extends NCTestContext {
+ private def check(txts: String*): Unit =
+ for (txt <- txts)
+ checkIntent(txt, "classification")
+
+ @BeforeEach
+ private[ctxword] def before(): Unit = tables.clear()
+
+ @AfterEach
+ private[ctxword] def after(): Unit = {
+ println("MATCHED:")
+
+ for ((txt, table) <- tables) {
+ println(s"Text: $txt")
+ table.render()
+ }
+
+ tables.clear()
+ }
+
+ @Test
+ private[ctxword] def test(): Unit = {
+ check(
+ "I want to have dogs and foxes",
+// "I bought dog's meat",
+// "I bought meat dog's",
+//
+// "I want to have a dog and fox",
+// "I fed your fish",
+//
+// "I like to drive my Porsche and Volkswagen",
+// "Peugeot added motorcycles to its range in 1901",
+//
+// "The frost is possible today",
+// "There's a very strong wind from the east now"
+ )
+ }
+}