This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-70_NEW
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-70_NEW by this push:
new 5aa45cd WIP.
5aa45cd is described below
commit 5aa45cdeb202de9ef7997cab647b53c64603dfed
Author: Sergey Kamov <[email protected]>
AuthorDate: Mon Jul 5 17:29:59 2021 +0300
WIP.
---
.../enrichers/ctxword/NCContextWordEnricher.scala | 175 +++++++++++----------
.../server/sugsyn/NCSuggestSynonymManager.scala | 8 +-
.../nlpcraft/model/ctxword/NCContextWordSpec.scala | 62 +++++---
3 files changed, 138 insertions(+), 107 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
index d7dcb22..6978967 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
@@ -51,8 +51,12 @@ object NCContextWordEnricher extends NCServerEnricher {
private final val FN = new DecimalFormat("#0.00000")
- private case class Score(score: Double, reason: String) {
- override def toString: String = s"${FN.format(score)}($reason)}"
+ private case class Score(score: Double, reason: Option[String] = None) {
+ override def toString: String =
+ reason match {
+ case Some(v) => s"${FN.format(score)}(via: '$v')}"
+ case None => s"${FN.format(score)}(direct)}"
+ }
}
private case class ModelProbeKey(probeId: String, modelId: String)
private case class ElementScore(elementId: String, scores: Score*) {
@@ -60,20 +64,16 @@ object NCContextWordEnricher extends NCServerEnricher {
s"Element [id=$elementId, scores=${scores.sortBy(p =>
-p.score).mkString("{ ", ", ", " }")}]"
}
+ // Key - word form (origin, stem). Value - Element IDs set.
+ type ElementsByKey = Map[/** Key */ String, /** Element ID */ Set[String]]
+
object ValuesHolder {
- def apply(
- normal: Map[/** Normal value */ String, /** Element ID */ Set[String]],
- stems: Map[/** Value's stem */ String, /** Element ID */ Set[String]]
- ): ValuesHolder = new ValuesHolder(
- normal,
- stems.filter(p => !normal.keySet.contains(p._1))
+ def apply(normal: ElementsByKey, stems: ElementsByKey): ValuesHolder = new ValuesHolder(
+ normal, stems.filter(p => !normal.keySet.contains(p._1))
)
}
- class ValuesHolder(
- val normal: Map[/** Normal value */ String, /** Element ID */ Set[String]],
- val stems: Map[/** Value's stem */ String, /** Element ID */ Set[String]]
- ) {
+ class ValuesHolder(val normal: ElementsByKey, val stems: ElementsByKey) {
private def map2Str(m: Map[String, Set[String]]): String =
m.toSeq.flatMap(p => p._2.toSeq.map(x => x -> p._1)).
groupBy(_._1).map(p => p._1 -> p._2.map(_._2).
@@ -82,31 +82,28 @@ object NCContextWordEnricher extends NCServerEnricher {
override def toString: String = s"Values [normal=${map2Str(normal)}, stems=${map2Str(stems)}]"
}
- object ScoreHolder {
- private final val EXCL_MIN_SCORE = -1.0
+ // Key - word form (origin, stem, lemma).
+ // Score lists extracted from suggestions for each example (direct or artificial).
+ type ScoreFactors = Map[String, Seq[Double]]
- def apply(normals: Map[String, Double], stems: Map[String, Double], lemmas: Map[String, Double]): ScoreHolder =
+ object ScoreHolder {
+ def apply(normals: ScoreFactors, stems: ScoreFactors, lemmas: ScoreFactors): ScoreHolder =
new ScoreHolder(normals, stems -- normals.keySet, lemmas -- normals.keySet -- stems.keySet)
}
- import ScoreHolder._
+ class ScoreHolder(normals: ScoreFactors, stems: ScoreFactors, lemmas: ScoreFactors) {
+ def get(m: ScoreFactors, key: String): Seq[Double] = m.getOrElse(key, Seq.empty)
- class ScoreHolder(
- normals: Map[/** Normal value */ String, /** Score */ Double],
- stems: Map[/** Stem */ String, /** Score */ Double],
- lemmas: Map[/** Lemma */ String, /** Score */ Double]
- ) {
- def get(m: Map[String, Double], key: String): Double = m.getOrElse(key, EXCL_MIN_SCORE)
+ def get(norm: String, stem: String, lemma: String): Seq[Double] =
+ get(normals, norm) ++ get(stems, stem) ++ get(lemmas, lemma)
- def get(norm: String, stem: String, lemma: String): Option[Double] =
- Seq(get(normals, norm), get(stems, stem), get(lemmas, lemma)).max match {
- case EXCL_MIN_SCORE => None
- case max => Some(max)
- }
+ private def sort(m: ScoreFactors): String =
+ m.toSeq.
+ sortBy(p => -p._2.max).map(
+ { case (k, factors) => s"$k=${factors.sortBy(-_).map(p => FN.format(p)).mkString("{ ", ", ", " }")}" }
+ ).mkString("{ ", ", ", " }")
- private def sort(m: Map[String, Double]): String =
- m.toSeq.sortBy(-_._2).map({ case (k, v) => s"$k=${FN.format(v)}" }).mkString(", ")
- override def toString: String = s"Score [normal: ${sort(normals)}, stems: ${sort(stems)}, lemma: ${sort(lemmas)}]"
+ override def toString: String = s"Score: ${sort(normals)}"
}
@volatile private var valuesStems: mutable.HashMap[ModelProbeKey, ValuesHolder] = _
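For reference, the key-precedence rule used by the ValuesHolder and ScoreHolder factories in the hunk above (stem keys that duplicate normal keys are dropped, and lemma keys that duplicate either are dropped) can be sketched in isolation. The snippet below is illustrative only and is not part of the patch; the ScoreFactors alias mirrors the one introduced above, everything else is a stand-in:

    // Standalone sketch (not part of the patch): duplicate keys are removed in
    // normal -> stem -> lemma precedence, mirroring ScoreHolder.apply above.
    object KeyPrecedenceSketch extends App {
        type ScoreFactors = Map[String, Seq[Double]]

        def dedup(normals: ScoreFactors, stems: ScoreFactors, lemmas: ScoreFactors): (ScoreFactors, ScoreFactors, ScoreFactors) =
            (normals, stems -- normals.keySet, lemmas -- normals.keySet -- stems.keySet)

        val (n, s, l) = dedup(
            normals = Map("bmw" -> Seq(0.9)),
            stems = Map("bmw" -> Seq(0.7), "car" -> Seq(0.5)),
            lemmas = Map("car" -> Seq(0.4), "engine" -> Seq(0.3))
        )

        // Prints: Map(bmw -> List(0.9)) | Map(car -> List(0.5)) | Map(engine -> List(0.3))
        println(s"$n | $s | $l")
    }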
@@ -335,7 +332,7 @@ object NCContextWordEnricher extends NCServerEnricher {
for ((req, resp) <- resps) {
t += (
req,
- s"${resp.sortBy(-_.score).map(p => s"${p.word}=${FN.format(normalize(p.score))}").mkString(", ")}"
+ s"${resp.map(p => s"${p.word}=${FN.format(normalize(p.score))}").mkString(", ")}"
)
}
@@ -346,19 +343,28 @@ object NCContextWordEnricher extends NCServerEnricher {
val req2Elem = recs.flatMap { case (elemId, recs) => recs.map(p => p -> elemId) }
- def mkMap(convert: (NCSuggestionRequest, NCWordSuggestion) => String): Map[String, Map[String, Double]] =
- respsSeq.
+ def mkMap(convert: (NCSuggestionRequest, NCWordSuggestion) => String) = {
+ val seq: Seq[(String, Map[String, Seq[Double]])] = respsSeq.
map { case (req, suggs) =>
(
req2Elem(req),
suggs.groupBy(sygg => convert(req, sygg)).
- map { case (conv, suggs) => conv -> normalize(suggs.map(_.score).max) }
- )
- }.
+ map { case (key, suggs) => key -> suggs.map(p => normalize(p.score)) }
+ )
+ }
+
+ seq.
groupBy { case (elemId, _) => elemId }.
- map { case (elemId, data) => elemId -> data.flatMap(_._2).toMap }
+ map { case (elemId, data) => elemId -> {
+ val factors: Seq[(String, Seq[Double])] = data.flatMap(_._2)
+
+ factors.
+ groupBy{ case (word, _) => word }.
+ map { case (word, factors) => word -> factors.flatMap { case (_, factor) => factor } }
+ } }
+ }
- val normalMap = mkMap { (_, sugg ) => sugg.word.toLowerCase }
+ val normalMap: Map[String, Map[String, Seq[Double]]] = mkMap { (_, sugg ) => sugg.word.toLowerCase }
val stemMap = mkMap { (_, sugg ) => stem(sugg.word) }
val lemmaMap = mkMap { (req, sugg ) => getSuggestionLemma(req, sugg) }
@@ -381,28 +387,30 @@ object NCContextWordEnricher extends NCServerEnricher {
* @param scores
* @return
*/
- private def isMatched(elemScore: NCContextWordElementConfig, scores: Double*): Boolean = {
- require(scores.nonEmpty)
-
- import NCContextWordElementConfig.NCContextWordElementPolicy._
-
- val policy = elemScore.getPolicy
- val elemScoreVal = elemScore.getScore
-
- policy match {
- case MEDIAN =>
- val sorted = scores.sorted
- val mid = sorted.length / 2
- val median = if (sorted.length % 2 == 0) (sorted(mid) + sorted(mid - 1)) / 2 else sorted(mid)
-
- median >= elemScoreVal
- case ALL => scores.forall(_ >= elemScoreVal)
- case AVERAGE => scores.sum / scores.size >= elemScoreVal
- case ANY => scores.exists(_ >= elemScoreVal)
-
- case _ => throw new AssertionError(s"Unexpected policy: $policy")
+ private def isMatched(elemScore: NCContextWordElementConfig, scores: Double*): Boolean =
+ if (scores.nonEmpty) {
+ import NCContextWordElementConfig.NCContextWordElementPolicy._
+
+ val policy = elemScore.getPolicy
+ val elemScoreVal = elemScore.getScore
+
+ policy match {
+ case MEDIAN =>
+ val sorted = scores.sorted
+ val mid = sorted.length / 2
+ val median = if (sorted.length % 2 == 0) (sorted(mid) + sorted(mid - 1)) / 2
+ else sorted(mid)
+
+ median >= elemScoreVal
+ case ALL => scores.forall(_ >= elemScoreVal)
+ case AVERAGE => scores.sum / scores.size >= elemScoreVal
+ case ANY => scores.exists(_ >= elemScoreVal)
+
+ case _ => throw new AssertionError(s"Unexpected policy: $policy")
+ }
}
- }
+ else
+ false
override def enrich(ns: NCNlpSentence, parent: Span): Unit =
startScopedSpan("stop", parent) { _ =>
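For reference, the policy check that the reworked isMatched above performs (MEDIAN / ALL / AVERAGE / ANY against the element's configured score, returning false for an empty score list) can be sketched standalone. The Policy trait and threshold parameter below are simplified stand-ins for NCContextWordElementConfig and its policy enum, used only for illustration:

    // Standalone sketch of the matching policies; Policy is a stand-in for
    // NCContextWordElementConfig.NCContextWordElementPolicy.
    object PolicySketch extends App {
        sealed trait Policy
        case object MEDIAN extends Policy
        case object ALL extends Policy
        case object AVERAGE extends Policy
        case object ANY extends Policy

        def isMatched(policy: Policy, threshold: Double, scores: Double*): Boolean =
            if (scores.isEmpty)
                false
            else
                policy match {
                    case MEDIAN =>
                        val sorted = scores.sorted
                        val mid = sorted.length / 2
                        val median = if (sorted.length % 2 == 0) (sorted(mid) + sorted(mid - 1)) / 2 else sorted(mid)

                        median >= threshold
                    case ALL => scores.forall(_ >= threshold)
                    case AVERAGE => scores.sum / scores.size >= threshold
                    case ANY => scores.exists(_ >= threshold)
                }

        // Median of (0.2, 0.5, 0.9) is 0.5, so MEDIAN passes a 0.4 threshold while ALL does not.
        println(s"${isMatched(MEDIAN, 0.4, 0.2, 0.5, 0.9)} ${isMatched(ALL, 0.4, 0.2, 0.5, 0.9)}")
    }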
@@ -430,19 +438,21 @@ object NCContextWordEnricher extends NCServerEnricher {
val key = ModelProbeKey(cfg.probeId, cfg.modelId)
// 1. Values. Direct.
- val valuesData = getValuesData(cfg, key)
+ val valsData = getValuesData(cfg, key)
if (DEBUG_MODE)
- logger.info(s"Values loaded [key=$key, data=$valuesData]")
+ logger.info(s"Values loaded [key=$key, data=$valsData]")
+
+ def get(m: Map[String, Set[String]], key: String): Set[String] = m.getOrElse(key, Set.empty)
for (
nounTok <- nounToks;
elemId <-
- valuesData.normal.getOrElse(nounTok.normText, Set.empty) ++
- valuesData.normal.getOrElse(nounTok.lemma.toLowerCase, Set.empty) ++
- valuesData.stems.getOrElse(nounTok.stem, Set.empty)
+ get(valsData.normal, nounTok.normText) ++
+ get(valsData.normal, nounTok.lemma.toLowerCase) ++
+ get(valsData.stems, nounTok.stem)
)
- add(nounTok, elemId, Score(INCL_MAX_SCORE, nounTok.normText))
+ add(nounTok, elemId, Score(INCL_MAX_SCORE))
// 2. Via examples.
val mdlCorpusData: Map[String, ScoreHolder] = getCorpusData(cfg, key, parent)
@@ -450,10 +460,10 @@ object NCContextWordEnricher extends NCServerEnricher {
if (DEBUG_MODE) {
val t = NCAsciiTable()
- t #= ("Element", "Scores")
+ t #= ("Element", "Detailed")
- for ((elemId, scoreHolder) <- mdlCorpusData)
- t += (elemId, scoreHolder)
+ for ((elemId, sh) <- mdlCorpusData)
+ t += (elemId, sh)
t.info(logger, Some(s"Model corpus processed [key=$key]"))
}
@@ -461,17 +471,17 @@ object NCContextWordEnricher extends NCServerEnricher {
for (
nounTok <- nounToks;
(elemId, suggs) <- mdlCorpusData;
- scoreOpt = suggs.get(nounTok.normText, nounTok.stem, nounTok.lemma)
- if scoreOpt.isDefined && isMatched(cfg.elements(elemId), scoreOpt.get)
+ scores = suggs.get(nounTok.normText, nounTok.stem, nounTok.lemma)
+ if isMatched(cfg.elements(elemId), scores :_*);
+ score <- scores
)
- add(nounTok, elemId, Score(scoreOpt.get, nounTok.normText))
+ add(nounTok, elemId, Score(score))
// 3. Ask for sentence.
- val idxs = ns.tokens.flatMap(p => if (p.pos.startsWith("N")) Some(p.index)
- else None).toSeq
+ val idxs = ns.tokens.flatMap(p => if (p.pos.startsWith("N")) Some(p.index) else None).toSeq
val reqs = idxs.map(idx => NCSuggestionRequest(ns.tokens.map(_.origText).toSeq, idx))
- val resps =
+ val resps: Map[NCWordSuggestion, NCSuggestionRequest] =
syncExec(NCSuggestSynonymManager.suggestWords(reqs, parent = parent)).
flatMap { case (req, suggs) => suggs.map(_ -> req)
@@ -502,14 +512,19 @@ object NCContextWordEnricher extends NCServerEnricher {
(sugg, req) <- resps;
senScore = normalize(sugg.score);
(elemId, mdlCorpusSuggs) <- mdlCorpusData;
- elemScore = cfg.elements(elemId);
- corpusScoreOpt =
+ elemCfg = cfg.elements(elemId);
+ corpusScores =
mdlCorpusSuggs.get(
sugg.word.toLowerCase, stem(sugg.word), getSuggestionLemma(req, sugg)
)
- if corpusScoreOpt.isDefined && isMatched(elemScore, corpusScoreOpt.get, senScore)
- )
- add(ns.tokens(req.index), elemId, Score(senScore, sugg.word), Score(corpusScoreOpt.get, sugg.word))
+ // TODO:
+ if isMatched(elemCfg, senScore) && isMatched(elemCfg, corpusScores :_*)
+ ) {
+ add(ns.tokens(req.index), elemId, Score(senScore, Some(sugg.word)))
+//
+// for (corpusScore <- corpusScores)
+// add(ns.tokens(req.index), elemId, Score(corpusScore, Some(sugg.word)))
+ }
}
ns.ctxWordData = detected.map {
@@ -520,7 +535,7 @@ object NCContextWordEnricher extends NCServerEnricher {
logger.info("Sentence detected elements:")
for ((tok, elems) <- detected)
- logger.info(s"${tok.origText}: ${elems.mkString(", ")}")
+ logger.info(s"${tok.origText}: ${elems.sortBy(-_.scores.map(_.score).max).mkString(", ")}")
}
case None => // No-op.
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/sugsyn/NCSuggestSynonymManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/sugsyn/NCSuggestSynonymManager.scala
index 0b30012..c2c8d23 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/sugsyn/NCSuggestSynonymManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/sugsyn/NCSuggestSynonymManager.scala
@@ -78,7 +78,7 @@ object NCSuggestSynonymManager extends NCService {
case 200 =>
val data: util.List[util.List[NCWordSuggestion]] = GSON.fromJson(js, TYPE_RESP)
- data.asScala.map(p => if (p.isEmpty) Seq.empty else p.asScala.tail.toSeq).toSeq
+ data.asScala.map(p => if (p.isEmpty) Seq.empty else p.asScala.toSeq).toSeq
case _ =>
throw new NCE(
@@ -502,8 +502,10 @@ object NCSuggestSynonymManager extends NCService {
if (cnt.incrementAndGet() == batches.size) {
val min = minScoreOpt.getOrElse(DFLT_MIN_SCORE)
- val map: Map[NCSuggestionRequest, Seq[NCWordSuggestion]] = data.asScala.groupBy(_.request).map { case (req, map) =>
- req -> map.flatMap(_.suggestions.filter(_.score >= min).toSeq)
+ val map: Map[NCSuggestionRequest, Seq[NCWordSuggestion]] =
+ data.asScala.groupBy(_.request).map {
+ case (req, ress) =>
+ req -> ress.flatMap(_.suggestions.filter(_.score >= min).toSeq).sortBy(-_.score)
}
// TODO ? logic?
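For reference, the reshaped aggregation above (group suggestion batches by request, keep only suggestions at or above the minimum score, and sort each request's suggestions by descending score) can be sketched standalone. The Req/Sugg/Batch case classes below are simplified stand-ins for NCSuggestionRequest, NCWordSuggestion and the internal batch type, used only for illustration:

    // Standalone sketch of the group/filter/sort step; types are simplified stand-ins.
    object SuggestionGroupingSketch extends App {
        case class Req(txt: String)
        case class Sugg(word: String, score: Double)
        case class Batch(request: Req, suggestions: Seq[Sugg])

        val min = 0.5
        val data = Seq(
            Batch(Req("a"), Seq(Sugg("x", 0.3), Sugg("y", 0.9))),
            Batch(Req("a"), Seq(Sugg("z", 0.7))),
            Batch(Req("b"), Seq(Sugg("w", 0.6)))
        )

        val map: Map[Req, Seq[Sugg]] =
            data.groupBy(_.request).map {
                case (req, ress) =>
                    req -> ress.flatMap(_.suggestions.filter(_.score >= min)).sortBy(-_.score)
            }

        // Entry order may vary; each request maps to its suggestions sorted by descending score,
        // e.g. Req(a) -> List(Sugg(y,0.9), Sugg(z,0.7)) and Req(b) -> List(Sugg(w,0.6)).
        println(map)
    }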
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec.scala
index a8ac6c3..5a902f9 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec.scala
@@ -25,6 +25,7 @@ import org.junit.jupiter.api.Test
import java.util
import java.util.{Collections, Optional}
+import scala.collection.mutable.ArrayBuffer
import scala.jdk.CollectionConverters.{CollectionHasAsScala, MapHasAsJava, SeqHasAsJava, SetHasAsJava}
object NCContextWordSpecModel {
@@ -49,8 +50,8 @@ class NCContextWordSpecModel extends NCModel {
override def getName: String = this.getClass.getSimpleName
override def getVersion: String = "1.0.0"
- val MDL_LEVEL = 0.0
- val MDL_POLICY = ALL
+ val MDL_LEVEL = 0.4
+ val MDL_POLICY = AVERAGE
override def getContextWordModelConfig: Optional[NCContextWordModelConfig] = {
Optional.of(
@@ -68,18 +69,18 @@ class NCContextWordSpecModel extends NCModel {
override def getCorpus: util.List[String] =
Seq(
-// "I like drive my new BMW",
-// "BMW has the best engine",
-// "Luxury cars like Mercedes and BMW are prime targets",
-// "BMW will install side air bags up front",
+ "I like drive my new BMW",
+ "BMW has the best engine",
+ "Luxury cars like Mercedes and BMW are prime targets",
+ "BMW will install side air bags up front",
"A wild cat is very dangerous",
"A fox eats hens",
"The fox was already in your chicken house",
-// "What is the local temperature?",
-// "This is the first day of heavy rain",
-// "It is the beautiful day, the sun is shining"
+ "What is the local temperature?",
+ "This is the first day of heavy rain",
+ "It is the beautiful day, the sun is shining"
).asJava
}
)
@@ -87,9 +88,9 @@ class NCContextWordSpecModel extends NCModel {
override def getElements: util.Set[NCElement] =
Set(
- //Element("class:cars", MDL_LEVEL, Value("BMW")),
+ Element("class:cars", MDL_LEVEL, Value("BMW")),
Element("class:animal", MDL_LEVEL, Value("fox"), Value("cat",
"tomcat")),
- //Element("class:weather", MDL_LEVEL, Value("temperature"),
Value("rain"), Value("sun"))
+ Element("class:weather", MDL_LEVEL, Value("temperature"),
Value("rain"), Value("sun"))
).map(p => {
val e: NCElement = p
@@ -97,6 +98,8 @@ class NCContextWordSpecModel extends NCModel {
}).asJava
override def onContext(ctx: NCContext): NCResult = {
+ val varRes = ArrayBuffer.empty[String]
+
val ok =
ctx.getVariants.asScala.exists(v => {
val testGroupToks = v.asScala.toSeq.filter(_.getGroups.contains("testGroup"))
@@ -104,10 +107,19 @@ class NCContextWordSpecModel extends NCModel {
val elemIds = testGroupToks.map(_.getId).distinct.mkString(" ")
val words = testGroupToks.map(_.getOriginalText).mkString(" ")
+ val res = s"$elemIds $words"
+
+ varRes += res
+
NCContextWordSpecModel.expected == s"$elemIds $words"
})
- NCResult.text(if (ok) "OK" else "ERROR")
+ NCResult.text(
+ if (ok)
+ "OK"
+ else
+ s"ERROR: variant '${NCContextWordSpecModel.expected}' not
found. Found: ${varRes.mkString(", ")}"
+ )
}
}
@@ -119,22 +131,24 @@ class NCContextWordSpec extends NCTestContext {
private def check(txt: String, elemId: String, words: String*): Unit = {
NCContextWordSpecModel.expected = s"$elemId ${words.mkString(" ")}"
- require(getClient.ask(txt).getResult.get() == "OK")
+ val res = getClient.ask(txt).getResult.get()
+
+ require(res == "OK", s"Unexpected: $res")
}
@Test
private[ctxword] def test(): Unit = {
- //check("I want to have dogs and foxes", "class:animal", "dogs", "foxes")
+ check("I want to have dogs and foxes", "class:animal", "dogs", "foxes")
check("I bought dog's meat", "class:animal", "dog")
-// check("I bought meat dog's", "class:animal", "dog")
-//
-// check("I want to have a dog and fox", "class:animal", "dog", "fox")
-// check("I fed your fish", "class:animal", "fish")
-//
-// check("I like to drive my Porsche and Volkswagen", "class:cars", "Porsche", "Volkswagen")
-// check("Peugeot added motorcycles to its range in 1901", "class:cars", "Peugeot", "motorcycles")
-//
-// check("The frost is possible today", "class:weather", "frost")
-// check("There's a very strong wind from the east now", "class:weather", "wind")
+ check("I bought meat dog's", "class:animal", "dog")
+
+ check("I want to have a dog and fox", "class:animal", "dog", "fox")
+ check("I fed your fish", "class:animal", "fish")
+
+ check("I like to drive my Porsche and Volkswagen", "class:cars", "Porsche", "Volkswagen")
+ check("Peugeot added motorcycles to its range in 1901", "class:cars", "Peugeot", "motorcycles")
+
+ check("The frost is possible today", "class:weather", "frost")
+ check("There's a very strong wind from the east now", "class:weather", "wind")
}
}