This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-70_NEW
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-70_NEW by this push:
new 5aa45cd WIP.
5aa45cd is described below
commit 5aa45cdeb202de9ef7997cab647b53c64603dfed
Author: Sergey Kamov <[email protected]>
AuthorDate: Mon Jul 5 17:29:59 2021 +0300
WIP.
---
.../enrichers/ctxword/NCContextWordEnricher.scala | 175 +++++++++++----------
.../server/sugsyn/NCSuggestSynonymManager.scala | 8 +-
.../nlpcraft/model/ctxword/NCContextWordSpec.scala | 62 +++++---
3 files changed, 138 insertions(+), 107 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
index d7dcb22..6978967 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
@@ -51,8 +51,12 @@ object NCContextWordEnricher extends NCServerEnricher {
private final val FN = new DecimalFormat("#0.00000")
- private case class Score(score: Double, reason: String) {
- override def toString: String = s"${FN.format(score)}($reason)}"
+ private case class Score(score: Double, reason: Option[String] = None) {
+ override def toString: String =
+ reason match {
+ case Some(v) => s"${FN.format(score)}(via: '$v')}"
+ case None => s"${FN.format(score)}(direct)}"
+ }
}
private case class ModelProbeKey(probeId: String, modelId: String)
private case class ElementScore(elementId: String, scores: Score*) {
@@ -60,20 +64,16 @@ object NCContextWordEnricher extends NCServerEnricher {
s"Element [id=$elementId, scores=${scores.sortBy(p =>
-p.score).mkString("{ ", ", ", " }")}]"
}
+ // Key - word form (origin, stem). Value - Element IDs set.
+ type ElementsByKey = Map[/** Key */ String, /** Element ID */ Set[String]]
+
object ValuesHolder {
- def apply(
- normal: Map[/** Normal value */ String, /** Element ID */ Set[String]],
- stems: Map[/** Value's stem */ String, /** Element ID */ Set[String]]
- ): ValuesHolder = new ValuesHolder(
- normal,
- stems.filter(p => !normal.keySet.contains(p._1))
+ def apply(normal: ElementsByKey, stems: ElementsByKey): ValuesHolder = new ValuesHolder(
+ normal, stems.filter(p => !normal.keySet.contains(p._1))
)
}
- class ValuesHolder(
- val normal: Map[/** Normal value */ String, /** Element ID */ Set[String]],
- val stems: Map[/** Value's stem */ String, /** Element ID */ Set[String]]
- ) {
+ class ValuesHolder(val normal: ElementsByKey, val stems: ElementsByKey) {
private def map2Str(m: Map[String, Set[String]]): String =
m.toSeq.flatMap(p => p._2.toSeq.map(x => x -> p._1)).
groupBy(_._1).map(p => p._1 -> p._2.map(_._2).
@@ -82,31 +82,28 @@ object NCContextWordEnricher extends NCServerEnricher {
override def toString: String = s"Values [normal=${map2Str(normal)}, stems=${map2Str(stems)}]"
}
- object ScoreHolder {
- private final val EXCL_MIN_SCORE = -1.0
+ // Key - word form (origin, stem, lemma).
+ // Score lists extracted from suggestions for each example (direct or artificial).
+ type ScoreFactors = Map[String, Seq[Double]]
- def apply(normals: Map[String, Double], stems: Map[String, Double], lemmas: Map[String, Double]): ScoreHolder =
+ object ScoreHolder {
+ def apply(normals: ScoreFactors, stems: ScoreFactors, lemmas: ScoreFactors): ScoreHolder =
new ScoreHolder(normals, stems -- normals.keySet, lemmas -- normals.keySet -- stems.keySet)
}
- import ScoreHolder._
+ class ScoreHolder(normals: ScoreFactors, stems: ScoreFactors, lemmas: ScoreFactors) {
+ def get(m: ScoreFactors, key: String): Seq[Double] = m.getOrElse(key, Seq.empty)
- class ScoreHolder(
- normals: Map[/** Normal value */ String, /** Score */ Double],
- stems: Map[/** Stem */ String, /** Score */ Double],
- lemmas: Map[/** Lemma */ String, /** Score */ Double]
- ) {
- def get(m: Map[String, Double], key: String): Double = m.getOrElse(key, EXCL_MIN_SCORE)
+ def get(norm: String, stem: String, lemma: String): Seq[Double] =
+ get(normals, norm) ++ get(stems, stem) ++ get(lemmas, lemma)
- def get(norm: String, stem: String, lemma: String): Option[Double] =
- Seq(get(normals, norm), get(stems, stem), get(lemmas, lemma)).max match {
- case EXCL_MIN_SCORE => None
- case max => Some(max)
- }
+ private def sort(m: ScoreFactors): String =
+ m.toSeq.
+ sortBy(p => -p._2.max).map(
+ { case (k, factors) => s"$k=${factors.sortBy(-_).map(p => FN.format(p)).mkString("{ ", ", ", " }")}" }
+ ).mkString("{ ", ", ", " }")
- private def sort(m: Map[String, Double]): String =
- m.toSeq.sortBy(-_._2).map({ case (k, v) => s"$k=${FN.format(v)}" }).mkString(", ")
- override def toString: String = s"Score [normal: ${sort(normals)}, stems: ${sort(stems)}, lemma: ${sort(lemmas)}]"
+ override def toString: String = s"Score: ${sort(normals)}"
}
@volatile private var valuesStems: mutable.HashMap[ModelProbeKey, ValuesHolder] = _
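For reference, the key-precedence rule used by the ValuesHolder and ScoreHolder factories in the hunk above (stem keys that duplicate normal keys are dropped, and lemma keys that duplicate either are dropped) can be sketched in isolation. The snippet below is illustrative only and is not part of the patch; the ScoreFactors alias mirrors the one introduced above, everything else is a stand-in:

    // Standalone sketch (not part of the patch): duplicate keys are removed in
    // normal -> stem -> lemma precedence, mirroring ScoreHolder.apply above.
    object KeyPrecedenceSketch extends App {
        type ScoreFactors = Map[String, Seq[Double]]

        def dedup(normals: ScoreFactors, stems: ScoreFactors, lemmas: ScoreFactors): (ScoreFactors, ScoreFactors, ScoreFactors) =
            (normals, stems -- normals.keySet, lemmas -- normals.keySet -- stems.keySet)

        val (n, s, l) = dedup(
            normals = Map("bmw" -> Seq(0.9)),
            stems = Map("bmw" -> Seq(0.7), "car" -> Seq(0.5)),
            lemmas = Map("car" -> Seq(0.4), "engine" -> Seq(0.3))
        )

        // Prints: Map(bmw -> List(0.9)) | Map(car -> List(0.5)) | Map(engine -> List(0.3))
        println(s"$n | $s | $l")
    }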
@@ -335,7 +332,7 @@ object NCContextWordEnricher extends NCServerEnricher {
for ((req, resp) <- resps) {
t += (
req,
- s"${resp.sortBy(-_.score).map(p => s"${p.word}=${FN.format(normalize(p.score))}").mkString(", ")}"
+ s"${resp.map(p => s"${p.word}=${FN.format(normalize(p.score))}").mkString(", ")}"
)
}
@@ -346,19 +343,28 @@ object NCContextWordEnricher extends NCServerEnricher {
val req2Elem = recs.flatMap { case (elemId, recs) => recs.map(p => p -> elemId) }
- def mkMap(convert: (NCSuggestionRequest, NCWordSuggestion) => String): Map[String, Map[String, Double]] =
- respsSeq.
+ def mkMap(convert: (NCSuggestionRequest, NCWordSuggestion) => String) = {
+ val seq: Seq[(String, Map[String, Seq[Double]])] = respsSeq.
map { case (req, suggs) =>
(
req2Elem(req),
suggs.groupBy(sygg => convert(req, sygg)).
- map { case (conv, suggs) => conv -> normalize(suggs.map(_.score).max) }
- )
- }.
+ map { case (key, suggs) => key -> suggs.map(p => normalize(p.score)) }
+ )
+ }
+
+ seq.
groupBy { case (elemId, _) => elemId }.
- map { case (elemId, data) => elemId -> data.flatMap(_._2).toMap }
+ map { case (elemId, data) => elemId -> {
+ val factors: Seq[(String, Seq[Double])] = data.flatMap(_._2)
+
+ factors.
+ groupBy{ case (word, _) => word }.
+ map { case (word, factors) => word -> factors.flatMap { case (_, factor) => factor } }
+ } }
+ }
- val normalMap = mkMap { (_, sugg ) => sugg.word.toLowerCase }
+ val normalMap: Map[String, Map[String, Seq[Double]]] = mkMap { (_, sugg ) => sugg.word.toLowerCase }
val stemMap = mkMap { (_, sugg ) => stem(sugg.word) }
val lemmaMap = mkMap { (req, sugg ) => getSuggestionLemma(req, sugg) }
@@ -381,28 +387,30 @@ object NCContextWordEnricher extends NCServerEnricher {
* @param scores
* @return
*/
- private def isMatched(elemScore: NCContextWordElementConfig, scores: Double*): Boolean = {
- require(scores.nonEmpty)
-
- import NCContextWordElementConfig.NCContextWordElementPolicy._
-
- val policy = elemScore.getPolicy
- val elemScoreVal = elemScore.getScore
-
- policy match {
- case MEDIAN =>
- val sorted = scores.sorted
- val mid = sorted.length / 2
- val median = if (sorted.length % 2 == 0) (sorted(mid) + sorted(mid - 1)) / 2 else sorted(mid)
-
- median >= elemScoreVal
- case ALL => scores.forall(_ >= elemScoreVal)
- case AVERAGE => scores.sum / scores.size >= elemScoreVal
- case ANY => scores.exists(_ >= elemScoreVal)
-
- case _ => throw new AssertionError(s"Unexpected policy: $policy")
+ private def isMatched(elemScore: NCContextWordElementConfig, scores: Double*): Boolean =
+ if (scores.nonEmpty) {
+ import NCContextWordElementConfig.NCContextWordElementPolicy._
+
+ val policy = elemScore.getPolicy
+ val elemScoreVal = elemScore.getScore
+
+ policy match {
+ case MEDIAN =>
+ val sorted = scores.sorted
+ val mid = sorted.length / 2
+ val median = if (sorted.length % 2 == 0) (sorted(mid) + sorted(mid - 1)) / 2
+ else sorted(mid)
+
+ median >= elemScoreVal
+ case ALL => scores.forall(_ >= elemScoreVal)
+ case AVERAGE => scores.sum / scores.size >= elemScoreVal
+ case ANY => scores.exists(_ >= elemScoreVal)
+
+ case _ => throw new AssertionError(s"Unexpected policy: $policy")
+ }
}
- }
+ else
+ false
override def enrich(ns: NCNlpSentence, parent: Span): Unit =
startScopedSpan("stop", parent) { _ =>
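For reference, the policy check that the reworked isMatched above performs (MEDIAN / ALL / AVERAGE / ANY against the element's configured score, returning false for an empty score list) can be sketched standalone. The Policy trait and threshold parameter below are simplified stand-ins for NCContextWordElementConfig and its policy enum, used only for illustration:

    // Standalone sketch of the matching policies; Policy is a stand-in for
    // NCContextWordElementConfig.NCContextWordElementPolicy.
    object PolicySketch extends App {
        sealed trait Policy
        case object MEDIAN extends Policy
        case object ALL extends Policy
        case object AVERAGE extends Policy
        case object ANY extends Policy

        def isMatched(policy: Policy, threshold: Double, scores: Double*): Boolean =
            if (scores.isEmpty)
                false
            else
                policy match {
                    case MEDIAN =>
                        val sorted = scores.sorted
                        val mid = sorted.length / 2
                        val median = if (sorted.length % 2 == 0) (sorted(mid) + sorted(mid - 1)) / 2 else sorted(mid)

                        median >= threshold
                    case ALL => scores.forall(_ >= threshold)
                    case AVERAGE => scores.sum / scores.size >= threshold
                    case ANY => scores.exists(_ >= threshold)
                }

        // Median of (0.2, 0.5, 0.9) is 0.5, so MEDIAN passes a 0.4 threshold while ALL does not.
        println(s"${isMatched(MEDIAN, 0.4, 0.2, 0.5, 0.9)} ${isMatched(ALL, 0.4, 0.2, 0.5, 0.9)}")
    }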
@@ -430,19 +438,21 @@ object NCContextWordEnricher extends NCServerEnricher {
val key = ModelProbeKey(cfg.probeId, cfg.modelId)
// 1. Values. Direct.
- val valuesData = getValuesData(cfg, key)
+ val valsData = getValuesData(cfg, key)
if (DEBUG_MODE)
- logger.info(s"Values loaded [key=$key, data=$valuesData]")
+ logger.info(s"Values loaded [key=$key, data=$valsData]")
+
+ def get(m: Map[String, Set[String]], key: String): Set[String] = m.getOrElse(key, Set.empty)
for (
nounTok <- nounToks;
elemId <-
- valuesData.normal.getOrElse(nounTok.normText, Set.empty) ++
- valuesData.normal.getOrElse(nounTok.lemma.toLowerCase, Set.empty) ++
- valuesData.stems.getOrElse(nounTok.stem, Set.empty)
+ get(valsData.normal, nounTok.normText) ++
+ get(valsData.normal, nounTok.lemma.toLowerCase) ++
+ get(valsData.stems, nounTok.stem)
)
- add(nounTok, elemId, Score(INCL_MAX_SCORE, nounTok.normText))
+ add(nounTok, elemId, Score(INCL_MAX_SCORE))
// 2. Via examples.
val mdlCorpusData: Map[String, ScoreHolder] = getCorpusData(cfg, key, parent)
@@ -450,10 +460,10 @@ object NCContextWordEnricher extends NCServerEnricher {
if (DEBUG_MODE) {
val t = NCAsciiTable()
- t #= ("Element", "Scores")
+ t #= ("Element", "Detailed")
- for ((elemId, scoreHolder) <- mdlCorpusData)
- t += (elemId, scoreHolder)
+ for ((elemId, sh) <- mdlCorpusData)
+ t += (elemId, sh)
t.info(logger, Some(s"Model corpus processed [key=$key]"))
}
@@ -461,17 +471,17 @@ object NCContextWordEnricher extends NCServerEnricher {
for (
nounTok <- nounToks;
(elemId, suggs) <- mdlCorpusData;
- scoreOpt = suggs.get(nounTok.normText, nounTok.stem, nounTok.lemma)
- if scoreOpt.isDefined && isMatched(cfg.elements(elemId), scoreOpt.get)
+ scores = suggs.get(nounTok.normText, nounTok.stem, nounTok.lemma)
+ if isMatched(cfg.elements(elemId), scores :_*);
+ score <- scores
)
- add(nounTok, elemId, Score(scoreOpt.get, nounTok.normText))
+ add(nounTok, elemId, Score(score))
// 3. Ask for sentence.
- val idxs = ns.tokens.flatMap(p => if (p.pos.startsWith("N")) Some(p.index)
- else None).toSeq
+ val idxs = ns.tokens.flatMap(p => if (p.pos.startsWith("N")) Some(p.index) else None).toSeq
val reqs = idxs.map(idx => NCSuggestionRequest(ns.tokens.map(_.origText).toSeq, idx))
- val resps =
+ val resps: Map[NCWordSuggestion, NCSuggestionRequest] =
syncExec(NCSuggestSynonymManager.suggestWords(reqs, parent = parent)).
flatMap { case (req, suggs) => suggs.map(_ -> req)
@@ -502,14 +512,19 @@ object NCContextWordEnricher extends NCServerEnricher {
(sugg, req) <- resps;
senScore = normalize(sugg.score);
(elemId, mdlCorpusSuggs) <- mdlCorpusData;
- elemScore = cfg.elements(elemId);
- corpusScoreOpt =
+ elemCfg = cfg.elements(elemId);
+ corpusScores =
mdlCorpusSuggs.get(
sugg.word.toLowerCase, stem(sugg.word), getSuggestionLemma(req, sugg)
)
- if corpusScoreOpt.isDefined && isMatched(elemScore, corpusScoreOpt.get, senScore)
- )
- add(ns.tokens(req.index), elemId, Score(senScore, sugg.word), Score(corpusScoreOpt.get, sugg.word))
+ // TODO:
+ if isMatched(elemCfg, senScore) && isMatched(elemCfg, corpusScores :_*)
+ ) {
+ add(ns.tokens(req.index), elemId, Score(senScore, Some(sugg.word)))
+//
+// for (corpusScore <- corpusScores)
+// add(ns.tokens(req.index), elemId, Score(corpusScore, Some(sugg.word)))
+ }
}
ns.ctxWordData = detected.map {
@@ -520,7 +535,7 @@ object NCContextWordEnricher extends NCServerEnricher {
logger.info("Sentence detected elements:")
for ((tok, elems) <- detected)
- logger.info(s"${tok.origText}: ${elems.mkString(", ")}")
+ logger.info(s"${tok.origText}: ${elems.sortBy(-_.scores.map(_.score).max).mkString(", ")}")
}
case None => // No-op.
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/sugsyn/NCSuggestSynonymManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/sugsyn/NCSuggestSynonymManager.scala
index 0b30012..c2c8d23 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/sugsyn/NCSuggestSynonymManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/sugsyn/NCSuggestSynonymManager.scala
@@ -78,7 +78,7 @@ object NCSuggestSynonymManager extends NCService {
case 200 =>
val data: util.List[util.List[NCWordSuggestion]] = GSON.fromJson(js, TYPE_RESP)
- data.asScala.map(p => if (p.isEmpty) Seq.empty else p.asScala.tail.toSeq).toSeq
+ data.asScala.map(p => if (p.isEmpty) Seq.empty else p.asScala.toSeq).toSeq
case _ =>
throw new NCE(
@@ -502,8 +502,10 @@ object NCSuggestSynonymManager extends NCService {
if (cnt.incrementAndGet() == batches.size) {
val min = minScoreOpt.getOrElse(DFLT_MIN_SCORE)
- val map: Map[NCSuggestionRequest, Seq[NCWordSuggestion]] = data.asScala.groupBy(_.request).map { case (req, map) =>
- req -> map.flatMap(_.suggestions.filter(_.score >= min).toSeq)
+ val map: Map[NCSuggestionRequest, Seq[NCWordSuggestion]] =
+ data.asScala.groupBy(_.request).map {
+ case (req, ress) =>
+ req -> ress.flatMap(_.suggestions.filter(_.score >= min).toSeq).sortBy(-_.score)
}
// TODO ? logic?
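For reference, the reshaped aggregation above (group suggestion batches by request, keep only suggestions at or above the minimum score, and sort each request's suggestions by descending score) can be sketched standalone. The Req/Sugg/Batch case classes below are simplified stand-ins for NCSuggestionRequest, NCWordSuggestion and the internal batch type, used only for illustration:

    // Standalone sketch of the group/filter/sort step; types are simplified stand-ins.
    object SuggestionGroupingSketch extends App {
        case class Req(txt: String)
        case class Sugg(word: String, score: Double)
        case class Batch(request: Req, suggestions: Seq[Sugg])

        val min = 0.5
        val data = Seq(
            Batch(Req("a"), Seq(Sugg("x", 0.3), Sugg("y", 0.9))),
            Batch(Req("a"), Seq(Sugg("z", 0.7))),
            Batch(Req("b"), Seq(Sugg("w", 0.6)))
        )

        val map: Map[Req, Seq[Sugg]] =
            data.groupBy(_.request).map {
                case (req, ress) =>
                    req -> ress.flatMap(_.suggestions.filter(_.score >= min)).sortBy(-_.score)
            }

        // Entry order may vary; each request maps to its suggestions sorted by descending score,
        // e.g. Req(a) -> List(Sugg(y,0.9), Sugg(z,0.7)) and Req(b) -> List(Sugg(w,0.6)).
        println(map)
    }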
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec.scala
index a8ac6c3..5a902f9 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec.scala
@@ -25,6 +25,7 @@ import org.junit.jupiter.api.Test
import java.util
import java.util.{Collections, Optional}
+import scala.collection.mutable.ArrayBuffer
import scala.jdk.CollectionConverters.{CollectionHasAsScala, MapHasAsJava, SeqHasAsJava, SetHasAsJava}
object NCContextWordSpecModel {
@@ -49,8 +50,8 @@ class NCContextWordSpecModel extends NCModel {
override def getName: String = this.getClass.getSimpleName
override def getVersion: String = "1.0.0"
- val MDL_LEVEL = 0.0
- val MDL_POLICY = ALL
+ val MDL_LEVEL = 0.4
+ val MDL_POLICY = AVERAGE
override def getContextWordModelConfig: Optional[NCContextWordModelConfig] = {
Optional.of(
@@ -68,18 +69,18 @@ class NCContextWordSpecModel extends NCModel {
override def getCorpus: util.List[String] =
Seq(
-// "I like drive my new BMW",
-// "BMW has the best engine",
-// "Luxury cars like Mercedes and BMW are prime targets",
-// "BMW will install side air bags up front",
+ "I like drive my new BMW",
+ "BMW has the best engine",
+ "Luxury cars like Mercedes and BMW are prime targets",
+ "BMW will install side air bags up front",
"A wild cat is very dangerous",
"A fox eats hens",
"The fox was already in your chicken house",
-// "What is the local temperature?",
-// "This is the first day of heavy rain",
-// "It is the beautiful day, the sun is shining"
+ "What is the local temperature?",
+ "This is the first day of heavy rain",
+ "It is the beautiful day, the sun is shining"
).asJava
}
)
@@ -87,9 +88,9 @@ class NCContextWordSpecModel extends NCModel {
override def getElements: util.Set[NCElement] =
Set(
- //Element("class:cars", MDL_LEVEL, Value("BMW")),
+ Element("class:cars", MDL_LEVEL, Value("BMW")),
Element("class:animal", MDL_LEVEL, Value("fox"), Value("cat",
"tomcat")),
- //Element("class:weather", MDL_LEVEL, Value("temperature"),
Value("rain"), Value("sun"))
+ Element("class:weather", MDL_LEVEL, Value("temperature"),
Value("rain"), Value("sun"))
).map(p => {
val e: NCElement = p
@@ -97,6 +98,8 @@ class NCContextWordSpecModel extends NCModel {
}).asJava
override def onContext(ctx: NCContext): NCResult = {
+ val varRes = ArrayBuffer.empty[String]
+
val ok =
ctx.getVariants.asScala.exists(v => {
val testGroupToks = v.asScala.toSeq.filter(_.getGroups.contains("testGroup"))
@@ -104,10 +107,19 @@ class NCContextWordSpecModel extends NCModel {
val elemIds = testGroupToks.map(_.getId).distinct.mkString(" ")
val words = testGroupToks.map(_.getOriginalText).mkString(" ")
+ val res = s"$elemIds $words"
+
+ varRes += res
+
NCContextWordSpecModel.expected == s"$elemIds $words"
})
- NCResult.text(if (ok) "OK" else "ERROR")
+ NCResult.text(
+ if (ok)
+ "OK"
+ else
+ s"ERROR: variant '${NCContextWordSpecModel.expected}' not
found. Found: ${varRes.mkString(", ")}"
+ )
}
}
@@ -119,22 +131,24 @@ class NCContextWordSpec extends NCTestContext {
private def check(txt: String, elemId: String, words: String*): Unit = {
NCContextWordSpecModel.expected = s"$elemId ${words.mkString(" ")}"
- require(getClient.ask(txt).getResult.get() == "OK")
+ val res = getClient.ask(txt).getResult.get()
+
+ require(res == "OK", s"Unexpected: $res")
}
@Test
private[ctxword] def test(): Unit = {
- //check("I want to have dogs and foxes", "class:animal", "dogs", "foxes")
+ check("I want to have dogs and foxes", "class:animal", "dogs", "foxes")
check("I bought dog's meat", "class:animal", "dog")
-// check("I bought meat dog's", "class:animal", "dog")
-//
-// check("I want to have a dog and fox", "class:animal", "dog", "fox")
-// check("I fed your fish", "class:animal", "fish")
-//
-// check("I like to drive my Porsche and Volkswagen", "class:cars", "Porsche", "Volkswagen")
-// check("Peugeot added motorcycles to its range in 1901", "class:cars", "Peugeot", "motorcycles")
-//
-// check("The frost is possible today", "class:weather", "frost")
-// check("There's a very strong wind from the east now", "class:weather", "wind")
+ check("I bought meat dog's", "class:animal", "dog")
+
+ check("I want to have a dog and fox", "class:animal", "dog", "fox")
+ check("I fed your fish", "class:animal", "fish")
+
+ check("I like to drive my Porsche and Volkswagen", "class:cars", "Porsche", "Volkswagen")
+ check("Peugeot added motorcycles to its range in 1901", "class:cars", "Peugeot", "motorcycles")
+
+ check("The frost is possible today", "class:weather", "frost")
+ check("There's a very strong wind from the east now", "class:weather", "wind")
}
}