[incubator-nlpcraft] branch NLPCRAFT-70_NEW updated: WIP.

sergeykamov Thu, 08 Jul 2021 02:57:04 -0700

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-70_NEW
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git



The following commit(s) were added to refs/heads/NLPCRAFT-70_NEW by this push:
     new 486bb34  WIP.
486bb34 is described below

commit 486bb344c0c9dbc63ebc53abe14c200d5ee280bd
Author: Sergey Kamov <[email protected]>
AuthorDate: Thu Jul 8 12:56:49 2021 +0300

    WIP.
---
 .../ctxword/NCContextWordCategoriesEnricher.scala  | 60 ++++++++++------------
 1 file changed, 27 insertions(+), 33 deletions(-)

diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordCategoriesEnricher.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordCategoriesEnricher.scala
index fe88cdc..ef7ac9e 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordCategoriesEnricher.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordCategoriesEnricher.scala
@@ -50,7 +50,7 @@ object NCContextWordCategoriesEnricher extends 
NCServerEnricher {
 
     private case class Reason(word: String, suggestionConfidence: Double, 
corpusConfidence: Double) {
         override def toString: String =
-            s"Word: $word, suggestionConf=${FMT.format(suggestionConfidence)}, 
corpusConf=${FMT.format(corpusConfidence)}"
+            s"Word: $word, confidences: 
suggestion=${FMT.format(suggestionConfidence)}, 
corpus=${FMT.format(corpusConfidence)}"
     }
 
     private case class Confidence(value: Double, reason: Option[Reason] = 
None) {
@@ -88,12 +88,14 @@ object NCContextWordCategoriesEnricher extends 
NCServerEnricher {
     // Service which is responsible for all confidence calculations.
     private object ConfMath {
         /**
-          * Squeeze confidences values of some words to one value.
+          * Squeeze word's confidences values list (result of corpus 
processing) to single value.
           *
-          * @param confs Confidences values for some word. It is result of 
corpus processing.
+          * @param confs Confidence values for some word.
+          * @return Calculated single value. `None` means that this word 
shouldn't be taken into account for element.
           */
-        def squeeze(confs: Seq[Double]): Option[Double] =
-             // Drops if there is not enough data.
+        def squeeze(confs: Seq[Double]): Option[Double] = {
+            // Drops if there is not enough data.
+            // For one element we have few samples. Each word should be 
offered few times.
             if (confs.length < 3)
                 None
             else {
@@ -104,9 +106,10 @@ object NCContextWordCategoriesEnricher extends 
NCServerEnricher {
 
                 Some(maxN.sum / maxN.length)
             }
+        }
 
         /**
-          * Calculates confidence values based on suggested confidence and 
corpus confidence.
+          * Calculates confidence values based on suggested confidence for 
given word and corpus confidence.
           *
           * @param suggConf Suggestion confidence for noun of given sentence.
           * @param corpusConf Corpus confidence which found via suggestion, 
co-reference.
@@ -196,7 +199,8 @@ object NCContextWordCategoriesEnricher extends 
NCServerEnricher {
                     val idxs =
                         getIndexes(corpusNlp.map(_.normalWord), 
elemSingleValsNorm) ++
                         getIndexes(corpusNlp.map(_.stem), elemSingleValsStem) 
++
-                        // Sample can have word in plural forms. We can 
compare them with synonyms values.
+                        // Sample can have word in plural forms.
+                        // We can compare them with synonyms values (assuming 
that model synonym values are defined as lemmas)
                         getIndexes(corpusNlp.map(p => norm(p.lemma)), 
elemSingleValsNorm)
 
                     def mkRequest(idx: Int, syn: String): NCSuggestionRequest 
= {
@@ -222,9 +226,9 @@ object NCContextWordCategoriesEnricher extends 
NCServerEnricher {
             }
 
     /**
+      * Context word server returned values have confidence in range (0..2).
       *
-      * @param conf
-      * @return
+      * @param conf Context word server confidence value.
       */
     private def normalizeConf(conf: Double): Double = conf / MAX_CTXWORD_SCORE
 
@@ -283,9 +287,7 @@ object NCContextWordCategoriesEnricher extends 
NCServerEnricher {
     private def substitute(words: Seq[String], word: String, index: Int): 
Seq[String] = {
         require(index < words.length)
 
-        words.zipWithIndex.map { case (w, i) => if (i != index) w
-        else word
-        }
+        words.zipWithIndex.map { case (w, i) => if (i != index) w else word }
     }
 
     /**
@@ -329,10 +331,7 @@ object NCContextWordCategoriesEnricher extends 
NCServerEnricher {
                 t #= ("Request", "Responses")
 
                 for ((req, resp) <- respsSeq)
-                    t += (
-                        req,
-                        s"${resp.map(p => 
s"${p.word}=${FMT.format(normalizeConf(p.score))}").mkString(", ")}"
-                    )
+                    t += (req, s"${resp.map(p => 
s"${p.word}=${FMT.format(normalizeConf(p.score))}").mkString(", ")}")
 
                 t.info(logger, Some("Corpus requests:"))
             }
@@ -393,13 +392,12 @@ object NCContextWordCategoriesEnricher extends 
NCServerEnricher {
                         val stemsAll = stems -- normals.keySet
                         val lemmasAll = lemmas -- normals.keySet -- 
stems.keySet
 
+                        def mkDebugElementCell(normsSize: Int, stemsSize: Int, 
lemmasSize: Int): String =
+                            s"Element: $elemId [normals=$normsSize, 
stems=$stemsSize, lemmas=$lemmasSize]"
+
                         if (DEBUG_MODE)
                             tabAll += (
-                                s"Element: $elemId [" +
-                                s"normals=${normalsAll.size}, " +
-                                s"stems=${stemsAll.size}, " +
-                                s"lemmas=${lemmasAll.size}" +
-                                s"]",
+                                mkDebugElementCell(normalsAll.size, 
stemsAll.size, lemmasAll.size),
                                 toStr(
                                     normalsAll.toSeq.
                                         sortBy(p => (-p._2.max, 
-p._2.size)).map(
@@ -416,24 +414,21 @@ object NCContextWordCategoriesEnricher extends 
NCServerEnricher {
                                 }
                             }
 
-                        val normalsNorm = squeeze(normalsAll)
-                        val stemsNorm = squeeze(stemsAll)
-                        val lemmasNorm = squeeze(lemmasAll)
+                        val normalsSingle = squeeze(normalsAll)
+                        val stemsSingle = squeeze(stemsAll)
+                        val lemmasSingle = squeeze(lemmasAll)
 
                         if (DEBUG_MODE)
                             tabNorm += (
-                                s"Element: $elemId [" +
-                                s"normals=${normalsNorm.size}, " +
-                                s"stems=${stemsNorm.size}, " +
-                                s"lemmas=${lemmasNorm.size}",
+                                mkDebugElementCell(normalsSingle.size, 
stemsSingle.size, lemmasSingle.size),
                                 toStr(
-                                    normalsNorm.toSeq.sortBy(-_._2).map(
+                                    normalsSingle.toSeq.sortBy(-_._2).map(
                                         { case (k, factor) => 
s"$k=${FMT.format(factor)}" }
                                     )
                                 )
                             )
 
-                        elemId -> ElementData(normalsNorm, stemsNorm, 
lemmasNorm)
+                        elemId -> ElementData(normalsSingle, stemsSingle, 
lemmasSingle)
                     }
 
             if (DEBUG_MODE) {
@@ -509,9 +504,8 @@ object NCContextWordCategoriesEnricher extends 
NCServerEnricher {
                         val reqs = idxs.map(idx => 
NCSuggestionRequest(ns.tokens.map(_.origText).toSeq, idx))
 
                         val resps: Map[NCWordSuggestion, NCSuggestionRequest] =
-                            syncExec(
-                                NCSuggestSynonymManager.suggestWords(reqs, 
parent = parent)
-                            ).flatMap { case (req, suggs) => suggs.map(_ -> 
req) }
+                            
syncExec(NCSuggestSynonymManager.suggestWords(reqs, parent = parent)).
+                                flatMap { case (req, suggs) => suggs.map(_ -> 
req) }
 
                         if (DEBUG_MODE) {
                             val t = NCAsciiTable()

[incubator-nlpcraft] branch NLPCRAFT-70_NEW updated: WIP.

Reply via email to