This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-41
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit 6d7d0e2169cf9a96ddac3cb7a7a566d199ac3a8c
Author: Sergey Kamov <[email protected]>
AuthorDate: Sun Apr 26 11:44:54 2020 +0300

    WIP.
---
 .../model/tools/synonyms/NCSynonymsGenerator.scala | 39 +++++++++++-----------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/src/main/scala/org/apache/nlpcraft/model/tools/synonyms/NCSynonymsGenerator.scala b/src/main/scala/org/apache/nlpcraft/model/tools/synonyms/NCSynonymsGenerator.scala
index a0ff611..20b0f18 100644
--- a/src/main/scala/org/apache/nlpcraft/model/tools/synonyms/NCSynonymsGenerator.scala
+++ b/src/main/scala/org/apache/nlpcraft/model/tools/synonyms/NCSynonymsGenerator.scala
@@ -72,6 +72,7 @@ case class NCSynonymsGenerator(url: String, modelPath: String, minFactor: Double
     private def split(s: String): Seq[String] = s.split(" 
").toSeq.map(_.trim).filter(_.nonEmpty)
 
     private def toStem(s: String): String = 
split(s).map(NCNlpPorterStemmer.stem).mkString(" ")
+    private def toStemWord(s: String): String = NCNlpPorterStemmer.stem(s)
 
     // TODO: multithreading.
     private def ask(client: CloseableHttpClient, sen: String): Seq[Suggestion] 
= {
@@ -96,23 +97,28 @@ case class NCSynonymsGenerator(url: String, modelPath: String, minFactor: Double
 
         val client = HttpClients.createDefault
 
-        case class Word(word: String) {
+        case class Word(word: String, stem: String) {
             require(!word.contains(" "), s"Word cannot contains spaces: $word")
-            require(word.forall(ch ⇒ ch.isLetterOrDigit || ch == ''' || 
SEPARATORS.contains(ch)), s"Unsupported symbols: $word")
-
-            val stem: String = NCNlpPorterStemmer.stem(word)
+            require(
+                word.forall(ch ⇒
+                    ch.isLetterOrDigit ||
+                    ch == ''' ||
+                    SEPARATORS.contains(ch)
+                ),
+                s"Unsupported symbols: $word"
+            )
         }
 
         val examples =
             mdl.getExamples.asScala.
                 map(s ⇒ SEPARATORS.foldLeft(s)((s, ch) ⇒ 
s.replaceAll(s"\\$ch", s" $ch "))).
                 map(split).
-                map(_.map(Word)).
+                map(_.map(p ⇒ Word(p, toStemWord(p)))).
                 toSeq
 
         val elemSyns =
             mdl.getElements.asScala.map(e ⇒ e.getId → 
e.getSynonyms.asScala.flatMap(parser.expand)).
-                map { case (id, seq) ⇒ id → seq.map(txt ⇒ 
split(txt).map(Word))}.toMap
+                map { case (id, seq) ⇒ id → seq.map(txt ⇒ split(txt).map(p ⇒ 
Word(p, toStemWord(p))))}.toMap
 
         val cache = mutable.HashMap.empty[String, Seq[Suggestion]].withDefault(
             new (String ⇒ Seq[Suggestion]) {
@@ -123,26 +129,19 @@ case class NCSynonymsGenerator(url: String, modelPath: String, minFactor: Double
         val allSuggs =
             elemSyns.map {
                 case (elemId, elemSyns) ⇒
-                    val stemsSyns: Seq[(String, String)] =
-                        elemSyns.filter(_.size == 1).map(words ⇒ 
words.head.stem → words.head.word)
+                    val elemSingleSyns = elemSyns.filter(_.size == 
1).map(_.head)
+                    val elemStems = elemSingleSyns.map(_.stem)
 
                     val hs: Seq[Suggestion] =
-                        examples.flatMap(exWords ⇒ {
-                            val exStems = exWords.map(_.stem)
-
-                            val idxs =
-                                exStems.flatMap(stem ⇒
-                                    stemsSyns.find(_._1 == stem) match {
-                                        case Some(p) ⇒ 
Some(exStems.indexOf(p._1))
-                                        case None ⇒ None
-                                    }
-                                )
+                        examples.flatMap(example ⇒ {
+                            val exStems = example.map(_.stem)
+                            val idxs = exStems.flatMap(s ⇒ if 
(elemStems.contains(s)) Some(exStems.indexOf(s)) else None)
 
                             if (idxs.nonEmpty)
-                                stemsSyns.map(_._2).flatMap(syn ⇒
+                                elemSingleSyns.map(_.word).flatMap(syn ⇒
                                     idxs.flatMap(idx ⇒
                                         cache(
-                                            exWords.
+                                            example.
                                             zipWithIndex.map { case (w, i1) ⇒ 
if (idxs.contains(i1)) syn else w.word }.
                                             zipWithIndex.map { case (s, i2) ⇒ 
if (i2 == idx) s"$s#" else s}.
                                             mkString(" "))

Reply via email to