This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-41
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-41 by this push:
new 840ee68 WIP.
840ee68 is described below
commit 840ee683526de3a264b7e98a87d0e9b9b01c6c5a
Author: Sergey Kamov <[email protected]>
AuthorDate: Sat Apr 25 21:48:39 2020 +0300
WIP.
---
.../model/tools/synonyms/NCSynonymsGenerator.scala | 104 ++++++++++++---------
1 file changed, 61 insertions(+), 43 deletions(-)
diff --git
a/src/main/scala/org/apache/nlpcraft/model/tools/synonyms/NCSynonymsGenerator.scala
b/src/main/scala/org/apache/nlpcraft/model/tools/synonyms/NCSynonymsGenerator.scala
index 259b98c..a0ff611 100644
---
a/src/main/scala/org/apache/nlpcraft/model/tools/synonyms/NCSynonymsGenerator.scala
+++
b/src/main/scala/org/apache/nlpcraft/model/tools/synonyms/NCSynonymsGenerator.scala
@@ -39,47 +39,49 @@ case class NCSynonymsGenerator(url: String, modelPath:
String, minFactor: Double
// normalized - normalized bert value.
// score = normalized * weight + ftext * weight
// both `weights` = 1
- case class Suggestion(word: String, bert: String, normalized: String,
ftext: String, score: String) {
- override def toString: String = s"$word [bert=$bert, ftext=$ftext,
normalized=$normalized, score=$score]"
- }
+ case class Suggestion(word: String, bert: String, normalized: String,
ftext: String, score: String)
case class Request(sentence: String, simple: Boolean)
case class Response(data: java.util.ArrayList[Suggestion])
private val GSON = new Gson
private val TYPE_RESP: Type = new TypeToken[Response]() {}.getType
+ private val SEPARATORS = Seq('?', ',', '.', '-', '!')
- private def split(s: String): Seq[String] = s.split("
").toSeq.map(_.trim).filter(_.nonEmpty)
+ private val HANDLER = new ResponseHandler[Seq[Suggestion]]() {
+ override def handleResponse(resp: HttpResponse): Seq[Suggestion] = {
+ val code = resp.getStatusLine.getStatusCode
+ val e = resp.getEntity
- private def ask(client: CloseableHttpClient, sen: String): Seq[Suggestion]
= {
- val post = new HttpPost(url)
+ val js = if (e != null) EntityUtils.toString(e) else null
- post.setHeader("Content-Type", "application/json")
- post.setEntity(new StringEntity(GSON.toJson(Request(sen, simple =
false)), "UTF-8"))
+ if (js == null)
+ throw new RuntimeException(s"Unexpected empty response
[code=$code]")
+
+ code match {
+ case 200 ⇒
+ val data: Response = GSON.fromJson(js, TYPE_RESP)
- val h = new ResponseHandler[Seq[Suggestion]]() {
- override def handleResponse(resp: HttpResponse): Seq[Suggestion] =
{
- val code = resp.getStatusLine.getStatusCode
- val e = resp.getEntity
+ data.data.asScala
- val js = if (e != null) EntityUtils.toString(e) else null
+ case 400 ⇒ throw new RuntimeException(js)
+ case _ ⇒ throw new RuntimeException(s"Unexpected response
[code=$code, text=$js]")
+ }
+ }
+ }
- if (js == null)
- throw new RuntimeException(s"Unexpected empty response
[code=$code]")
+ private def split(s: String): Seq[String] = s.split("
").toSeq.map(_.trim).filter(_.nonEmpty)
- code match {
- case 200 ⇒
- val data: Response = GSON.fromJson(js, TYPE_RESP)
+ private def toStem(s: String): String =
split(s).map(NCNlpPorterStemmer.stem).mkString(" ")
- data.data.asScala
+ // TODO: multithreading.
+ private def ask(client: CloseableHttpClient, sen: String): Seq[Suggestion]
= {
+ val post = new HttpPost(url)
- case 400 ⇒ throw new RuntimeException(js)
- case _ ⇒ throw new RuntimeException(s"Unexpected response
[code=$code, text=$js]")
- }
- }
- }
+ post.setHeader("Content-Type", "application/json")
+ post.setEntity(new StringEntity(GSON.toJson(Request(sen, simple =
false)), "UTF-8"))
try
- client.execute(post, h)
+ client.execute(post, HANDLER)
finally
post.releaseConnection()
}
@@ -95,21 +97,22 @@ case class NCSynonymsGenerator(url: String, modelPath:
String, minFactor: Double
val client = HttpClients.createDefault
case class Word(word: String) {
+ require(!word.contains(" "), s"Word cannot contains spaces: $word")
+ require(word.forall(ch ⇒ ch.isLetterOrDigit || ch == ''' ||
SEPARATORS.contains(ch)), s"Unsupported symbols: $word")
+
val stem: String = NCNlpPorterStemmer.stem(word)
}
- val examples: Seq[Seq[Word]] =
+ val examples =
mdl.getExamples.asScala.
- // TODO: Is it enough?
- map(_.replaceAll("\\?", " ?")).
- map(_.replaceAll("\\.", " .")).
- map(_.replaceAll(",", " ,")).
- map(_.replaceAll("!", " !")).
+ map(s ⇒ SEPARATORS.foldLeft(s)((s, ch) ⇒
s.replaceAll(s"\\$ch", s" $ch "))).
map(split).
map(_.map(Word)).
toSeq
- val elemSyns = mdl.getElements.asScala.map(e ⇒ e.getId →
e.getSynonyms.asScala.flatMap(parser.expand)).toMap
+ val elemSyns =
+ mdl.getElements.asScala.map(e ⇒ e.getId →
e.getSynonyms.asScala.flatMap(parser.expand)).
+ map { case (id, seq) ⇒ id → seq.map(txt ⇒
split(txt).map(Word))}.toMap
val cache = mutable.HashMap.empty[String, Seq[Suggestion]].withDefault(
new (String ⇒ Seq[Suggestion]) {
@@ -121,10 +124,7 @@ case class NCSynonymsGenerator(url: String, modelPath:
String, minFactor: Double
elemSyns.map {
case (elemId, elemSyns) ⇒
val stemsSyns: Seq[(String, String)] =
- elemSyns.
- map(text ⇒ text → split(text).map(Word)).
- filter { case( _, words) ⇒ words.size == 1 }.
- map { case(text, words) ⇒ words.head.stem → text }
+ elemSyns.filter(_.size == 1).map(words ⇒
words.head.stem → words.head.word)
val hs: Seq[Suggestion] =
examples.flatMap(exWords ⇒ {
@@ -144,7 +144,7 @@ case class NCSynonymsGenerator(url: String, modelPath:
String, minFactor: Double
cache(
exWords.
zipWithIndex.map { case (w, i1) ⇒
if (idxs.contains(i1)) syn else w.word }.
- zipWithIndex.map { case (w, i2) ⇒
if (i2 == idx) s"$w#" else w}.
+ zipWithIndex.map { case (s, i2) ⇒
if (i2 == idx) s"$s#" else s}.
mkString(" "))
)
)
@@ -155,7 +155,7 @@ case class NCSynonymsGenerator(url: String, modelPath:
String, minFactor: Double
elemId → hs
}.filter(_._2.nonEmpty)
- val allSyns = elemSyns.flatMap(_._2).toSet
+ val allSynsStems = elemSyns.flatMap(_._2).flatten.map(_.stem).toSet
val table = NCAsciiTable()
@@ -163,12 +163,30 @@ case class NCSynonymsGenerator(url: String, modelPath:
String, minFactor: Double
allSuggs.foreach { case (elemId, elemSuggs) ⇒
elemSuggs.
- groupBy(_.word).
- map { case (_, group) ⇒
group.sortBy(_.score.toDouble).reverse.head }. // Drops repeated.
- toSeq.sortBy(_.score.toDouble).reverse.
- filter(p ⇒ !allSyns.contains(p.word)). // TODO: drop by stem,
not by word as is
+ map(sugg ⇒ (sugg, toStem(sugg.word))).
+ groupBy { case (_, stem) ⇒ stem }.
+ filter { case (stem, _) ⇒ !allSynsStems.contains(stem) }.
+ map { case (_, group) ⇒
+ val seq = group.map { case (sugg, _) ⇒ sugg
}.sortBy(-_.score.toDouble)
+
+ // Drops repeated.
+ (seq.head, seq.length)
+ }.
+ // TODO: develop more intelligent sorting.
+ toSeq.sortBy { case (sugg, cnt) ⇒ (-cnt ,
-sugg.score.toDouble) }.
zipWithIndex.
- foreach { case (sugg, sugIdx) ⇒ table += (if (sugIdx == 0)
elemId else " ", sugg) }
+ foreach { case ((sugg, cnt), sugIdx) ⇒
+ table += (
+ if (sugIdx == 0) elemId else " ",
+ s"${sugg.word} " +
+ s"[count=$cnt, " +
+ s"bert=${sugg.bert}, " +
+ s"ftext=${sugg.ftext}, " +
+ s"norm=${sugg.normalized}, " +
+ s"score=${sugg.score}" +
+ s"]"
+ )
+ }
}
table.render()