This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-520 by this push:
new 877ce967 WIP.
877ce967 is described below
commit 877ce967f7981c0ab105d580504ec5cc1d00eb9a
Author: Sergey Kamov <[email protected]>
AuthorDate: Wed Dec 14 15:45:22 2022 +0400
WIP.
---
.../nlp/enrichers/NCEnStopWordsTokenEnricher.scala | 25 ++++++++++------------
1 file changed, 11 insertions(+), 14 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
index 4312ea54..25dba557 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
@@ -31,7 +31,7 @@ import scala.concurrent.ExecutionContext
/**
* Companion helper.
*/
-private object NCEnStopWordsTokenEnricher:
+private object NCEnStopWordsTokenEnricher extends LazyLogging:
// Condition types.
private type Wildcard = (String, String)
private type Word = String
@@ -82,6 +82,11 @@ private object NCEnStopWordsTokenEnricher:
"--" // Synthetic POS.
)
+ // Stemmatization is done already by generator.
+ // It is initialized in the companion for test performance reasons.
+ private val FIRST_WORDS: Set[String] = read("stopwords/first_words.txt.gz")
+ private val NOUN_WORDS: Set[String] = read("stopwords/noun_words.txt.gz")
+
private val STOP_BEFORE_STOP: Seq[Word] = Seq("DT", "PRP", "PRP$", "WDT",
"WP", "WP$", "WRB")
private val Q_POS = Set("``", "''")
private val PERCENTS = Set(
@@ -95,6 +100,7 @@ private object NCEnStopWordsTokenEnricher:
"percent"
)
+ private def read(path: String): Set[String] = NCUtils.readTextGzipResource(path, "UTF-8", logger).toSet
private def getPos(t: NCToken): String = t.get("pos").getOrElse(throw new NCException(s"POS not found in token: ${t.keysSet}"))
private def getLemma(t: NCToken): String = t.get("lemma").getOrElse(throw new NCException(s"Lemma not found in token: ${t.keysSet}"))
private def isQuote(t: NCToken): Boolean = Q_POS.contains(getPos(t))
@@ -193,11 +199,11 @@ class NCEnStopWordsTokenEnricher(
exclSet: Set[String] = Set.empty,
stemmer: NCStemmer = new NCEnStemmer
) extends NCTokenEnricher with LazyLogging:
+ require(stemmer != null, "Stemmer cannot be null.")
+
private var addStems: Set[String] = _
private var exclStems: Set[String] = _
private var percents: Set[String] = _
- private var firstWords: Set[String] = _
- private var nounWords: Set[String] = _
private var stopWords: StopWordHolder = _
private var exceptions: StopWordHolder = _
@@ -206,7 +212,6 @@ class NCEnStopWordsTokenEnricher(
init()
- private def read(path: String): Set[String] = NCUtils.readTextGzipResource(path, "UTF-8", logger).toSet
private def getStem(s: String): String = stemmer.stem(s.toLowerCase)
private def toStemKey(toks: Seq[NCToken]): String =
toks.map(_.getText).map(getStem).mkString(" ")
@@ -316,14 +321,6 @@ class NCEnStopWordsTokenEnricher(
percents = PERCENTS.map(getStem)
- // Stemmatization is done already by generator.
- NCUtils.execPar(
- Seq(
- () => firstWords = read("stopwords/first_words.txt.gz"),
- () => nounWords = read("stopwords/noun_words.txt.gz")
- )
- )(ExecutionContext.Implicits.global)
-
// Case sensitive.
val m = readStopWords(
NCUtils.readResource("stopwords/stop_words.txt", "UTF-8", logger)
@@ -605,7 +602,7 @@ class NCEnStopWordsTokenEnricher(
// All sentence first stopword + first non stop word.
val startToks = toks.takeWhile(isStopWord) ++ toks.find(p => !isStopWord(p)).map(p => p)
- for (startTok <- startToks; tup <- origToks.filter(_._1.head == startTok); key = tup._2 if firstWords.contains(key) && !isException(tup._1))
+ for (startTok <- startToks; tup <- origToks.filter(_._1.head == startTok); key = tup._2 if FIRST_WORDS.contains(key) && !isException(tup._1))
tup._1.foreach(tok => stops += tok)
foundKeys += key
@@ -615,7 +612,7 @@ class NCEnStopWordsTokenEnricher(
// +-------------------------------------------------+
for (tup <- origToks; key = tup._2 if !foundKeys.contains(key) && !isException(tup._1))
foundKeys.find(key.startsWith) match
- case Some(s) => if nounWords.contains(key.substring(s.length).strip) then tup._1.foreach(tok => stops += tok)
+ case Some(s) => if NOUN_WORDS.contains(key.substring(s.length).strip) then tup._1.foreach(tok => stops += tok)
case None => ()
// +-------------------------------------------------+