This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-520 by this push:
new ac03dc8e WIP.
ac03dc8e is described below
commit ac03dc8efc9e7cee4a0d8d229f7e71f0b0fb88c4
Author: Sergey Kamov <[email protected]>
AuthorDate: Sun Dec 18 21:52:22 2022 +0400
WIP.
---
.../nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala | 10 ++++++----
.../nlpcraft/nlp/enrichers/tools/NCEnStopWordGenerator.scala | 4 ++--
2 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
index c4bc6b46..6dfb1b2c 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
@@ -207,9 +207,11 @@ class NCEnStopWordsTokenEnricher(
private var stopWords: StopWordHolder = _
private var exceptions: StopWordHolder = _
- private case class TokenExtra(lemma: String, stemTxt: String):
- val stemLemma: String = getStem(lemma)
-
+ private case class TokenExtra(lemma: String, stemTxt: String, stemLemma: String)
+ private object TokenExtra:
+ def apply(t: NCToken): TokenExtra =
+ val lemma = getLemma(t)
+ new TokenExtra(lemma, getStem(t.getText), getStem(lemma))
init()
private def getStem(s: String): String = stemmer.stem(s.toLowerCase)
@@ -540,7 +542,7 @@ class NCEnStopWordsTokenEnricher(
val extraToks =
scala.collection.mutable.LinkedHashMap.empty[NCToken, TokenExtra] ++=
- toks.map(t => t -> TokenExtra(getLemma(t), getStem(t.getText)))
+ toks.map(t => t -> TokenExtra(t))
for ((tok, extra) <- extraToks)
val idx = tok.getIndex
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/tools/NCEnStopWordGenerator.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/tools/NCEnStopWordGenerator.scala
index 410edcba..adb66e4a 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/tools/NCEnStopWordGenerator.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/tools/NCEnStopWordGenerator.scala
@@ -17,8 +17,8 @@
package org.apache.nlpcraft.nlp.enrichers.tools
-import opennlp.tools.stemmer.PorterStemmer
import org.apache.nlpcraft.internal.util.NCUtils
+import org.apache.nlpcraft.nlp.stemmer.NCEnStemmer
import scala.collection.mutable
@@ -26,7 +26,7 @@ import scala.collection.mutable
* Generates first word sequences.
*/
object NCEnStopWordGenerator:
- private final lazy val stemmer = new PorterStemmer
+ private final lazy val stemmer = new NCEnStemmer
// Output files.
private val FIRST_WORDS_FILE = "first_words.txt"