This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git


The following commit(s) were added to refs/heads/NLPCRAFT-520 by this push:
     new 863089d6 WIP.
863089d6 is described below

commit 863089d6761e356f23439868ad63648933d390b9
Author: Sergey Kamov <[email protected]>
AuthorDate: Sun Dec 18 21:32:38 2022 +0400

    WIP.
---
 .../apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala    | 5 +++--
 .../apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala    | 4 +++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
index aa7cb9d0..2efc5468 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
@@ -34,7 +34,8 @@ import org.apache.nlpcraft.internal.util.NCUtils as U
   * metadata property before this enricher in your [[NCPipeline pipeline]].
   *
   * @param dictRes Relative path, absolute path or URL to the dictionary file. 
The dictionary should have a simple
-  *         plain text format with *one lemma per line* with no empty lines, 
header or other comments allowed.
+  *         plain text format with *one lemma per line*; empty lines are 
skipped, duplicates are ignored, and header or other comment lines are allowed.
+  *         Headers are lines starting with the **#** symbol. The dictionary 
is searched by the **lemmas** of the input words; case is ignored.
   */
 //noinspection DuplicatedCode,ScalaWeakerAccess
 class NCDictionaryTokenEnricher(dictRes: String) extends NCTokenEnricher with 
LazyLogging:
@@ -42,7 +43,7 @@ class NCDictionaryTokenEnricher(dictRes: String) extends 
NCTokenEnricher with La
 
     init()
 
-    private def init(): Unit = dict = U.readLines(res = dictRes, filterText = 
true, log = logger).toSet
+    private def init(): Unit = dict = U.readLines(res = dictRes, filterText = 
true, convert = _.toLowerCase, log = logger).toSet
 
     /** @inheritdoc */
     override def enrich(req: NCRequest, cfg: NCModelConfig, toks: 
List[NCToken]): Unit =
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
index d393a2d4..85efc02d 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
@@ -36,7 +36,9 @@ import java.util.Objects
   * Read more about stemming [[https://en.wikipedia.org/wiki/Stemming here]].
   * Stemming is used here because it is too difficult to be based on more 
accurate `lemma` approach for swear words.
   *
-  * @param dictRes Path to the swear dictionary. This swear dictionary should 
has a simple plain text format with one dictionary word on one line.
+  * @param dictRes Path to the swear dictionary. The dictionary should have a 
simple
+  *         plain text format with *one word per line*; empty lines are 
skipped, duplicates are ignored, and header or other comment lines are allowed.
+  *         Headers are lines starting with the **#** symbol. The dictionary 
is searched by the **stems** of the input words; case is ignored.
   * @param stemmer Stemmer implementation for the dictionary language.
   */
 //noinspection ScalaWeakerAccess

Reply via email to