This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-520 by this push:
new 863089d6 WIP.
863089d6 is described below
commit 863089d6761e356f23439868ad63648933d390b9
Author: Sergey Kamov <[email protected]>
AuthorDate: Sun Dec 18 21:32:38 2022 +0400
WIP.
---
.../apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala | 5 +++--
.../apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala | 4 +++-
2 files changed, 6 insertions(+), 3 deletions(-)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
index aa7cb9d0..2efc5468 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
@@ -34,7 +34,8 @@ import org.apache.nlpcraft.internal.util.NCUtils as U
* metadata property before this enricher in your [[NCPipeline pipeline]].
*
* @param dictRes Relative path, absolute path or URL to the dictionary file.
The dictionary should have a simple
- * plain text format with *one lemma per line* with no empty lines,
header or other comments allowed.
+ * plain text format with *one lemma per line*, empty lines are
skipped, duplicates ignored, header or other comments allowed.
+ * Headers are lines starting with the **#** symbol. Search in the
dictionary is performed by input word **lemmas**; case is ignored.
*/
//noinspection DuplicatedCode,ScalaWeakerAccess
class NCDictionaryTokenEnricher(dictRes: String) extends NCTokenEnricher with
LazyLogging:
@@ -42,7 +43,7 @@ class NCDictionaryTokenEnricher(dictRes: String) extends
NCTokenEnricher with La
init()
- private def init(): Unit = dict = U.readLines(res = dictRes, filterText =
true, log = logger).toSet
+ private def init(): Unit = dict = U.readLines(res = dictRes, filterText =
true, convert = _.toLowerCase, log = logger).toSet
/** @inheritdoc */
override def enrich(req: NCRequest, cfg: NCModelConfig, toks:
List[NCToken]): Unit =
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
index d393a2d4..85efc02d 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
@@ -36,7 +36,9 @@ import java.util.Objects
* Read more about stemming [[https://en.wikipedia.org/wiki/Stemming here]].
* Stemming is used here because it is too difficult to be based on more
accurate `lemma` approach for swear words.
*
- * @param dictRes Path to the swear dictionary. This swear dictionary should
has a simple plain text format with one dictionary word on one line.
+ * @param dictRes Path to the swear dictionary. The dictionary should have a
simple
+ * plain text format with *one word per line*, empty lines are
skipped, duplicates ignored, header or other comments allowed.
+ * Headers are lines starting with the **#** symbol. Search in the
dictionary is performed by input word **stems**; case is ignored.
* @param stemmer Stemmer implementation for the dictionary language.
*/
//noinspection ScalaWeakerAccess