[incubator-nlpcraft] branch NLPCRAFT-520 updated: WIP.

sergeykamov Fri, 16 Dec 2022 22:12:58 -0800

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git



The following commit(s) were added to refs/heads/NLPCRAFT-520 by this push:
     new d6e42140 WIP.
d6e42140 is described below

commit d6e42140ad5102b1766d3d53823fa7f09baa15e8
Author: Sergey Kamov <[email protected]>
AuthorDate: Sat Dec 17 10:08:39 2022 +0400

    WIP.
---
 .../apache/nlpcraft/internal/util/NCUtils.scala    | 30 ++++++++++------------
 .../nlp/enrichers/NCDictionaryTokenEnricher.scala  |  2 +-
 .../nlp/enrichers/NCEnStopWordsTokenEnricher.scala |  4 +--
 .../nlp/enrichers/NCQuotesTokenEnricher.scala      |  4 +--
 .../nlp/enrichers/NCSwearWordsTokenEnricher.scala  |  2 +-
 5 files changed, 20 insertions(+), 22 deletions(-)

diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
index 6c0160a6..8804faac 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
@@ -455,18 +455,17 @@ object NCUtils extends LazyLogging:
     /**
       *  Reads lines from given resource.
       *
-      * @param res
-      * @param enc
-      * @param strip
-      * @param convert
-      * @param filterText
-      * @param log
-      * @return
+      * @param res Resource, file absolute or relative path or input stream.
+      * @param enc Encoding. Default value is "UTF-8".
+      * @param strip Strip flag. If `true` it strips all read lines. Default 
value is `true`.
+      * @param convert Line conversion method. Applied after `strip`. By 
default it passes lines as is.
+      * @param filterText Filtering text flag. If `true` it skips empty lines 
and lines with headers (# symbol). Default value is `false`.
+      * @param log Logger.
       */
     def readLines(
         res: String | File | InputStream,
         enc: String = "UTF-8",
-        strip: Boolean = false,
+        strip: Boolean = true,
         convert: String => String = s => s,
         filterText: Boolean = false,
         log: Logger = logger
@@ -487,19 +486,18 @@ object NCUtils extends LazyLogging:
         catch case e: IOException => E(s"Failed to read stream: $res", e)
 
     /**
-      *
+      * @param res Gzip resource, file absolute or relative path.
       * @param res
-      * @param enc
-      * @param strip
-      * @param convert
-      * @param filterText
-      * @param log
-      * @return
+      * @param enc        Encoding. Default value is "UTF-8".
+      * @param strip      Strip flag. If `true` it strips all read lines. 
Default value is `true`.
+      * @param convert    Line conversion method. Applied after `strip`. By 
default it passes lines as is.
+      * @param filterText Filtering text flag. If `true` it skips empty 
lines and lines with headers (# symbol). Default value is `false`.
+      * @param log Logger.
       */
     def readGzipLines(
         res: String,
         enc: String = "UTF-8",
-        strip: Boolean = false,
+        strip: Boolean = true,
         convert: String => String = s => s,
         filterText: Boolean = false,
         log: Logger = logger
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
index 6a1eb444..aa7cb9d0 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
@@ -42,7 +42,7 @@ class NCDictionaryTokenEnricher(dictRes: String) extends 
NCTokenEnricher with La
 
     init()
 
-    private def init(): Unit = dict = U.readLines(res = dictRes, strip = true, 
filterText = true, log = logger).toSet
+    private def init(): Unit = dict = U.readLines(res = dictRes, filterText = 
true, log = logger).toSet
 
     /** @inheritdoc */
     override def enrich(req: NCRequest, cfg: NCModelConfig, toks: 
List[NCToken]): Unit =
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
index 4ad1f627..c4bc6b46 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
@@ -100,7 +100,7 @@ private object NCEnStopWordsTokenEnricher extends 
LazyLogging:
         "percent"
     )
 
-    private def read(path: String): Set[String] = U.readGzipLines(path, strip 
= true, convert = _.toLowerCase, filterText = true, log = logger).toSet
+    private def read(path: String): Set[String] = U.readGzipLines(path, 
convert = _.toLowerCase, filterText = true, log = logger).toSet
     private def getPos(t: NCToken): String = U.getProperty(t, "pos")
     private def getLemma(t: NCToken): String = U.getProperty(t, "lemma")
     private def isQuote(t: NCToken): Boolean = Q_POS.contains(getPos(t))
@@ -322,7 +322,7 @@ class NCEnStopWordsTokenEnricher(
         percents = PERCENTS.map(getStem)
 
         // Case sensitive.
-        val m = readStopWords(U.readLines(res = "stopwords/stop_words.txt", 
strip = true, filterText = true, log = logger))
+        val m = readStopWords(U.readLines(res = "stopwords/stop_words.txt", 
filterText = true, log = logger))
 
         stopWords = m(false)
         exceptions = m(true)
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala
index 30cf49af..8912e178 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala
@@ -45,8 +45,8 @@ import NCQuotesTokenEnricher.*
   * `false` value indicates otherwise.
   *
   * Supported quotes are: **«**, **»**, **"**, **'**, **&#96;**.
-  * For any invalid situations, like unexpected quotes count or their invalid 
order detection, for all tokens
-  * property `quoted` value assigned as `false`.
+  * For any invalid cases, like invalid quotes order or count,
+  * property `quoted` assigned as `false` for all input tokens.
   */
 //noinspection ScalaWeakerAccess
 class NCQuotesTokenEnricher extends NCTokenEnricher with LazyLogging:
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
index 2aef5bbb..d393a2d4 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
@@ -49,7 +49,7 @@ class NCSwearWordsTokenEnricher(dictRes: String, stemmer: 
NCStemmer) extends NCT
     init()
 
     private def init(): Unit =
-        swearWords = NCUtils.readLines(res = dictRes, strip = true, convert = 
s => stemmer.stem(s.toLowerCase), filterText = true, log = logger).toSet
+        swearWords = NCUtils.readLines(res = dictRes, convert = s => 
stemmer.stem(s.toLowerCase), filterText = true, log = logger).toSet
 
     /** @inheritdoc */
     override def enrich(req: NCRequest, cfg: NCModelConfig, toks: 
List[NCToken]): Unit =

[incubator-nlpcraft] branch NLPCRAFT-520 updated: WIP.

Reply via email to