This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-520 by this push:
new d6e42140 WIP.
d6e42140 is described below
commit d6e42140ad5102b1766d3d53823fa7f09baa15e8
Author: Sergey Kamov <[email protected]>
AuthorDate: Sat Dec 17 10:08:39 2022 +0400
WIP.
---
.../apache/nlpcraft/internal/util/NCUtils.scala | 30 ++++++++++------------
.../nlp/enrichers/NCDictionaryTokenEnricher.scala | 2 +-
.../nlp/enrichers/NCEnStopWordsTokenEnricher.scala | 4 +--
.../nlp/enrichers/NCQuotesTokenEnricher.scala | 4 +--
.../nlp/enrichers/NCSwearWordsTokenEnricher.scala | 2 +-
5 files changed, 20 insertions(+), 22 deletions(-)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
index 6c0160a6..8804faac 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
@@ -455,18 +455,17 @@ object NCUtils extends LazyLogging:
/**
* Reads lines from given resource.
*
- * @param res
- * @param enc
- * @param strip
- * @param convert
- * @param filterText
- * @param log
- * @return
+ * @param res Resource, file absolute or relative path or input stream.
+ * @param enc Encoding. Default value is "UTF-8".
+ * @param strip Strip flag. If `true` it strips all read lines. Default
value is `true`.
+ * @param convert Line conversion method. Applied after `strip`. By
default it passes lines as is.
+ * @param filterText Filtering text flag. If `true` it skips empty lines
and lines with headers (# symbol). Default value is `false`.
+ * @param log Logger.
*/
def readLines(
res: String | File | InputStream,
enc: String = "UTF-8",
- strip: Boolean = false,
+ strip: Boolean = true,
convert: String => String = s => s,
filterText: Boolean = false,
log: Logger = logger
@@ -487,19 +486,18 @@ object NCUtils extends LazyLogging:
catch case e: IOException => E(s"Failed to read stream: $res", e)
/**
- *
+ * @param res Gzip resource, file absolute or relative path.
* @param res
- * @param enc
- * @param strip
- * @param convert
- * @param filterText
- * @param log
- * @return
+ * @param enc Encoding. Default value is "UTF-8".
+ * @param strip Strip flag. If `true` it strips all read lines.
Default value is `true`.
+ * @param convert Line conversion method. Applied after `strip`. By
default it passes lines as is.
+ * @param filterText Filtering text flag. If `true` it skips empty
lines and lines with headers (# symbol). Default value is `false`.
+ * @param log Logger.
*/
def readGzipLines(
res: String,
enc: String = "UTF-8",
- strip: Boolean = false,
+ strip: Boolean = true,
convert: String => String = s => s,
filterText: Boolean = false,
log: Logger = logger
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
index 6a1eb444..aa7cb9d0 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
@@ -42,7 +42,7 @@ class NCDictionaryTokenEnricher(dictRes: String) extends
NCTokenEnricher with La
init()
- private def init(): Unit = dict = U.readLines(res = dictRes, strip = true,
filterText = true, log = logger).toSet
+ private def init(): Unit = dict = U.readLines(res = dictRes, filterText =
true, log = logger).toSet
/** @inheritdoc */
override def enrich(req: NCRequest, cfg: NCModelConfig, toks:
List[NCToken]): Unit =
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
index 4ad1f627..c4bc6b46 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
@@ -100,7 +100,7 @@ private object NCEnStopWordsTokenEnricher extends
LazyLogging:
"percent"
)
- private def read(path: String): Set[String] = U.readGzipLines(path, strip
= true, convert = _.toLowerCase, filterText = true, log = logger).toSet
+ private def read(path: String): Set[String] = U.readGzipLines(path,
convert = _.toLowerCase, filterText = true, log = logger).toSet
private def getPos(t: NCToken): String = U.getProperty(t, "pos")
private def getLemma(t: NCToken): String = U.getProperty(t, "lemma")
private def isQuote(t: NCToken): Boolean = Q_POS.contains(getPos(t))
@@ -322,7 +322,7 @@ class NCEnStopWordsTokenEnricher(
percents = PERCENTS.map(getStem)
// Case sensitive.
- val m = readStopWords(U.readLines(res = "stopwords/stop_words.txt",
strip = true, filterText = true, log = logger))
+ val m = readStopWords(U.readLines(res = "stopwords/stop_words.txt",
filterText = true, log = logger))
stopWords = m(false)
exceptions = m(true)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala
index 30cf49af..8912e178 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala
@@ -45,8 +45,8 @@ import NCQuotesTokenEnricher.*
* `false` value indicates otherwise.
*
* Supported quotes are: **«**, **»**, **"**, **'**, **`**.
- * For any invalid situations, like unexpected quotes count or their invalid
order detection, for all tokens
- * property `quoted` value assigned as `false`.
+ * In any invalid case, such as an invalid quote order or count,
+ * the `quoted` property is set to `false` for all input tokens.
*/
//noinspection ScalaWeakerAccess
class NCQuotesTokenEnricher extends NCTokenEnricher with LazyLogging:
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
index 2aef5bbb..d393a2d4 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
@@ -49,7 +49,7 @@ class NCSwearWordsTokenEnricher(dictRes: String, stemmer:
NCStemmer) extends NCT
init()
private def init(): Unit =
- swearWords = NCUtils.readLines(res = dictRes, strip = true, convert =
s => stemmer.stem(s.toLowerCase), filterText = true, log = logger).toSet
+ swearWords = NCUtils.readLines(res = dictRes, convert = s =>
stemmer.stem(s.toLowerCase), filterText = true, log = logger).toSet
/** @inheritdoc */
override def enrich(req: NCRequest, cfg: NCModelConfig, toks:
List[NCToken]): Unit =