This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-520 by this push:
new cfb460fd WIP.
cfb460fd is described below
commit cfb460fd8c9243feb5d245bb1e76eb39381de6ef
Author: Sergey Kamov <[email protected]>
AuthorDate: Fri Dec 16 21:35:37 2022 +0400
WIP.
---
.../internal/intent/compiler/NCIDLCompiler.scala | 6 +-
.../apache/nlpcraft/internal/util/NCUtils.scala | 150 +++++++--------------
.../nlp/enrichers/NCDictionaryTokenEnricher.scala | 9 +-
.../nlp/enrichers/NCEnStopWordsTokenEnricher.scala | 17 +--
.../nlp/enrichers/NCSwearWordsTokenEnricher.scala | 4 +-
.../nlpcraft/internal/util/NCUtilsSpec.scala | 36 +++++
6 files changed, 99 insertions(+), 123 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/intent/compiler/NCIDLCompiler.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/intent/compiler/NCIDLCompiler.scala
index 30ca8474..dded76da 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/intent/compiler/NCIDLCompiler.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/intent/compiler/NCIDLCompiler.scala
@@ -322,20 +322,20 @@ class NCIDLCompiler(cfg: NCModelConfig) extends LazyLogging with mutable.Cloneab
// First, try absolute path.
if file.exists() then
- val idl = NCUtils.readFile(file).mkString("\n")
+ val idl = NCUtils.readLines(file).mkString("\n")
imports = compile(idl, x)
// Second, try as a classloader resource.
if imports == null then
val in = cfg.getClass.getClassLoader.getResourceAsStream(x)
if in != null then
- val idl = NCUtils.readStream(in).mkString("\n")
+ val idl = NCUtils.readLines(in).mkString("\n")
imports = compile(idl, x)
// Finally, try as URL resource.
if imports == null then
try
- val idl = NCUtils.readStream(new URL(x).openStream()).mkString("\n")
+ val idl = NCUtils.readLines(new URL(x).openStream()).mkString("\n")
imports = compile(idl, x)
catch case _: Exception => throw newRuntimeError(s"Invalid or unknown import location: $x")(ctx.qstring())
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
index ca2b7635..594fa050 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
@@ -570,31 +570,6 @@ object NCUtils extends LazyLogging:
// Ack.
println(s"File generated: $path")
- /**
- * Reads lines from given file.
- *
- * @param path Zipped file path to read from.
- * @param enc Encoding.
- * @param log Logger to use.
- */
- def readGzipPath(path: String, enc: String = "UTF-8", log: Logger = logger): List[String] =
- readGzipFile(new File(path), enc, log)
-
- /**
- * Reads lines from given file.
- *
- * @param f Zipped file to read from.
- * @param enc Encoding.
- * @param log Logger to use.
- */
- def readGzipFile(f: File, enc: String, log: Logger = logger): List[String] =
- try
- Using.resource(Source.fromInputStream(new GZIPInputStream(new FileInputStream(f)), enc)) { src =>
- getAndLog(src.getLines().map(p => p).toList, f, log)
- }
- catch
- case e: IOException => E(s"Failed to read GZIP file: ${f.getAbsolutePath}", e)
-
/**
* Reads bytes from given file.
*
@@ -652,96 +627,65 @@ object NCUtils extends LazyLogging:
data
/**
- * Reads lines from given file.
+ * Reads lines from given resource.
*
- * @param f File to read from.
- * @param enc Encoding.
- * @param log Logger to use.
- */
- def readFile(f: File, enc: String = "UTF-8", log: Logger = logger): List[String] =
- try
- Using.resource(Source.fromFile(f, enc)) { src =>
- getAndLog(src.getLines().map(p => p).toList, f, log)
- }
- catch case e: IOException => E(s"Failed to read file: ${f.getAbsolutePath}", e)
-
- /**
- * Maps lines from the given stream to an object.
- *
- * @param in Stream to read from.
- * @param enc Encoding.
- * @param log Logger to use.
- * @param mapper Function to read lines.
- */
- def mapStream[T](in: InputStream, enc: String, log: Logger = logger, mapper: Iterator[String] => T): T =
- try Using.resource(Source.fromInputStream(in, enc)) { src => mapper(src.getLines()) }
- catch case e: IOException => E(s"Failed to read stream.", e)
-
- /**
- * Reads lines from given stream.
- *
- * @param in Stream to read from.
- * @param enc Encoding.
- * @param log Logger to use.
- */
- def readStream(in: InputStream, enc: String = "UTF-8", log: Logger = logger): List[String] =
- mapStream(in, enc, log, _.map(p => p).toList)
-
- /**
- * Reads lines from given resource.
- *
- * @param res Resource path to read from.
- * @param enc Encoding.
- * @param log Logger to use.
+ * @param res
+ * @param enc
+ * @param strip
+ * @param convert
+ * @param filterText
+ * @param log
+ * @return
*/
- def readResource(res: String, enc: String = "UTF-8", log: Logger = logger): List[String] =
- val list =
- try Using.resource(Source.fromInputStream(getStream(res), enc))(_.getLines().toSeq).toList
+ def readLines(
+ res: String | File | InputStream,
+ enc: String = "UTF-8",
+ strip: Boolean = false,
+ convert: String => String = s => s,
+ filterText: Boolean = false,
+ log: Logger = logger
+ ): Iterator[String] =
+ def process(is: InputStream, name: String) =
+ try
+ val out = Source.fromInputStream(is, enc).getLines().flatMap(p =>
+ var x = if strip then p.strip else p
+ x = convert(x)
+ Option.when(!filterText || x.nonEmpty && x.head != '#')(x)
+ )
+ log.info(s"Loaded resource: $name")
+ out
catch case e: IOException => E(s"Failed to read stream: $res", e)
-
- log.trace(s"Loaded resource: $res")
- list
+ res match
+ case is: InputStream => process(is, is.getClass.getName)
+ case s: String => process(new BufferedInputStream(getStream(s)), s)
+ case f: File => process(new BufferedInputStream(new FileInputStream(f)), f.getAbsolutePath)
/**
*
- * @param in
- */
- private def readLcTrimFilter(in: BufferedSource): List[String] =
- in.getLines().map(_.toLowerCase.strip).filter(s => s.nonEmpty && s.head != '#').toList
-
- /**
- * Reads lines from given stream converting to lower case, trimming, and filtering
- * out empty lines and comments (starting with '#').
- *
- * @param res Zipped resource to read from.
- * @param enc Encoding.
- * @param log Logger to use.
+ * @param res
+ * @param enc
+ * @param strip
+ * @param convert
+ * @param filterText
+ * @param log
+ * @return
*/
- def readTextGzipResource(res: String, enc: String, log: Logger = logger): List[String] =
- val list =
- try Using.resource(Source.fromInputStream(new GZIPInputStream(getStream(res)), enc))(readLcTrimFilter)
- catch case e: IOException => E(s"Failed to read stream: $res", e)
-
- log.trace(s"Loaded resource: $res")
-
- list
+ def readGzipLines(
+ res: String,
+ enc: String = "UTF-8",
+ strip: Boolean = false,
+ convert: String => String = s => s,
+ filterText: Boolean = false,
+ log: Logger = logger
+ ): Iterator[String] = readLines(new GZIPInputStream(getStream(res)), enc, strip, convert, filterText, log)
/**
- * Reads lines from given stream converting to lower case, trimming, and filtering
- * out empty lines and comments (starting with '#').
*
- * @param in Stream to read from.
- * @param enc Encoding.
+ * @param s
+ * @return
*/
- def readTextStream(in: InputStream, enc: String): List[String] =
- try
- Using.resource(Source.fromInputStream(in, enc)) { src =>
- readLcTrimFilter(src)
- }
- catch
- case e: IOException => E(s"Failed to read stream.", e)
-
+ def hasMeaning(s: String): Boolean = s.nonEmpty && s.head != '#'
/**
*
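Note: the hunk above replaces the separate readFile/readStream/readResource/readGzip* helpers with a single readLines entry point that dispatches on String | File | InputStream (a String is resolved as a classloader resource, File and InputStream are read directly). A minimal usage sketch follows; it is illustrative only, not part of this commit, and the file name dict.txt is hypothetical.

    import org.apache.nlpcraft.internal.util.NCUtils

    // Classpath resource: strip each line, drop blank lines and '#' comments.
    val stops: Set[String] =
        NCUtils.readLines(res = "stopwords/stop_words.txt", strip = true, filterText = true).toSet

    // Same call shape for a java.io.File, with an optional per-line conversion.
    val lemmas: Iterator[String] =
        NCUtils.readLines(new java.io.File("dict.txt"), convert = _.toLowerCase)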
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
index 413e296d..6a1eb444 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
@@ -17,8 +17,9 @@
package org.apache.nlpcraft.nlp.enrichers
+import com.typesafe.scalalogging.LazyLogging
import org.apache.nlpcraft.*
-import org.apache.nlpcraft.internal.util.NCUtils
+import org.apache.nlpcraft.internal.util.NCUtils as U
/**
* "Known-word" [[NCTokenEnricher token enricher]].
@@ -36,13 +37,13 @@ import org.apache.nlpcraft.internal.util.NCUtils
* plain text format with *one lemma per line* with no empty lines, header or other comments allowed.
*/
//noinspection DuplicatedCode,ScalaWeakerAccess
-class NCDictionaryTokenEnricher(dictRes: String) extends NCTokenEnricher:
+class NCDictionaryTokenEnricher(dictRes: String) extends NCTokenEnricher with LazyLogging:
private var dict: Set[String] = _
init()
- private def init(): Unit = dict = NCUtils.readResource(dictRes).toSet
+ private def init(): Unit = dict = U.readLines(res = dictRes, strip = true, filterText = true, log = logger).toSet
/** @inheritdoc */
override def enrich(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): Unit =
- toks.foreach(t => t.put("dict", dict.contains(NCUtils.getProperty(t, "lemma"))))
+ toks.foreach(t => t.put("dict", dict.contains(U.getProperty(t, "lemma"))))
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
index 0b66b52b..4ad1f627 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
@@ -19,8 +19,8 @@ package org.apache.nlpcraft.nlp.enrichers
import com.typesafe.scalalogging.LazyLogging
import org.apache.nlpcraft.*
-import org.apache.nlpcraft.internal.util.NCUtils
-import org.apache.nlpcraft.nlp.stemmer.{NCEnStemmer, NCStemmer}
+import org.apache.nlpcraft.internal.util.NCUtils as U
+import org.apache.nlpcraft.nlp.stemmer.*
import java.io.*
import java.util
@@ -100,9 +100,9 @@ private object NCEnStopWordsTokenEnricher extends LazyLogging:
"percent"
)
- private def read(path: String): Set[String] = NCUtils.readTextGzipResource(path, "UTF-8", logger).toSet
- private def getPos(t: NCToken): String = NCUtils.getProperty(t, "pos")
- private def getLemma(t: NCToken): String = NCUtils.getProperty(t, "lemma")
+ private def read(path: String): Set[String] = U.readGzipLines(path, strip = true, convert = _.toLowerCase, filterText = true, log = logger).toSet
+ private def getPos(t: NCToken): String = U.getProperty(t, "pos")
+ private def getLemma(t: NCToken): String = U.getProperty(t, "lemma")
private def isQuote(t: NCToken): Boolean = Q_POS.contains(getPos(t))
private def toLemmaKey(toks: Seq[NCToken]): String = toks.map(getLemma).mkString(" ")
private def toOriginalKey(toks: Seq[NCToken]): String = toks.map(_.getText).mkString(" ")
@@ -322,10 +322,7 @@ class NCEnStopWordsTokenEnricher(
percents = PERCENTS.map(getStem)
// Case sensitive.
- val m = readStopWords(
- NCUtils.readResource("stopwords/stop_words.txt", "UTF-8", logger)
- .map(_.strip).filter(s => s.nonEmpty && !s.startsWith("#"))
- )
+ val m = readStopWords(U.readLines(res = "stopwords/stop_words.txt", strip = true, filterText = true, log = logger))
stopWords = m(false)
exceptions = m(true)
@@ -336,7 +333,7 @@ class NCEnStopWordsTokenEnricher(
* @param lines Configuration file content.
* @return Holder and is-exception flag.
*/
- private def readStopWords(lines: Seq[String]): Map[Boolean, StopWordHolder] =
+ private def readStopWords(lines: Iterator[String]): Map[Boolean, StopWordHolder] =
// 1. Prepares accumulation data structure.
enum WordForm:
case STEM, LEM, ORIG
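Note: readStopWords now consumes an Iterator[String] rather than a Seq[String]. The iterator produced by readLines is lazy and single-pass, so a caller that needs more than one traversal should materialize it first. A short sketch, illustrative only and assuming the "NCUtils as U" import used in this file:

    val lines = U.readLines(res = "stopwords/stop_words.txt", strip = true, filterText = true)
    val cached = lines.toSeq   // materialize once; the raw iterator can only be walked a single time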
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
index 54efbf50..2aef5bbb 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
@@ -49,9 +49,7 @@ class NCSwearWordsTokenEnricher(dictRes: String, stemmer: NCStemmer) extends NCT
init()
private def init(): Unit =
- swearWords = NCUtils.readTextStream(NCUtils.getStream(dictRes), "UTF-8").
- map(p => stemmer.stem(p.toLowerCase)).toSet
- logger.trace(s"Loaded resource: $dictRes")
+ swearWords = NCUtils.readLines(res = dictRes, strip = true, convert = s => stemmer.stem(s.toLowerCase), filterText = true, log = logger).toSet
/** @inheritdoc */
override def enrich(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): Unit =
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/internal/util/NCUtilsSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/internal/util/NCUtilsSpec.scala
new file mode 100644
index 00000000..eeb70057
--- /dev/null
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/internal/util/NCUtilsSpec.scala
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.internal.util
+
+import org.scalatest.funsuite.AnyFunSuite
+
+import java.io.FileInputStream
+
+/**
+ *
+ */
+class NCUtilsSpec extends AnyFunSuite:
+ test("test") {
+ val res = "moby/354984si.ngl"
+ val file = NCResourceReader.get("badfilter/swear_words.txt")
+ val is = new FileInputStream(file)
+
+ require(NCUtils.readLines(res).toList.nonEmpty)
+ require(NCUtils.readLines(file).toList.nonEmpty)
+ require(NCUtils.readLines(is).toList.nonEmpty)
+ }
\ No newline at end of file
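The new spec only checks that readLines yields non-empty output for a resource path, a File and an InputStream. A check of the strip/convert/filterText knobs could be added along these lines; this is a hypothetical addition, not part of the commit, and it reuses the badfilter/swear_words.txt resource and the hasMeaning helper introduced above.

    test("readLines options") {
        val file = NCResourceReader.get("badfilter/swear_words.txt")
        // strip + filterText drops blank lines and '#' comments; convert is applied per line.
        val lines = NCUtils.readLines(file, strip = true, convert = _.toLowerCase, filterText = true).toList
        require(lines.nonEmpty)
        require(lines.forall(NCUtils.hasMeaning))
    }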