This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-520 by this push:
new cfb460fd WIP.
cfb460fd is described below
commit cfb460fd8c9243feb5d245bb1e76eb39381de6ef
Author: Sergey Kamov <[email protected]>
AuthorDate: Fri Dec 16 21:35:37 2022 +0400
WIP.
---
.../internal/intent/compiler/NCIDLCompiler.scala | 6 +-
.../apache/nlpcraft/internal/util/NCUtils.scala | 150 +++++++--------------
.../nlp/enrichers/NCDictionaryTokenEnricher.scala | 9 +-
.../nlp/enrichers/NCEnStopWordsTokenEnricher.scala | 17 +--
.../nlp/enrichers/NCSwearWordsTokenEnricher.scala | 4 +-
.../nlpcraft/internal/util/NCUtilsSpec.scala | 36 +++++
6 files changed, 99 insertions(+), 123 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/intent/compiler/NCIDLCompiler.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/intent/compiler/NCIDLCompiler.scala
index 30ca8474..dded76da 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/intent/compiler/NCIDLCompiler.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/intent/compiler/NCIDLCompiler.scala
@@ -322,20 +322,20 @@ class NCIDLCompiler(cfg: NCModelConfig) extends LazyLogging with mutable.Cloneab
// First, try absolute path.
if file.exists() then
- val idl = NCUtils.readFile(file).mkString("\n")
+ val idl = NCUtils.readLines(file).mkString("\n")
imports = compile(idl, x)
// Second, try as a classloader resource.
if imports == null then
val in = cfg.getClass.getClassLoader.getResourceAsStream(x)
if in != null then
- val idl = NCUtils.readStream(in).mkString("\n")
+ val idl = NCUtils.readLines(in).mkString("\n")
imports = compile(idl, x)
// Finally, try as URL resource.
if imports == null then
try
- val idl = NCUtils.readStream(new URL(x).openStream()).mkString("\n")
+ val idl = NCUtils.readLines(new URL(x).openStream()).mkString("\n")
imports = compile(idl, x)
catch case _: Exception => throw newRuntimeError(s"Invalid or unknown import location: $x")(ctx.qstring())
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
index ca2b7635..594fa050 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
@@ -570,31 +570,6 @@ object NCUtils extends LazyLogging:
// Ack.
println(s"File generated: $path")
- /**
- * Reads lines from given file.
- *
- * @param path Zipped file path to read from.
- * @param enc Encoding.
- * @param log Logger to use.
- */
- def readGzipPath(path: String, enc: String = "UTF-8", log: Logger = logger): List[String] =
- readGzipFile(new File(path), enc, log)
-
- /**
- * Reads lines from given file.
- *
- * @param f Zipped file to read from.
- * @param enc Encoding.
- * @param log Logger to use.
- */
- def readGzipFile(f: File, enc: String, log: Logger = logger): List[String] =
- try
- Using.resource(Source.fromInputStream(new GZIPInputStream(new FileInputStream(f)), enc)) { src =>
- getAndLog(src.getLines().map(p => p).toList, f, log)
- }
- catch
- case e: IOException => E(s"Failed to read GZIP file: ${f.getAbsolutePath}", e)
-
/**
* Reads bytes from given file.
*
@@ -652,96 +627,65 @@ object NCUtils extends LazyLogging:
data
/**
- * Reads lines from given file.
+ * Reads lines from given resource.
*
- * @param f File to read from.
- * @param enc Encoding.
- * @param log Logger to use.
- */
- def readFile(f: File, enc: String = "UTF-8", log: Logger = logger): List[String] =
- try
- Using.resource(Source.fromFile(f, enc)) { src =>
- getAndLog(src.getLines().map(p => p).toList, f, log)
- }
- catch case e: IOException => E(s"Failed to read file: ${f.getAbsolutePath}", e)
-
- /**
- * Maps lines from the given stream to an object.
- *
- * @param in Stream to read from.
- * @param enc Encoding.
- * @param log Logger to use.
- * @param mapper Function to read lines.
- */
- def mapStream[T](in: InputStream, enc: String, log: Logger = logger, mapper: Iterator[String] => T): T =
- try Using.resource(Source.fromInputStream(in, enc)) { src => mapper(src.getLines()) }
- catch case e: IOException => E(s"Failed to read stream.", e)
-
- /**
- * Reads lines from given stream.
- *
- * @param in Stream to read from.
- * @param enc Encoding.
- * @param log Logger to use.
- */
- def readStream(in: InputStream, enc: String = "UTF-8", log: Logger = logger): List[String] =
- mapStream(in, enc, log, _.map(p => p).toList)
-
- /**
- * Reads lines from given resource.
- *
- * @param res Resource path to read from.
- * @param enc Encoding.
- * @param log Logger to use.
+ * @param res
+ * @param enc
+ * @param strip
+ * @param convert
+ * @param filterText
+ * @param log
+ * @return
*/
- def readResource(res: String, enc: String = "UTF-8", log: Logger = logger): List[String] =
- val list =
- try Using.resource(Source.fromInputStream(getStream(res), enc))(_.getLines().toSeq).toList
+ def readLines(
+ res: String | File | InputStream,
+ enc: String = "UTF-8",
+ strip: Boolean = false,
+ convert: String => String = s => s,
+ filterText: Boolean = false,
+ log: Logger = logger
+ ): Iterator[String] =
+ def process(is: InputStream, name: String) =
+ try
+ val out = Source.fromInputStream(is, enc).getLines().flatMap(p =>
+ var x = if strip then p.strip else p
+ x = convert(x)
+ Option.when(!filterText || x.nonEmpty && x.head != '#')(x)
+ )
+ log.info(s"Loaded resource: $name")
+ out
catch case e: IOException => E(s"Failed to read stream: $res", e)
-
- log.trace(s"Loaded resource: $res")
- list
+ res match
+ case is: InputStream => process(is, is.getClass.getName)
+ case s: String => process(new BufferedInputStream(getStream(s)), s)
+ case f: File => process(new BufferedInputStream(new FileInputStream(f)), f.getAbsolutePath)
/**
*
- * @param in
- */
- private def readLcTrimFilter(in: BufferedSource): List[String] =
- in.getLines().map(_.toLowerCase.strip).filter(s => s.nonEmpty && s.head != '#').toList
-
- /**
- * Reads lines from given stream converting to lower case, trimming, and filtering
- * out empty lines and comments (starting with '#').
- *
- * @param res Zipped resource to read from.
- * @param enc Encoding.
- * @param log Logger to use.
+ * @param res
+ * @param enc
+ * @param strip
+ * @param convert
+ * @param filterText
+ * @param log
+ * @return
*/
- def readTextGzipResource(res: String, enc: String, log: Logger = logger): List[String] =
- val list =
- try Using.resource(Source.fromInputStream(new GZIPInputStream(getStream(res)), enc))(readLcTrimFilter)
- catch case e: IOException => E(s"Failed to read stream: $res", e)
-
- log.trace(s"Loaded resource: $res")
-
- list
+ def readGzipLines(
+ res: String,
+ enc: String = "UTF-8",
+ strip: Boolean = false,
+ convert: String => String = s => s,
+ filterText: Boolean = false,
+ log: Logger = logger
+ ): Iterator[String] = readLines(new GZIPInputStream(getStream(res)), enc, strip, convert, filterText, log)
/**
- * Reads lines from given stream converting to lower case, trimming, and filtering
- * out empty lines and comments (starting with '#').
*
- * @param in Stream to read from.
- * @param enc Encoding.
+ * @param s
+ * @return
*/
- def readTextStream(in: InputStream, enc: String): List[String] =
- try
- Using.resource(Source.fromInputStream(in, enc)) { src =>
- readLcTrimFilter(src)
- }
- catch
- case e: IOException => E(s"Failed to read stream.", e)
-
+ def hasMeaning(s: String): Boolean = s.nonEmpty && s.head != '#'
/**
*
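Note: the hunk above replaces the separate readFile/readStream/readResource/readGzip* helpers with a single readLines entry point that dispatches on String | File | InputStream (a String is resolved as a classloader resource, File and InputStream are read directly). A minimal usage sketch follows; it is illustrative only, not part of this commit, and the file name dict.txt is hypothetical.

    import org.apache.nlpcraft.internal.util.NCUtils

    // Classpath resource: strip each line, drop blank lines and '#' comments.
    val stops: Set[String] =
        NCUtils.readLines(res = "stopwords/stop_words.txt", strip = true, filterText = true).toSet

    // Same call shape for a java.io.File, with an optional per-line conversion.
    val lemmas: Iterator[String] =
        NCUtils.readLines(new java.io.File("dict.txt"), convert = _.toLowerCase)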
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
index 413e296d..6a1eb444 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
@@ -17,8 +17,9 @@
package org.apache.nlpcraft.nlp.enrichers
+import com.typesafe.scalalogging.LazyLogging
import org.apache.nlpcraft.*
-import org.apache.nlpcraft.internal.util.NCUtils
+import org.apache.nlpcraft.internal.util.NCUtils as U
/**
* "Known-word" [[NCTokenEnricher token enricher]].
@@ -36,13 +37,13 @@ import org.apache.nlpcraft.internal.util.NCUtils
* plain text format with *one lemma per line* with no empty lines, header or other comments allowed.
*/
//noinspection DuplicatedCode,ScalaWeakerAccess
-class NCDictionaryTokenEnricher(dictRes: String) extends NCTokenEnricher:
+class NCDictionaryTokenEnricher(dictRes: String) extends NCTokenEnricher with LazyLogging:
private var dict: Set[String] = _
init()
- private def init(): Unit = dict = NCUtils.readResource(dictRes).toSet
+ private def init(): Unit = dict = U.readLines(res = dictRes, strip = true, filterText = true, log = logger).toSet
/** @inheritdoc */
override def enrich(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): Unit =
- toks.foreach(t => t.put("dict", dict.contains(NCUtils.getProperty(t, "lemma"))))
+ toks.foreach(t => t.put("dict", dict.contains(U.getProperty(t, "lemma"))))
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
index 0b66b52b..4ad1f627 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
@@ -19,8 +19,8 @@ package org.apache.nlpcraft.nlp.enrichers
import com.typesafe.scalalogging.LazyLogging
import org.apache.nlpcraft.*
-import org.apache.nlpcraft.internal.util.NCUtils
-import org.apache.nlpcraft.nlp.stemmer.{NCEnStemmer, NCStemmer}
+import org.apache.nlpcraft.internal.util.NCUtils as U
+import org.apache.nlpcraft.nlp.stemmer.*
import java.io.*
import java.util
@@ -100,9 +100,9 @@ private object NCEnStopWordsTokenEnricher extends LazyLogging:
"percent"
)
- private def read(path: String): Set[String] = NCUtils.readTextGzipResource(path, "UTF-8", logger).toSet
- private def getPos(t: NCToken): String = NCUtils.getProperty(t, "pos")
- private def getLemma(t: NCToken): String = NCUtils.getProperty(t, "lemma")
+ private def read(path: String): Set[String] = U.readGzipLines(path, strip = true, convert = _.toLowerCase, filterText = true, log = logger).toSet
+ private def getPos(t: NCToken): String = U.getProperty(t, "pos")
+ private def getLemma(t: NCToken): String = U.getProperty(t, "lemma")
private def isQuote(t: NCToken): Boolean = Q_POS.contains(getPos(t))
private def toLemmaKey(toks: Seq[NCToken]): String = toks.map(getLemma).mkString(" ")
private def toOriginalKey(toks: Seq[NCToken]): String = toks.map(_.getText).mkString(" ")
@@ -322,10 +322,7 @@ class NCEnStopWordsTokenEnricher(
percents = PERCENTS.map(getStem)
// Case sensitive.
- val m = readStopWords(
- NCUtils.readResource("stopwords/stop_words.txt", "UTF-8", logger)
- .map(_.strip).filter(s => s.nonEmpty && !s.startsWith("#"))
- )
+ val m = readStopWords(U.readLines(res = "stopwords/stop_words.txt", strip = true, filterText = true, log = logger))
stopWords = m(false)
exceptions = m(true)
@@ -336,7 +333,7 @@ class NCEnStopWordsTokenEnricher(
* @param lines Configuration file content.
* @return Holder and is-exception flag.
*/
- private def readStopWords(lines: Seq[String]): Map[Boolean, StopWordHolder] =
+ private def readStopWords(lines: Iterator[String]): Map[Boolean, StopWordHolder] =
// 1. Prepares accumulation data structure.
enum WordForm:
case STEM, LEM, ORIG
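Note: readStopWords now consumes an Iterator[String] rather than a Seq[String]. The iterator produced by readLines is lazy and single-pass, so a caller that needs more than one traversal should materialize it first. A short sketch, illustrative only and assuming the "NCUtils as U" import used in this file:

    val lines = U.readLines(res = "stopwords/stop_words.txt", strip = true, filterText = true)
    val cached = lines.toSeq   // materialize once; the raw iterator can only be walked a single time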
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
index 54efbf50..2aef5bbb 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
@@ -49,9 +49,7 @@ class NCSwearWordsTokenEnricher(dictRes: String, stemmer: NCStemmer) extends NCT
init()
private def init(): Unit =
- swearWords = NCUtils.readTextStream(NCUtils.getStream(dictRes), "UTF-8").
- map(p => stemmer.stem(p.toLowerCase)).toSet
- logger.trace(s"Loaded resource: $dictRes")
+ swearWords = NCUtils.readLines(res = dictRes, strip = true, convert = s => stemmer.stem(s.toLowerCase), filterText = true, log = logger).toSet
/** @inheritdoc */
override def enrich(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): Unit =
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/internal/util/NCUtilsSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/internal/util/NCUtilsSpec.scala
new file mode 100644
index 00000000..eeb70057
--- /dev/null
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/internal/util/NCUtilsSpec.scala
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.internal.util
+
+import org.scalatest.funsuite.AnyFunSuite
+
+import java.io.FileInputStream
+
+/**
+ *
+ */
+class NCUtilsSpec extends AnyFunSuite:
+ test("test") {
+ val res = "moby/354984si.ngl"
+ val file = NCResourceReader.get("badfilter/swear_words.txt")
+ val is = new FileInputStream(file)
+
+ require(NCUtils.readLines(res).toList.nonEmpty)
+ require(NCUtils.readLines(file).toList.nonEmpty)
+ require(NCUtils.readLines(is).toList.nonEmpty)
+ }
\ No newline at end of file
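The new spec only checks that readLines yields non-empty output for a resource path, a File and an InputStream. A check of the strip/convert/filterText knobs could be added along these lines; this is a hypothetical addition, not part of the commit, and it reuses the badfilter/swear_words.txt resource and the hasMeaning helper introduced above.

    test("readLines options") {
        val file = NCResourceReader.get("badfilter/swear_words.txt")
        // strip + filterText drops blank lines and '#' comments; convert is applied per line.
        val lines = NCUtils.readLines(file, strip = true, convert = _.toLowerCase, filterText = true).toList
        require(lines.nonEmpty)
        require(lines.forall(NCUtils.hasMeaning))
    }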