[incubator-nlpcraft] branch NLPCRAFT-469 updated: WIP.

sergeykamov Fri, 24 Dec 2021 12:29:34 -0800

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-469
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git



The following commit(s) were added to refs/heads/NLPCRAFT-469 by this push:
     new c129873  WIP.
c129873 is described below

commit c1298731e32df94fec1e882fe9dc65d282035450
Author: Sergey Kamov <[email protected]>
AuthorDate: Fri Dec 24 23:29:22 2021 +0300

    WIP.
---
 .../nlp/token/enricher/impl/NCEnBracketsImpl.scala | 24 +++++----
 .../nlp/token/enricher/impl/NCEnQuotesImpl.scala   | 27 +++++-----
 .../token/enricher/impl/NCEnSwearWordsImpl.scala   | 11 +++--
 .../parser/opennlp/impl/NCEnOpenNlpImpl.scala      |  4 +-
 .../parser/opennlp/impl/NCEnStopWordsFinder.scala  | 40 ++++++++++++---
 .../apache/nlpcraft/internal/util/NCUtils.scala    | 57 +++++++++-------------
 .../opennlp/NCEnOpenNlpTokenParserSpec.scala       | 22 ++++++++-
 7 files changed, 113 insertions(+), 72 deletions(-)

diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnBracketsImpl.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnBracketsImpl.scala
index 51609b7..a4b94a8 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnBracketsImpl.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnBracketsImpl.scala
@@ -17,6 +17,7 @@
 
 package org.apache.nlpcraft.internal.nlp.token.enricher.impl
 
+import com.typesafe.scalalogging.LazyLogging
 import org.apache.nlpcraft.*
 
 import java.io.*
@@ -25,23 +26,26 @@ import scala.jdk.CollectionConverters.CollectionHasAsScala
 /**
   *
   */
-class NCEnBracketsImpl extends NCTokenEnricher:
+class NCEnBracketsImpl extends NCTokenEnricher with LazyLogging:
     override def enrich(req: NCRequest, cfg: NCModelConfig, toks: 
java.util.List[NCToken]): Unit =
-        val s = new java.util.Stack[String]()
+        val stack = new java.util.Stack[String]()
+        val map = mutable.HashMap.empty[NCToken, Boolean]
+        var ok = true
 
-        // TODO: text
-        def mkError: NCException = new NCException(s"Invalid brackets in text: 
${req.getOriginalText}")
-        def check(expected: String): Unit = if s.empty() || s.pop() != 
expected then throw mkError
-        def mark(t: NCToken): Unit = t.put("brackets:en", !s.isEmpty)
+        def check(expected: String): Unit = if stack.empty() || stack.pop() != 
expected then ok = false
+        def mark(t: NCToken): Unit = map += t -> !stack.isEmpty
 
-        toks.forEach(t =>
+        for (t <- toks.asScala if ok)
             t.getOriginalText match {
-                case "(" | "{" | "[" => mark(t); s.push(t.getOriginalText)
+                case "(" | "{" | "[" | "<" => mark(t); 
stack.push(t.getOriginalText)
                 case ")" => check("("); mark(t)
                 case "}" => check("{"); mark(t)
                 case "]" => check("["); mark(t)
+                case ">" => check("<"); mark(t)
                 case _ => mark(t)
             }
-        )
 
-        if !s.isEmpty then throw mkError
\ No newline at end of file
+        if ok && stack.isEmpty then
+            map.foreach { (tok, b) => tok.put("brackets:en", b) }
+        else
+            logger.trace(s"Invalid brackets: ${req.getOriginalText}")
\ No newline at end of file
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnQuotesImpl.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnQuotesImpl.scala
index b3d8893..8935864 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnQuotesImpl.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnQuotesImpl.scala
@@ -17,6 +17,7 @@
 
 package org.apache.nlpcraft.internal.nlp.token.enricher.impl
 
+import com.typesafe.scalalogging.LazyLogging
 import org.apache.nlpcraft.*
 
 import scala.jdk.CollectionConverters.*
@@ -29,8 +30,7 @@ import NCEnQuotesImpl.*
 /**
   *
   */
-class NCEnQuotesImpl extends NCTokenEnricher:
-
+class NCEnQuotesImpl extends NCTokenEnricher with LazyLogging:
     /**
       *
       * @param req
@@ -41,15 +41,14 @@ class NCEnQuotesImpl extends NCTokenEnricher:
         val toksSeq = toks.asScala
         val quotes = toksSeq.filter(isQuote)
 
-        // TODO:
-        if quotes.size % 2 != 0 then throw new NCException(s"Invalid quotes in 
text: ${req.getOriginalText}")
-
-        val m = toksSeq.zipWithIndex.toMap
-        val pairs = quotes.zipWithIndex.drop(1).flatMap {
-            case (t, idx) => if idx % 2 != 0 then Some(m(t) -> m(quotes(idx - 
1))) else None
-        }
-
-        // Do not rewrite it with case, it works only such way, with tuples 
(scala 3.1 error?)
-        toksSeq.zipWithIndex.foreach(p =>
-            p._1.put("quoted:en", pairs.exists { case (from, to) => from > 
p._2 && to < p._2 })
-        )
\ No newline at end of file
+        if quotes.size % 2 == 0 then
+            val m = toksSeq.zipWithIndex.toMap
+            val pairs = quotes.zipWithIndex.drop(1).flatMap {
+                (t, idx) => if idx % 2 != 0 then Some(m(t) -> m(quotes(idx - 
1))) else None
+            }
+
+            toksSeq.zipWithIndex.foreach { (tok, idx) =>
+                tok.put("quoted:en", pairs.exists { case (from, to) => from > 
idx && to < idx })
+            }
+        else
+            logger.trace(s"Invalid quotes: ${req.getOriginalText}")
\ No newline at end of file
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnSwearWordsImpl.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnSwearWordsImpl.scala
index 1bf4ba3..cd645f5 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnSwearWordsImpl.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnSwearWordsImpl.scala
@@ -17,10 +17,12 @@
 
 package org.apache.nlpcraft.internal.nlp.token.enricher.impl
 
+import com.typesafe.scalalogging.LazyLogging
 import opennlp.tools.stemmer.PorterStemmer
 import org.apache.nlpcraft.*
 import 
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl.NCEnOpenNlpImpl
 import org.apache.nlpcraft.internal.util.NCUtils
+import org.apache.nlpcraft.internal.util.NCUtils.getStream
 
 import java.io.*
 
@@ -33,24 +35,27 @@ object NCEnSwearWordsImpl:
       * @param mdlFile
       * @return
       */
-    def apply(mdlFile: File): NCEnSwearWordsImpl = new NCEnSwearWordsImpl(new 
BufferedInputStream(new FileInputStream(mdlFile)))
+    def apply(mdlFile: File): NCEnSwearWordsImpl = new NCEnSwearWordsImpl(
+        new BufferedInputStream(new FileInputStream(mdlFile)), mdlFile.getPath
+    )
 
     /**
       *
       * @param mdlSrc
       * @return
       */
-    def apply(mdlSrc: String): NCEnSwearWordsImpl = new 
NCEnSwearWordsImpl(NCUtils.getStream(mdlSrc))
+    def apply(mdlSrc: String): NCEnSwearWordsImpl = new 
NCEnSwearWordsImpl(NCUtils.getStream(mdlSrc), mdlSrc)
 
 /**
   *
   */
-class NCEnSwearWordsImpl(is: InputStream) extends NCTokenEnricher:
+class NCEnSwearWordsImpl(is: InputStream, res: String) extends NCTokenEnricher 
with LazyLogging:
     @volatile private var swearWords: Set[String] = _
 
     override def start(): Unit =
         val stemmer = new PorterStemmer
         swearWords = NCUtils.readTextStream(is, 
"UTF-8").map(stemmer.stem).toSet
+        logger.trace(s"Loaded resource: $res")
 
     override def stop(): Unit = swearWords = null
 
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnOpenNlpImpl.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnOpenNlpImpl.scala
index 91fd44f..38db56a 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnOpenNlpImpl.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnOpenNlpImpl.scala
@@ -155,10 +155,10 @@ class NCEnOpenNlpImpl(
                     lemmatize(suspIdxs.map(i => words(i)).toArray, 
suspIdxs.map(_ => "NNN").toArray).
                     zipWithIndex.
                     flatMap {
-                        case (lemma, i) => if lemma != "0" then 
Some(suspIdxs(i) -> lemma) else None
+                        (lemma, i) => if lemma != "0" then Some(suspIdxs(i) -> 
lemma) else None
                     }.toMap
                 lemmas = lemmas.zipWithIndex.map {
-                    case (lemma, idx) => fixes.getOrElse(idx, lemma)
+                    (lemma, idx) => fixes.getOrElse(idx, lemma)
                 }
 
             val res: Seq[NCToken] = 
holders.zip(posTags).zip(lemmas).toIndexedSeq.map { case ((h, pos), lemma) =>
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
index ecbe1c1..fc8cf6a 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
@@ -119,17 +119,17 @@ private[impl] object NCEnStopWordsFinder:
         includes: Map[String, Set[Wildcard]],
         excludes: Map[String, Set[Wildcard]]
     ):
-        require(!any.exists { case (begin, end) => begin.isEmpty && 
end.isEmpty })
+        require(!any.exists { (begin, end) => begin.isEmpty && end.isEmpty })
 
         // Optimization for full wildcard cases (configurations like * | DT)
         private val inclPoses = filterPoses(includes)
         private val exclPoses = filterPoses(excludes)
 
         private def filterPoses(m: Map[String, Set[Wildcard]]): Set[String] =
-            m.filter { case(_, pair) => pair.exists { case (begin, end) => 
begin.isEmpty && end.isEmpty } }.keySet
+            m.filter { case(_, pair) => pair.exists { (begin, end) => 
begin.isEmpty && end.isEmpty } }.keySet
 
         private def matches(s: String, set: Set[Wildcard]): Boolean =
-            set.exists { case (b, e) => (b.isEmpty || s.startsWith(b)) && 
(e.isEmpty || s.endsWith(e)) }
+            set.exists { (b, e) => (b.isEmpty || s.startsWith(b)) && 
(e.isEmpty || s.endsWith(e)) }
 
         def matches(s: String, posOpt: Option[String]): Boolean =
             if s.contains(' ') then
@@ -544,12 +544,36 @@ private[impl] class NCEnStopWordsFinder(addStems: 
Set[String], exclStems: Set[St
         if quotes.nonEmpty then
             val m = toks.zipWithIndex.toMap
             val pairs = quotes.zipWithIndex.drop(1).flatMap {
-                case (t, idx) => if idx % 2 != 0 then Some(m(t) -> 
m(quotes(idx - 1))) else None
+                (t, idx) => if idx % 2 != 0 then Some(m(t) -> m(quotes(idx - 
1))) else None
             }
-            stops --= stops.filter(t => pairs.exists {
-                case (from, to) =>
-                    val idx = m(t)
-                    from > idx && to < idx
+            stops --= stops.filter(t => pairs.exists { (from, to) =>
+                val idx = m(t)
+                from > idx && to < idx
             })
 
+        // +-------------------------------------------------+
+        // | Pass #9.                                        |
+        // | Unmarks stop words enclosed in brackets.        |
+        // +-------------------------------------------------+
+        val stack = new java.util.Stack[String]()
+        val set = mutable.HashSet.empty[NCToken]
+        var ok = true
+
+        def check(expected: String): Unit = if stack.empty() || stack.pop() != 
expected then ok = false
+        def mark(t: NCToken): Unit = if (!stack.isEmpty) set += t
+
+        for (t <- toks if ok)
+            t.getOriginalText match {
+                case "(" | "{" | "[" | "<" => mark(t); 
stack.push(t.getOriginalText)
+                case ")" => check("("); mark(t)
+                case "}" => check("{"); mark(t)
+                case "]" => check("["); mark(t)
+                case ">" => check("<"); mark(t)
+                case _ => mark(t)
+            }
+
+        // Just ignore invalid brackets.
+        if ok && stack.isEmpty then
+            stops --= stops.intersect(set)
+
         stops.toSeq.sortBy(_.getStartCharIndex)
\ No newline at end of file
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
index 6041741..e79caa1 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
@@ -38,7 +38,7 @@ import scala.util.control.Exception.ignoring
 import scala.io.BufferedSource
 
 /**
-  * TODO: logs for all files reading methods (fix or remove)
+  * 
   */
 object NCUtils extends LazyLogging:
     final val NL = System getProperty "line.separator"
@@ -870,33 +870,18 @@ object NCUtils extends LazyLogging:
       * @param log Logger to use.
       */
     def readResource(res: String, enc: String = "UTF-8", log: Logger = 
logger): List[String] =
-        readStream(getStream(res), enc, log)
+        val list = 
+            try
+                Using.resource(Source.fromInputStream(getStream(res), enc)) { 
src =>
+                    src.getLines().toList
+                }
+            catch
+                case e: IOException => throw new NCException(s"Failed to read 
stream.", e)
+    
+        log.trace(s"Loaded resource: $res")
 
-    /**
-      * Reads lines from given stream.
-      *
-      * @param in Stream to read from.
-      * @param enc Encoding.
-      * @param log Logger to use.
-      */
-    def readStream(in: InputStream, enc: String = "UTF-8", log: Logger = 
logger): List[String] =
-        mapStream(in, enc, log, _.map(p => p).toList)
+        list
 
-    /**
-      * Maps lines from the given stream to an object.
-      *
-      * @param in Stream to read from.
-      * @param enc Encoding.
-      * @param log Logger to use.
-      * @param mapper Function to read lines.
-      */
-    private def mapStream[T](in: InputStream, enc: String, log: Logger = 
logger, mapper: Iterator[String] => T): T =
-        try
-            Using.resource(Source.fromInputStream(in, enc)) { src =>
-                mapper(src.getLines())
-            }
-        catch
-            case e: IOException => throw new NCException(s"Failed to read 
stream.", e)
 
     /**
       *
@@ -915,12 +900,17 @@ object NCUtils extends LazyLogging:
       * @param log Logger to use.
       */
     def readTextGzipResource(res: String, enc: String, log: Logger = logger): 
List[String] =
-        try
-            Using.resource(Source.fromInputStream(new 
GZIPInputStream(getStream(res)), enc)) { src =>
-                readLcTrimFilter(src)
-            }
-        catch
-            case e: IOException => throw new NCException(s"Failed to read 
stream.", e)
+        val list =
+            try
+                Using.resource(Source.fromInputStream(new 
GZIPInputStream(getStream(res)), enc)) { src =>
+                    readLcTrimFilter(src)
+                }
+            catch
+                case e: IOException => throw new NCException(s"Failed to read 
stream.", e)
+
+        log.trace(s"Loaded resource: $res")
+
+        list
 
     /**
       * Reads lines from given stream converting to lower case, trimming, and 
filtering
@@ -928,9 +918,8 @@ object NCUtils extends LazyLogging:
       *
       * @param in Stream to read from.
       * @param enc Encoding.
-      * @param log Logger to use.
       */
-    def readTextStream(in: InputStream, enc: String, log: Logger = logger): 
List[String] =
+    def readTextStream(in: InputStream, enc: String): List[String] =
         try
             Using.resource(Source.fromInputStream(in, enc)) { src =>
                 readLcTrimFilter(src)
diff --git 
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnOpenNlpTokenParserSpec.scala
 
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnOpenNlpTokenParserSpec.scala
index 88c807c..90ab8e6 100644
--- 
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnOpenNlpTokenParserSpec.scala
+++ 
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnOpenNlpTokenParserSpec.scala
@@ -75,4 +75,24 @@ class NCEnOpenNlpTokenParserSpec:
                 require(toks.head.isStopWord);
                 require(toks.last.isStopWord);
                 require(toks.drop(1).reverse.drop(1).forall(!_.isStopWord))
-        )
\ No newline at end of file
+        )
+        test(
+            // First and last are stop words,
+            // Third and fourth are not because they are inside brackets.
+            "A ( A A ) A",
+            toks =>
+                require(toks.length == 6);
+                require(toks.head.isStopWord);
+                require(toks.last.isStopWord);
+                require(toks.drop(1).reverse.drop(1).forall(!_.isStopWord))
+        )
+        test(
+            // Invalid brackets.
+            "A ( A A A",
+            toks => toks.filter(_.getNormalizedText != 
"(").forall(_.isStopWord)
+        )
+        test(
+            // Nested brackets.
+            "< < [ A ] > >",
+            toks => require(!toks.find(_.getNormalizedText == 
"a").get.isStopWord)
+        )

[incubator-nlpcraft] branch NLPCRAFT-469 updated: WIP.

Reply via email to