[incubator-nlpcraft] branch NLPCRAFT-469 updated: WIP/CR.

aradzinski Sat, 25 Dec 2021 11:21:47 -0800

This is an automated email from the ASF dual-hosted git repository.

aradzinski pushed a commit to branch NLPCRAFT-469
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git



The following commit(s) were added to refs/heads/NLPCRAFT-469 by this push:
     new 070bd55  WIP/CR.
070bd55 is described below

commit 070bd5557a8351fe31d8dde068cac996e0c5bf7b
Author: Aaron Radzinski <[email protected]>
AuthorDate: Sat Dec 25 11:21:35 2021 -0800

    WIP/CR.
---
 .../nlp/token/enricher/NCEnBracketsTokenEnricher.java     |  5 +----
 .../nlp/token/enricher/NCEnDictionaryTokenEnricher.java   |  2 +-
 ...sTokenEnricher.java => NCEnLanguageTokenEnricher.java} | 15 +++++----------
 .../nlp/token/enricher/NCEnSwearWordsTokenEnricher.java   |  5 +++--
 .../nlp/token/enricher/impl/NCEnBracketsImpl.scala        |  7 +++----
 .../nlp/token/enricher/impl/NCEnDictionaryImpl.scala      |  1 -
 .../{NCEnWordsImpl.scala => NCEnLanguageWordsImpl.scala}  |  4 ++--
 .../internal/nlp/token/enricher/impl/NCEnQuotesImpl.scala |  5 ++---
 .../nlp/token/enricher/impl/NCEnSwearWordsImpl.scala      |  6 +++---
 .../nlp/token/parser/opennlp/NCEnOpenNlpTokenParser.java  |  1 +
 .../token/parser/opennlp/impl/NCEnStopWordsFinder.scala   |  5 +----
 .../token/enricher/NCEnBracketsTokenEnricherSpec.scala    |  9 +++++----
 .../nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala  |  9 +++++----
 .../nlp/token/enricher/NCEnWordsTokenEnricherSpec.scala   | 14 +++++++-------
 14 files changed, 39 insertions(+), 49 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricher.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricher.java
index ec5310a..43920a6 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricher.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricher.java
@@ -17,10 +17,7 @@
 
 package org.apache.nlpcraft.internal.nlp.token.enricher;
 
-import org.apache.nlpcraft.NCModelConfig;
-import org.apache.nlpcraft.NCRequest;
-import org.apache.nlpcraft.NCToken;
-import org.apache.nlpcraft.NCTokenEnricher;
+import org.apache.nlpcraft.*;
 import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCEnBracketsImpl;
 
 import java.util.List;
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricher.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricher.java
index 3f9d5c4..a845eeb 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricher.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricher.java
@@ -22,7 +22,7 @@ import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCEnDictionaryImpl;
 import java.util.List;
 
 /**
- * TODO: enriches with <code>swear:en</code> property. Download swear_words.txt for use.
+ * TODO: enriches with <code>dict:en</code> property. 
  */
 public class NCEnDictionaryTokenEnricher implements NCTokenEnricher {
     private final NCEnDictionaryImpl impl = new NCEnDictionaryImpl();
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnWordsTokenEnricher.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnLanguageTokenEnricher.java
similarity index 76%
rename from nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnWordsTokenEnricher.java
rename to nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnLanguageTokenEnricher.java
index 3e87910..687f9a1 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnWordsTokenEnricher.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnLanguageTokenEnricher.java
@@ -17,21 +17,16 @@
 
 package org.apache.nlpcraft.internal.nlp.token.enricher;
 
-import org.apache.nlpcraft.NCModelConfig;
-import org.apache.nlpcraft.NCRequest;
-import org.apache.nlpcraft.NCToken;
-import org.apache.nlpcraft.NCTokenEnricher;
-import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCEnWordsImpl;
+import org.apache.nlpcraft.*;
+import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCEnLanguageWordsImpl;
 
-import java.io.File;
 import java.util.List;
-import java.util.Objects;
 
 /**
- * TODO: enriches with <code>word:en</code> property.
+ * TODO: enriches with <code>lang:en</code> property.
  */
-public class NCEnWordsTokenEnricher implements NCTokenEnricher {
-    private final NCEnWordsImpl impl = new NCEnWordsImpl();
+public class NCEnLanguageTokenEnricher implements NCTokenEnricher {
+    private final NCEnLanguageWordsImpl impl = new NCEnLanguageWordsImpl();
 
     @Override
     public void start() {
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricher.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricher.java
index 2a2776e..3e9e192 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricher.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricher.java
@@ -39,17 +39,18 @@ public class NCEnSwearWordsTokenEnricher implements NCTokenEnricher {
     public NCEnSwearWordsTokenEnricher(File mdlFile) {
         Objects.requireNonNull(mdlFile, "Swear words model file cannot be null.");
 
-        this.impl = NCEnSwearWordsImpl.apply(mdlFile);
+        impl = NCEnSwearWordsImpl.apply(mdlFile);
     }
 
     /**
      * TODO: swear_words.txt
+     * 
      * @param mdlSrc
      */
     public NCEnSwearWordsTokenEnricher(String mdlSrc) {
         Objects.requireNonNull(mdlSrc, "Swear words model file cannot be null.");
 
-        this.impl = NCEnSwearWordsImpl.apply(mdlSrc);
+        impl = NCEnSwearWordsImpl.apply(mdlSrc);
     }
 
     @Override
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnBracketsImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnBracketsImpl.scala
index a4b94a8..cf485b0 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnBracketsImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnBracketsImpl.scala
@@ -23,6 +23,7 @@ import org.apache.nlpcraft.*
 import java.io.*
 import scala.collection.mutable
 import scala.jdk.CollectionConverters.CollectionHasAsScala
+
 /**
   *
   */
@@ -36,16 +37,14 @@ class NCEnBracketsImpl extends NCTokenEnricher with LazyLogging:
         def mark(t: NCToken): Unit = map += t -> !stack.isEmpty
 
         for (t <- toks.asScala if ok)
-            t.getOriginalText match {
+            t.getOriginalText match
                 case "(" | "{" | "[" | "<" => mark(t); stack.push(t.getOriginalText)
                 case ")" => check("("); mark(t)
                 case "}" => check("{"); mark(t)
                 case "]" => check("["); mark(t)
                 case ">" => check("<"); mark(t)
                 case _ => mark(t)
-            }
 
-        if ok && stack.isEmpty then
-            map.foreach { (tok, b) => tok.put("brackets:en", b) }
+        if ok && stack.isEmpty then map.foreach { (tok, b) => tok.put("brackets:en", b) }
         else
             logger.trace(s"Invalid brackets: ${req.getOriginalText}")
\ No newline at end of file
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnDictionaryImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnDictionaryImpl.scala
index 43b15f7..2c22323 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnDictionaryImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnDictionaryImpl.scala
@@ -28,6 +28,5 @@ class NCEnDictionaryImpl extends NCTokenEnricher:
 
     override def start(): Unit = dict = NCUtils.readResource("moby/354984si.ngl", "iso-8859-1").toSet
     override def stop(): Unit = dict = null
-
     override def enrich(req: NCRequest, cfg: NCModelConfig, toks: java.util.List[NCToken]): Unit =
         toks.forEach(t => t.put("dict:en", dict.contains(t.getLemma)))
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnWordsImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnLanguageWordsImpl.scala
similarity index 87%
rename from nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnWordsImpl.scala
rename to nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnLanguageWordsImpl.scala
index db6fe7e..4c983f3 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnWordsImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnLanguageWordsImpl.scala
@@ -23,6 +23,6 @@ import java.io.*
 /**
   *
   */
-class NCEnWordsImpl extends NCTokenEnricher:
+class NCEnLanguageWordsImpl extends NCTokenEnricher:
     override def enrich(req: NCRequest, cfg: NCModelConfig, toks: java.util.List[NCToken]): Unit =
-        toks.forEach(t => t.put("word:en", t.getOriginalText.matches("""[\s\w\p{Punct}]+""")))
\ No newline at end of file
+        toks.forEach(t => t.put("lang:en", t.getOriginalText.matches("""[\s\w\p{Punct}]+""")))
\ No newline at end of file
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnQuotesImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnQuotesImpl.scala
index c016886..e37dce4 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnQuotesImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnQuotesImpl.scala
@@ -27,6 +27,7 @@ object NCEnQuotesImpl:
     private def isQuote(t: NCToken): Boolean = Q_POS.contains(t.getPos)
 
 import NCEnQuotesImpl.*
+
 /**
   *
   */
@@ -44,13 +45,11 @@ class NCEnQuotesImpl extends NCTokenEnricher with LazyLogging:
         // Start and end quote mustn't be same ("a` processed as valid)
         if quotes.nonEmpty && quotes.size % 2 == 0 then
             val m = toksSeq.zipWithIndex.toMap
-
             val pairs = quotes.zipWithIndex.drop(1).flatMap {
                 (t, idx) => if idx % 2 != 0 then Some(m(t) -> m(quotes(idx - 1))) else None
             }
-
             toksSeq.zipWithIndex.foreach { (tok, idx) =>
                 tok.put("quoted:en", pairs.exists { case (from, to) => from > idx && to < idx })
             }
         else
-            logger.trace(s"Invalid quotes: ${req.getOriginalText}")
\ No newline at end of file
+            logger.warn(s"Invalid quotes: ${req.getOriginalText}")
\ No newline at end of file
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnSwearWordsImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnSwearWordsImpl.scala
index cd645f5..af79096 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnSwearWordsImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnSwearWordsImpl.scala
@@ -44,7 +44,9 @@ object NCEnSwearWordsImpl:
       * @param mdlSrc
       * @return
       */
-    def apply(mdlSrc: String): NCEnSwearWordsImpl = new NCEnSwearWordsImpl(NCUtils.getStream(mdlSrc), mdlSrc)
+    def apply(mdlSrc: String): NCEnSwearWordsImpl = new NCEnSwearWordsImpl(
+        NCUtils.getStream(mdlSrc), mdlSrc
+    )
 
 /**
   *
@@ -56,9 +58,7 @@ class NCEnSwearWordsImpl(is: InputStream, res: String) extends NCTokenEnricher w
         val stemmer = new PorterStemmer
         swearWords = NCUtils.readTextStream(is, "UTF-8").map(stemmer.stem).toSet
         logger.trace(s"Loaded resource: $res")
-
     override def stop(): Unit = swearWords = null
-
     override def enrich(req: NCRequest, cfg: NCModelConfig, toks: java.util.List[NCToken]): Unit =
         toks.forEach(t => t.put("swear:en", swearWords.contains(t.getStem)))
 
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnOpenNlpTokenParser.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnOpenNlpTokenParser.java
index 44d8140..b8659e4 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnOpenNlpTokenParser.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnOpenNlpTokenParser.java
@@ -34,6 +34,7 @@ import java.util.Set;
  *  - tagger: http://opennlp.sourceforge.net/models-1.5/en-pos-maxent.bin
  *  - lemmatizer: https://raw.githubusercontent.com/richardwilly98/elasticsearch-opennlp-auto-tagging/master/src/main/resources/models/en-lemmatizer.dict
  */
+
 /**
  *
  */
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
index 202b6a1..be2f1d6 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
@@ -444,10 +444,8 @@ private[impl] class NCEnStopWordsFinder(addStems: Set[String], exclStems: Set[St
 
             def isFirst: Boolean = idx == 0
             def isLast: Boolean = idx == toks.length - 1
-
             def next(): NCToken = toks(idx + 1)
             def prev(): NCToken = toks(idx - 1)
-
             def isCommonVerbs(firstVerb: String, secondVerb: String): Boolean =
                 isVerb(pos) && lemma == secondVerb ||
                     (isVerb(pos) && lemma == firstVerb && !isLast && isVerb(next().getPos) && next().getLemma == secondVerb)
@@ -564,14 +562,13 @@ private[impl] class NCEnStopWordsFinder(addStems: Set[String], exclStems: Set[St
         def mark(t: NCToken): Unit = if (!stack.isEmpty) set += t
 
         for (t <- toks if ok)
-            t.getOriginalText match {
+            t.getOriginalText match
                 case "(" | "{" | "[" | "<" => mark(t); stack.push(t.getOriginalText)
                 case ")" => check("("); mark(t)
                 case "}" => check("{"); mark(t)
                 case "]" => check("["); mark(t)
                 case ">" => check("<"); mark(t)
                 case _ => mark(t)
-            }
 
         // Just ignore invalid brackets.
         if ok && stack.isEmpty then
diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala
index d735331..8fdbdaf 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala
@@ -42,15 +42,16 @@ class NCEnBracketsTokenEnricherSpec:
         )
         NCTestUtils.makeAndStart(new NCEnBracketsTokenEnricher())
 
+    /**
+      *
+      * @param txt
+      * @param brackets
+      */
     private def check(txt: String, brackets: Set[Integer]): Unit =
         val toks = parser.parse(NCTestRequest(txt))
-
         enricher.enrich(NCTestRequest(txt), null, toks)
-
         val seq = toks.asScala.toSeq
-
         NCTestUtils.printTokens(seq, "brackets:en")
-
         seq.zipWithIndex.foreach { case (tok, idx) =>
             require(!(tok.get[Boolean]("brackets:en") ^ brackets.contains(idx)))
         }
diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala
index 7e2e113..a7048ea 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala
@@ -42,15 +42,16 @@ class NCEnQuotesTokenEnricherSpec:
         )
         enricher = NCTestUtils.makeAndStart(new NCEnQuotesTokenEnricher)
 
+    /**
+      *
+      * @param txt
+      * @param quotes
+      */
     private def check(txt: String, quotes: Set[Integer]): Unit =
         val toks = parser.parse(NCTestRequest(txt))
-
         val toksSeq = toks.asScala.toSeq
-
         enricher.enrich(NCTestRequest(txt), null, toks)
-
         NCTestUtils.printTokens(toksSeq, "quoted:en")
-
         toksSeq.zipWithIndex.foreach { case (tok, idx) =>
             require(!(tok.get[Boolean]("quoted:en") ^ quotes.contains(idx)))
         }
diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnWordsTokenEnricherSpec.scala b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnWordsTokenEnricherSpec.scala
index 5a28898..ab2d125 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnWordsTokenEnricherSpec.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnWordsTokenEnricherSpec.scala
@@ -27,10 +27,10 @@ import scala.jdk.CollectionConverters.SeqHasAsJava
   *
   */
 class NCEnWordsTokenEnricherSpec:
-    private var enricher: NCEnWordsTokenEnricher = _
+    private var enricher: NCEnLanguageTokenEnricher = _
 
     @BeforeEach
-    def start(): Unit = enricher = NCTestUtils.makeAndStart(new NCEnWordsTokenEnricher)
+    def start(): Unit = enricher = NCTestUtils.makeAndStart(new NCEnLanguageTokenEnricher)
 
     @Test
     def test(): Unit =
@@ -39,12 +39,12 @@ class NCEnWordsTokenEnricherSpec:
             NCTestToken(txt = "русский", stem = "русский") // Not english word.
         )
 
-        require(toks.head.getOpt[Boolean]("word:en").isEmpty)
-        require(toks.last.getOpt[Boolean]("word:en").isEmpty)
+        require(toks.head.getOpt[Boolean]("lang:en").isEmpty)
+        require(toks.last.getOpt[Boolean]("lang:en").isEmpty)
 
         enricher.enrich(null, null, toks.asJava)
 
-        NCTestUtils.printTokens(toks, "word:en")
+        NCTestUtils.printTokens(toks, "lang:en")
 
-        require(toks.head.get[Boolean]("word:en"))
-        require(!toks.last.get[Boolean]("word:en"))
\ No newline at end of file
+        require(toks.head.get[Boolean]("lang:en"))
+        require(!toks.last.get[Boolean]("lang:en"))
\ No newline at end of file

[incubator-nlpcraft] branch NLPCRAFT-469 updated: WIP/CR.

Reply via email to