This is an automated email from the ASF dual-hosted git repository.
aradzinski pushed a commit to branch NLPCRAFT-469
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-469 by this push:
new 070bd55 WIP/CR.
070bd55 is described below
commit 070bd5557a8351fe31d8dde068cac996e0c5bf7b
Author: Aaron Radzinski <[email protected]>
AuthorDate: Sat Dec 25 11:21:35 2021 -0800
WIP/CR.
---
.../nlp/token/enricher/NCEnBracketsTokenEnricher.java | 5 +----
.../nlp/token/enricher/NCEnDictionaryTokenEnricher.java | 2 +-
...sTokenEnricher.java => NCEnLanguageTokenEnricher.java} | 15 +++++----------
.../nlp/token/enricher/NCEnSwearWordsTokenEnricher.java | 5 +++--
.../nlp/token/enricher/impl/NCEnBracketsImpl.scala | 7 +++----
.../nlp/token/enricher/impl/NCEnDictionaryImpl.scala | 1 -
.../{NCEnWordsImpl.scala => NCEnLanguageWordsImpl.scala} | 4 ++--
.../internal/nlp/token/enricher/impl/NCEnQuotesImpl.scala | 5 ++---
.../nlp/token/enricher/impl/NCEnSwearWordsImpl.scala | 6 +++---
.../nlp/token/parser/opennlp/NCEnOpenNlpTokenParser.java | 1 +
.../token/parser/opennlp/impl/NCEnStopWordsFinder.scala | 5 +----
.../token/enricher/NCEnBracketsTokenEnricherSpec.scala | 9 +++++----
.../nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala | 9 +++++----
.../nlp/token/enricher/NCEnWordsTokenEnricherSpec.scala | 14 +++++++-------
14 files changed, 39 insertions(+), 49 deletions(-)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricher.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricher.java
index ec5310a..43920a6 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricher.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricher.java
@@ -17,10 +17,7 @@
package org.apache.nlpcraft.internal.nlp.token.enricher;
-import org.apache.nlpcraft.NCModelConfig;
-import org.apache.nlpcraft.NCRequest;
-import org.apache.nlpcraft.NCToken;
-import org.apache.nlpcraft.NCTokenEnricher;
+import org.apache.nlpcraft.*;
import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCEnBracketsImpl;
import java.util.List;
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricher.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricher.java
index 3f9d5c4..a845eeb 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricher.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricher.java
@@ -22,7 +22,7 @@ import
org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCEnDictionaryImpl;
import java.util.List;
/**
- * TODO: enriches with <code>swear:en</code> property. Download
swear_words.txt for use.
+ * TODO: enriches with <code>dict:en</code> property.
*/
public class NCEnDictionaryTokenEnricher implements NCTokenEnricher {
private final NCEnDictionaryImpl impl = new NCEnDictionaryImpl();
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnWordsTokenEnricher.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnLanguageTokenEnricher.java
similarity index 76%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnWordsTokenEnricher.java
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnLanguageTokenEnricher.java
index 3e87910..687f9a1 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnWordsTokenEnricher.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnLanguageTokenEnricher.java
@@ -17,21 +17,16 @@
package org.apache.nlpcraft.internal.nlp.token.enricher;
-import org.apache.nlpcraft.NCModelConfig;
-import org.apache.nlpcraft.NCRequest;
-import org.apache.nlpcraft.NCToken;
-import org.apache.nlpcraft.NCTokenEnricher;
-import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCEnWordsImpl;
+import org.apache.nlpcraft.*;
+import
org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCEnLanguageWordsImpl;
-import java.io.File;
import java.util.List;
-import java.util.Objects;
/**
- * TODO: enriches with <code>word:en</code> property.
+ * TODO: enriches with <code>lang:en</code> property.
*/
-public class NCEnWordsTokenEnricher implements NCTokenEnricher {
- private final NCEnWordsImpl impl = new NCEnWordsImpl();
+public class NCEnLanguageTokenEnricher implements NCTokenEnricher {
+ private final NCEnLanguageWordsImpl impl = new NCEnLanguageWordsImpl();
@Override
public void start() {
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricher.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricher.java
index 2a2776e..3e9e192 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricher.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricher.java
@@ -39,17 +39,18 @@ public class NCEnSwearWordsTokenEnricher implements
NCTokenEnricher {
public NCEnSwearWordsTokenEnricher(File mdlFile) {
Objects.requireNonNull(mdlFile, "Swear words model file cannot be
null.");
- this.impl = NCEnSwearWordsImpl.apply(mdlFile);
+ impl = NCEnSwearWordsImpl.apply(mdlFile);
}
/**
* TODO: swear_words.txt
+ *
* @param mdlSrc
*/
public NCEnSwearWordsTokenEnricher(String mdlSrc) {
Objects.requireNonNull(mdlSrc, "Swear words model file cannot be
null.");
- this.impl = NCEnSwearWordsImpl.apply(mdlSrc);
+ impl = NCEnSwearWordsImpl.apply(mdlSrc);
}
@Override
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnBracketsImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnBracketsImpl.scala
index a4b94a8..cf485b0 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnBracketsImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnBracketsImpl.scala
@@ -23,6 +23,7 @@ import org.apache.nlpcraft.*
import java.io.*
import scala.collection.mutable
import scala.jdk.CollectionConverters.CollectionHasAsScala
+
/**
*
*/
@@ -36,16 +37,14 @@ class NCEnBracketsImpl extends NCTokenEnricher with
LazyLogging:
def mark(t: NCToken): Unit = map += t -> !stack.isEmpty
for (t <- toks.asScala if ok)
- t.getOriginalText match {
+ t.getOriginalText match
case "(" | "{" | "[" | "<" => mark(t);
stack.push(t.getOriginalText)
case ")" => check("("); mark(t)
case "}" => check("{"); mark(t)
case "]" => check("["); mark(t)
case ">" => check("<"); mark(t)
case _ => mark(t)
- }
- if ok && stack.isEmpty then
- map.foreach { (tok, b) => tok.put("brackets:en", b) }
+ if ok && stack.isEmpty then map.foreach { (tok, b) =>
tok.put("brackets:en", b) }
else
logger.trace(s"Invalid brackets: ${req.getOriginalText}")
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnDictionaryImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnDictionaryImpl.scala
index 43b15f7..2c22323 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnDictionaryImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnDictionaryImpl.scala
@@ -28,6 +28,5 @@ class NCEnDictionaryImpl extends NCTokenEnricher:
override def start(): Unit = dict =
NCUtils.readResource("moby/354984si.ngl", "iso-8859-1").toSet
override def stop(): Unit = dict = null
-
override def enrich(req: NCRequest, cfg: NCModelConfig, toks:
java.util.List[NCToken]): Unit =
toks.forEach(t => t.put("dict:en", dict.contains(t.getLemma)))
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnWordsImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnLanguageWordsImpl.scala
similarity index 87%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnWordsImpl.scala
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnLanguageWordsImpl.scala
index db6fe7e..4c983f3 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnWordsImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnLanguageWordsImpl.scala
@@ -23,6 +23,6 @@ import java.io.*
/**
*
*/
-class NCEnWordsImpl extends NCTokenEnricher:
+class NCEnLanguageWordsImpl extends NCTokenEnricher:
override def enrich(req: NCRequest, cfg: NCModelConfig, toks:
java.util.List[NCToken]): Unit =
- toks.forEach(t => t.put("word:en",
t.getOriginalText.matches("""[\s\w\p{Punct}]+""")))
\ No newline at end of file
+ toks.forEach(t => t.put("lang:en",
t.getOriginalText.matches("""[\s\w\p{Punct}]+""")))
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnQuotesImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnQuotesImpl.scala
index c016886..e37dce4 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnQuotesImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnQuotesImpl.scala
@@ -27,6 +27,7 @@ object NCEnQuotesImpl:
private def isQuote(t: NCToken): Boolean = Q_POS.contains(t.getPos)
import NCEnQuotesImpl.*
+
/**
*
*/
@@ -44,13 +45,11 @@ class NCEnQuotesImpl extends NCTokenEnricher with
LazyLogging:
// Start and end quote mustn't be same ("a` processed as valid)
if quotes.nonEmpty && quotes.size % 2 == 0 then
val m = toksSeq.zipWithIndex.toMap
-
val pairs = quotes.zipWithIndex.drop(1).flatMap {
(t, idx) => if idx % 2 != 0 then Some(m(t) -> m(quotes(idx -
1))) else None
}
-
toksSeq.zipWithIndex.foreach { (tok, idx) =>
tok.put("quoted:en", pairs.exists { case (from, to) => from >
idx && to < idx })
}
else
- logger.trace(s"Invalid quotes: ${req.getOriginalText}")
\ No newline at end of file
+ logger.warn(s"Invalid quotes: ${req.getOriginalText}")
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnSwearWordsImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnSwearWordsImpl.scala
index cd645f5..af79096 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnSwearWordsImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnSwearWordsImpl.scala
@@ -44,7 +44,9 @@ object NCEnSwearWordsImpl:
* @param mdlSrc
* @return
*/
- def apply(mdlSrc: String): NCEnSwearWordsImpl = new
NCEnSwearWordsImpl(NCUtils.getStream(mdlSrc), mdlSrc)
+ def apply(mdlSrc: String): NCEnSwearWordsImpl = new NCEnSwearWordsImpl(
+ NCUtils.getStream(mdlSrc), mdlSrc
+ )
/**
*
@@ -56,9 +58,7 @@ class NCEnSwearWordsImpl(is: InputStream, res: String)
extends NCTokenEnricher w
val stemmer = new PorterStemmer
swearWords = NCUtils.readTextStream(is,
"UTF-8").map(stemmer.stem).toSet
logger.trace(s"Loaded resource: $res")
-
override def stop(): Unit = swearWords = null
-
override def enrich(req: NCRequest, cfg: NCModelConfig, toks:
java.util.List[NCToken]): Unit =
toks.forEach(t => t.put("swear:en", swearWords.contains(t.getStem)))
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnOpenNlpTokenParser.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnOpenNlpTokenParser.java
index 44d8140..b8659e4 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnOpenNlpTokenParser.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnOpenNlpTokenParser.java
@@ -34,6 +34,7 @@ import java.util.Set;
* - tagger: http://opennlp.sourceforge.net/models-1.5/en-pos-maxent.bin
* - lemmatizer:
https://raw.githubusercontent.com/richardwilly98/elasticsearch-opennlp-auto-tagging/master/src/main/resources/models/en-lemmatizer.dict
*/
+
/**
*
*/
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
index 202b6a1..be2f1d6 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
@@ -444,10 +444,8 @@ private[impl] class NCEnStopWordsFinder(addStems:
Set[String], exclStems: Set[St
def isFirst: Boolean = idx == 0
def isLast: Boolean = idx == toks.length - 1
-
def next(): NCToken = toks(idx + 1)
def prev(): NCToken = toks(idx - 1)
-
def isCommonVerbs(firstVerb: String, secondVerb: String): Boolean =
isVerb(pos) && lemma == secondVerb ||
(isVerb(pos) && lemma == firstVerb && !isLast &&
isVerb(next().getPos) && next().getLemma == secondVerb)
@@ -564,14 +562,13 @@ private[impl] class NCEnStopWordsFinder(addStems:
Set[String], exclStems: Set[St
def mark(t: NCToken): Unit = if (!stack.isEmpty) set += t
for (t <- toks if ok)
- t.getOriginalText match {
+ t.getOriginalText match
case "(" | "{" | "[" | "<" => mark(t);
stack.push(t.getOriginalText)
case ")" => check("("); mark(t)
case "}" => check("{"); mark(t)
case "]" => check("["); mark(t)
case ">" => check("<"); mark(t)
case _ => mark(t)
- }
// Just ignore invalid brackets.
if ok && stack.isEmpty then
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala
index d735331..8fdbdaf 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala
@@ -42,15 +42,16 @@ class NCEnBracketsTokenEnricherSpec:
)
NCTestUtils.makeAndStart(new NCEnBracketsTokenEnricher())
+ /**
+ *
+ * @param txt
+ * @param brackets
+ */
private def check(txt: String, brackets: Set[Integer]): Unit =
val toks = parser.parse(NCTestRequest(txt))
-
enricher.enrich(NCTestRequest(txt), null, toks)
-
val seq = toks.asScala.toSeq
-
NCTestUtils.printTokens(seq, "brackets:en")
-
seq.zipWithIndex.foreach { case (tok, idx) =>
require(!(tok.get[Boolean]("brackets:en") ^
brackets.contains(idx)))
}
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala
index 7e2e113..a7048ea 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala
@@ -42,15 +42,16 @@ class NCEnQuotesTokenEnricherSpec:
)
enricher = NCTestUtils.makeAndStart(new NCEnQuotesTokenEnricher)
+ /**
+ *
+ * @param txt
+ * @param quotes
+ */
private def check(txt: String, quotes: Set[Integer]): Unit =
val toks = parser.parse(NCTestRequest(txt))
-
val toksSeq = toks.asScala.toSeq
-
enricher.enrich(NCTestRequest(txt), null, toks)
-
NCTestUtils.printTokens(toksSeq, "quoted:en")
-
toksSeq.zipWithIndex.foreach { case (tok, idx) =>
require(!(tok.get[Boolean]("quoted:en") ^ quotes.contains(idx)))
}
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnWordsTokenEnricherSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnWordsTokenEnricherSpec.scala
index 5a28898..ab2d125 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnWordsTokenEnricherSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnWordsTokenEnricherSpec.scala
@@ -27,10 +27,10 @@ import scala.jdk.CollectionConverters.SeqHasAsJava
*
*/
class NCEnWordsTokenEnricherSpec:
- private var enricher: NCEnWordsTokenEnricher = _
+ private var enricher: NCEnLanguageTokenEnricher = _
@BeforeEach
- def start(): Unit = enricher = NCTestUtils.makeAndStart(new
NCEnWordsTokenEnricher)
+ def start(): Unit = enricher = NCTestUtils.makeAndStart(new
NCEnLanguageTokenEnricher)
@Test
def test(): Unit =
@@ -39,12 +39,12 @@ class NCEnWordsTokenEnricherSpec:
NCTestToken(txt = "русский", stem = "русский") // Not english word.
)
- require(toks.head.getOpt[Boolean]("word:en").isEmpty)
- require(toks.last.getOpt[Boolean]("word:en").isEmpty)
+ require(toks.head.getOpt[Boolean]("lang:en").isEmpty)
+ require(toks.last.getOpt[Boolean]("lang:en").isEmpty)
enricher.enrich(null, null, toks.asJava)
- NCTestUtils.printTokens(toks, "word:en")
+ NCTestUtils.printTokens(toks, "lang:en")
- require(toks.head.get[Boolean]("word:en"))
- require(!toks.last.get[Boolean]("word:en"))
\ No newline at end of file
+ require(toks.head.get[Boolean]("lang:en"))
+ require(!toks.last.get[Boolean]("lang:en"))
\ No newline at end of file