This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-469
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-469 by this push:
new 26edc92 WIP.
26edc92 is described below
commit 26edc92448b8eead5315a66a2dd24973d3eeaf58
Author: Sergey Kamov <[email protected]>
AuthorDate: Tue Dec 21 18:05:28 2021 +0300
WIP.
---
.../internal/nlp/token/parser/opennlp/NCEnStopWordsFinder.java | 7 +++++--
.../internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java | 9 +++++++++
.../nlp/token/parser/opennlp/impl/NCEnStopWordsFinderImpl.scala | 4 ++++
.../nlp/token/parser/opennlp/NCOpenNlpTokenParserSpec.scala | 9 +++++----
4 files changed, 23 insertions(+), 6 deletions(-)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnStopWordsFinder.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnStopWordsFinder.java
index f631fc2..80d11e3 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnStopWordsFinder.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnStopWordsFinder.java
@@ -34,7 +34,7 @@ public class NCEnStopWordsFinder implements NCStopWordsFinder
{
*
*/
public NCEnStopWordsFinder() {
- impl = new NCEnStopWordsFinderImpl(Collections.emptySet(),
Collections.emptySet());
+ this(Collections.emptySet(), Collections.emptySet());
}
/**
@@ -43,7 +43,10 @@ public class NCEnStopWordsFinder implements
NCStopWordsFinder {
* @param exclStopWords
*/
public NCEnStopWordsFinder(Set<String> addStopWords, Set<String>
exclStopWords) {
- impl = new NCEnStopWordsFinderImpl(addStopWords, exclStopWords);
+ impl = new NCEnStopWordsFinderImpl(
+ addStopWords == null ? Collections.emptySet() : addStopWords,
+ exclStopWords == null ? Collections.emptySet() : exclStopWords
+ );
}
@Override
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java
index 075b030..86d25be 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java
@@ -25,6 +25,7 @@ import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl.NCOpenNlpImpl;
import java.io.File;
import java.util.List;
+import java.util.Objects;
import java.util.Set;
/*
@@ -58,6 +59,10 @@ public class NCOpenNlpTokenParser implements NCTokenParser {
* @throws NCException
*/
public NCOpenNlpTokenParser(File tokMdl, File posMdl, File lemmaDic,
NCStopWordsFinder swFinder) {
+ Objects.requireNonNull(tokMdl, "Tonenizer model cannot be null");
+ Objects.requireNonNull(posMdl, "POS model cannot be null");
+ Objects.requireNonNull(lemmaDic, "Lemmatizer model cannot be null");
+
try {
impl = NCOpenNlpImpl.apply(tokMdl, posMdl, lemmaDic, swFinder);
}
@@ -75,6 +80,10 @@ public class NCOpenNlpTokenParser implements NCTokenParser {
* @throws NCException
*/
public NCOpenNlpTokenParser(String tokMdlSrc, String posMdlSrc, String
lemmaDicSrc, NCStopWordsFinder swFinder) {
+ Objects.requireNonNull(tokMdlSrc, "Tonenizer model cannot be null");
+ Objects.requireNonNull(posMdlSrc, "POS model cannot be null");
+ Objects.requireNonNull(lemmaDicSrc, "Lemmatizer model cannot be null");
+
try {
impl = NCOpenNlpImpl.apply(tokMdlSrc, posMdlSrc, lemmaDicSrc,
swFinder);
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinderImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinderImpl.scala
index e41798d..68f9b5e 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinderImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinderImpl.scala
@@ -577,6 +577,10 @@ class NCEnStopWordsFinderImpl(addStopWords: JSet[String],
exclStopWords: JSet[St
// +-------------------------------------------------+
processCommonStops(ns, stops)
+ // +-------------------------------------------------+
+ // | Pass #9. |
+ // | Deletes stop words if they are marked as quoted.|
+ // +-------------------------------------------------+
val quotes = ns.filter(isQuote)
if (quotes.nonEmpty && quotes.size % 2 == 0)
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserSpec.scala
index 1c1ee2c..59d2184 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserSpec.scala
@@ -24,11 +24,14 @@ import org.junit.jupiter.api.{BeforeEach, Test}
import scala.jdk.CollectionConverters.*
import java.util
+/**
+ *
+ */
class NCOpenNlpTokenParserSpec:
private var parser: NCOpenNlpTokenParser = _
@BeforeEach
- def start(): Unit = {
+ def start(): Unit =
parser =
new NCOpenNlpTokenParser(
"opennlp/en-token.bin",
@@ -37,9 +40,7 @@ class NCOpenNlpTokenParserSpec:
new NCEnStopWordsFinder()
)
- parser.start()
- }
-
+ parser.start
private def request(txt: String): Seq[NCToken] =
val toks = parser.parse(