This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-472
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-472 by this push:
new 1579b65 WIP.
1579b65 is described below
commit 1579b6540d28b2dac68c2f9327c52df22048cd20
Author: Sergey Kamov <[email protected]>
AuthorDate: Sun Jan 2 16:36:25 2022 +0300
WIP.
---
.../scala/org/apache/nlpcraft/NCModelClient.java | 7 +-
.../scala/org/apache/nlpcraft/NCModelConfig.java | 10 +-
.../org/apache/nlpcraft/NCModelConfigAdapter.java | 38 +---
.../main/scala/org/apache/nlpcraft/NCRequest.java | 6 -
.../main/scala/org/apache/nlpcraft/NCToken.java | 16 +-
.../scala/org/apache/nlpcraft/NCTokenParser.java | 29 ++-
.../scala/org/apache/nlpcraft/NCTokenizer.java | 33 ----
...ariantsFilter.java => NCVariantsValidator.java} | 2 +-
.../src/main/scala/org/apache/nlpcraft/NCWord.java | 53 ------
.../apache/nlpcraft/internal/util/NCUtils.scala | 68 +------
.../parser/opennlp/NCOpenNlpEntityParser.java | 6 +-
.../parser/semantic/NCSemanticEntityParser.java | 20 +-
.../parser/semantic/NCSemanticTextStemmer.java | 30 ---
.../semantic/en/NCEnSemanticEntityParser.java | 38 ----
.../semantic/impl/NCSemanticEntityParserImpl.scala | 31 ++--
.../impl/NCSemanticSynonymsProcessor.scala | 14 +-
.../semantic/impl/en/NCEnSemanticTextStemmer.java | 30 ---
.../enricher/en/NCEnBracketsTokenEnricher.java | 5 +-
.../enricher/en/NCEnDictionaryTokenEnricher.java | 5 +-
.../enricher/en/NCEnLanguageTokenEnricher.java | 5 +-
.../token/enricher/en/NCEnQuotesTokenEnricher.java | 5 +-
...richer.java => NCEnStopWordsTokenEnricher.java} | 23 ++-
.../enricher/en/NCEnSwearWordsTokenEnricher.java | 5 +-
.../token/enricher/en/impl/NCEnBracketsImpl.scala | 3 +-
.../en/impl/NCEnStopWordGenerator.scala | 3 +-
.../en/impl/NCEnStopWordsImpl.scala} | 202 ++++++++++++++-------
.../enricher/en/impl/NCEnSwearWordsImpl.scala | 4 +-
.../parser/opennlp/en/NCEnOpenNlpTokenParser.java | 52 ++----
.../parser/opennlp/en/impl/NCEnOpenNlpImpl.scala | 142 ++++++---------
.../nlp/tokenizer/opennlp/NCOpenNlpTokenizer.java | 64 -------
.../opennlp/impl/NCOpenNlpTokenizerImpl.scala | 45 -----
.../nlpcraft/nlp/benchmark/NCBenchmarkAdapter.java | 4 +-
.../opennlp/NCEnOpenNlpTokenParserBenchmark.java | 3 +-
.../parser/opennlp/NCOpenNlpEntityParserSpec.scala | 5 +-
.../semantic/NCSemanticEntityParserJsonSpec.scala | 12 +-
.../semantic/NCSemanticEntityParserSpec.scala | 14 +-
.../semantic/NCSemanticEntityParserYamlSpec.scala | 12 +-
.../en/NCEnBracketsTokenEnricherSpec.scala | 5 +-
.../en/NCEnDictionaryTokenEnricherSpec.scala | 3 +-
.../enricher/en/NCEnQuotesTokenEnricherSpec.scala | 5 +-
.../opennlp/en/NCEnOpenNlpTokenParserSpec.scala | 51 ++++--
.../apache/nlpcraft/nlp/util/NCTestConfig.scala | 54 +++---
.../apache/nlpcraft/nlp/util/NCTestRequest.scala | 1 -
.../org/apache/nlpcraft/nlp/util/NCTestToken.scala | 9 +-
.../org/apache/nlpcraft/nlp/util/NCTestUtils.scala | 32 +++-
45 files changed, 447 insertions(+), 757 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelClient.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelClient.java
index a5482ff..3fe9fdb 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelClient.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelClient.java
@@ -74,12 +74,13 @@ public class NCModelClient implements NCLifecycle {
public void start(NCModelConfig cfg) {
verify();
- cfg.getTokenizer().start(cfg);
+ cfg.getTokenParser().start(cfg);
ExecutorService s = getExecutorService();
+ // TODO: start and stop validators.
+
try {
- start(s, cfg.getTokenParsers(), cfg);
start(s, cfg.getEntityParsers(), cfg);
start(s, cfg.getEntityEnrichers(), cfg);
start(s, cfg.getTokenEnrichers(), cfg);
@@ -104,7 +105,7 @@ public class NCModelClient implements NCLifecycle {
stopExecutorService(s);
}
- cfg.getTokenizer().stop();
+ cfg.getTokenParser().stop();
}
/**
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfig.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfig.java
index 78aa42e..97653a2 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfig.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfig.java
@@ -27,13 +27,7 @@ public interface NCModelConfig extends NCPropertyMap {
*
* @return
*/
- NCTokenizer getTokenizer();
-
- /**
- *
- * @return
- */
- List<NCTokenParser> getTokenParsers();
+ NCTokenParser getTokenParser();
/**
*
@@ -69,7 +63,7 @@ public interface NCModelConfig extends NCPropertyMap {
*
* @return
*/
- List<NCVariantsFilter> getVariantsFilters();
+ List<NCVariantsValidator> getVariantValidators();
/**
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfigAdapter.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfigAdapter.java
index 1ffb978..5db1716 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfigAdapter.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfigAdapter.java
@@ -23,19 +23,18 @@ import java.util.*;
*
*/
// TODO: validation for constructor and all setters.
+ // TODO: do builder instaed of it.
public class NCModelConfigAdapter extends NCPropertyMapAdapter implements
NCModelConfig {
private final String id;
private final String name;
private final String version;
- private final NCTokenizer tokenizer;
- private final List<NCTokenParser> tokParsers = new ArrayList<>();
+ private final NCTokenParser tokParser;
private final List<NCTokenEnricher> tokEnrichers = new ArrayList<>();
private final List<NCEntityEnricher> entEnrichers = new ArrayList<>();
private final List<NCEntityParser> entParsers = new ArrayList<>();
private final List<NCTokenValidator> tokenValidators = new ArrayList<>();
private final List<NCEntityValidator> entityValidators = new ArrayList<>();
- private final List<NCVariantsFilter> variantsFilters = new ArrayList<>();
-
+ private final List<NCVariantsValidator> variantsFilters = new
ArrayList<>();
/**
*
@@ -44,35 +43,23 @@ public class NCModelConfigAdapter extends
NCPropertyMapAdapter implements NCMode
* @param version
* @param tokParser
*/
- public NCModelConfigAdapter(String id, String name, String version,
NCTokenizer tokenizer, NCTokenParser tokParser, NCEntityParser entParser) {
+ public NCModelConfigAdapter(String id, String name, String version,
NCTokenParser tokParser, NCEntityParser entParser) {
Objects.requireNonNull(id, "ID cannot be null.");
Objects.requireNonNull(name, "Name cannot be null.");
Objects.requireNonNull(version, "Version cannot be null.");
- Objects.requireNonNull(tokenizer, "Tokenizer cannot be null.");
Objects.requireNonNull(tokParser, "Token parser cannot be null.");
Objects.requireNonNull(entParser, "Entity parser cannot be null.");
this.id = id;
this.name = name;
this.version = version;
- this.tokenizer = tokenizer;
-
- tokParsers.add(tokParser);
+ this.tokParser = tokParser;
+
entParsers.add(entParser);
}
/**
*
- * @param tokParser
- */
- public void addTokenParser(NCTokenParser tokParser) {
- Objects.requireNonNull(tokParser, "Token parser cannot be null.");
-
- tokParsers.add(tokParser);
- }
-
- /**
- *
* @param entParser
*/
public void addEntityParser(NCEntityParser entParser) {
@@ -125,7 +112,7 @@ public class NCModelConfigAdapter extends
NCPropertyMapAdapter implements NCMode
*
* @param variantFilter
*/
- public void addVariantFilter(NCVariantsFilter variantFilter) {
+ public void addVariantFilter(NCVariantsValidator variantFilter) {
Objects.requireNonNull(variantFilter, "Variant filter cannot be
null.");
variantsFilters.add(variantFilter);
@@ -157,8 +144,8 @@ public class NCModelConfigAdapter extends
NCPropertyMapAdapter implements NCMode
}
@Override
- public List<NCTokenParser> getTokenParsers() {
- return tokParsers;
+ public NCTokenParser getTokenParser() {
+ return tokParser;
}
@Override
@@ -167,11 +154,6 @@ public class NCModelConfigAdapter extends
NCPropertyMapAdapter implements NCMode
}
@Override
- public NCTokenizer getTokenizer() {
- return tokenizer;
- }
-
- @Override
public List<NCTokenValidator> getTokenValidators() {
return tokenValidators;
}
@@ -182,7 +164,7 @@ public class NCModelConfigAdapter extends
NCPropertyMapAdapter implements NCMode
}
@Override
- public List<NCVariantsFilter> getVariantsFilters() {
+ public List<NCVariantsValidator> getVariantValidators() {
return variantsFilters;
}
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCRequest.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCRequest.java
index 72faad8..468f8ac 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCRequest.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCRequest.java
@@ -52,12 +52,6 @@ public interface NCRequest {
String getText();
/**
- *
- * @return
- */
- List<NCWord> getWords();
-
- /**
* Gets UTC/GMT timestamp in millis when user input was received.
*
* @return UTC/GMT timestamp in ms when user input was received.
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
index 4eeacc5..23a6205 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
@@ -20,28 +20,34 @@ package org.apache.nlpcraft;
/**
*
*/
-public interface NCToken extends NCWord, NCPropertyMap {
+public interface NCToken extends NCPropertyMap {
/**
*
* @return
*/
- String getLemma();
+ String getText();
/**
*
* @return
*/
- String getStem();
+ int getIndex();
/**
*
* @return
*/
- String getPos();
+ String getLemma();
/**
*
* @return
*/
- boolean isStopWord();
+ String getStem();
+
+ /**
+ *
+ * @return
+ */
+ String getPos();
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenParser.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenParser.java
index 669df6e..91aa783 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenParser.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenParser.java
@@ -25,11 +25,30 @@ import java.util.List;
public interface NCTokenParser extends NCLifecycle {
/**
*
- * @param req
- * @param cfg
+ * @param text
* @return
- * @throws
- * @throws NCException
*/
- List<NCToken> parse(NCRequest req, NCModelConfig cfg);
+ List<String> tokenize(String text);
+
+ /**
+ *
+ * @param s
+ * @return
+ */
+ String getStem(String s);
+
+ /**
+ *
+ * @param toks
+ * @return
+ */
+ List<String> getPoses(List<String> toks);
+
+ /**
+ *
+ * @param toks
+ * @param poses
+ * @return
+ */
+ List<String> getLemmas(List<String> toks, List<String> poses);
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenizer.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenizer.java
deleted file mode 100644
index 1ee784d..0000000
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenizer.java
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nlpcraft;
-
-import java.util.List;
-
-/**
- *
- */
-public interface NCTokenizer extends NCLifecycle {
- /**
- *
- * @param cfg
- * @param txt
- * @return
- */
- List<NCWord> tokenize(NCModelConfig cfg, String txt);
-}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCVariantsFilter.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCVariantsValidator.java
similarity index 94%
rename from nlpcraft/src/main/scala/org/apache/nlpcraft/NCVariantsFilter.java
rename to nlpcraft/src/main/scala/org/apache/nlpcraft/NCVariantsValidator.java
index 09ec82c..8829e2f 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCVariantsFilter.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCVariantsValidator.java
@@ -22,7 +22,7 @@ import java.util.List;
/**
*
*/
-public interface NCVariantsFilter extends NCLifecycle {
+public interface NCVariantsValidator extends NCLifecycle {
/**
* Filters all found entities variants.
*
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCWord.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCWord.java
deleted file mode 100644
index a13840a..0000000
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCWord.java
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nlpcraft;
-
-/**
- *
- */
-public interface NCWord {
- /**
- *
- * @return
- */
- String getText();
-
- /**
- *
- * @return
- */
- int getStartCharIndex();
-
- /**
- *
- * @return
- */
- int getEndCharIndex();
-
- /**
- *
- * @return
- */
- int getLength();
-
- /**
- *
- * @return
- */
- int getIndex();
-}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
index dcf24cb..ffaaa77 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
@@ -19,23 +19,21 @@ package org.apache.nlpcraft.internal.util
import com.google.gson.GsonBuilder
import com.typesafe.scalalogging.*
-import org.apache.nlpcraft.NCToken
import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.ansi.NCAnsi.*
import java.io.*
import java.net.*
-import java.util.{Random, UUID}
import java.util.regex.Pattern
import java.util.zip.*
+import java.util.{Random, UUID}
import scala.annotation.tailrec
import scala.collection.{IndexedSeq, Seq}
-import scala.concurrent.duration.Duration
import scala.concurrent.*
-import scala.io.Source
+import scala.concurrent.duration.Duration
+import scala.io.*
import scala.sys.SystemProperties
import scala.util.Using
-import scala.io.BufferedSource
/**
*
@@ -921,63 +919,3 @@ object NCUtils extends LazyLogging:
*/
def genUUID(): UUID = UUID.randomUUID()
- /**
- * Gets all sequential permutations of tokens in this NLP sentence.
- *
- * For example, if NLP sentence contains "a, b, c, d" tokens, then
- * this function will return the sequence of following token sequences in
this order:
- * "a b c d"
- * "a b c"
- * "b c d"
- * "a b"
- * "b c"
- * "c d"
- * "a"
- * "b"
- * "c"
- * "d"
- *
- * NOTE: this method will not return any permutations with a quoted token.
- *
- * @param tokens Tokens.
- * @param stopWords Whether or not include tokens marked as stop words.
- * @param maxLen Maximum number of tokens in the sequence.
- */
- def tokenMix(tokens: Seq[NCToken], stopWords: Boolean = false, maxLen: Int
= Integer.MAX_VALUE): Seq[Seq[NCToken]] =
- val toks = tokens.filter(t => stopWords || (!stopWords &&
!t.isStopWord))
-
- (for (n <- toks.length until 0 by -1 if n <= maxLen) yield
toks.sliding(n)).flatten
-
- /**
- * Gets all sequential permutations of tokens in this NLP sentence.
- * This method is like a 'tokenMix', but with all combinations of
stop-words (with and without)
- *
- * @param tokens Tokens.
- * @param maxLen Maximum number of tokens in the sequence.
- */
- def tokenMixWithStopWords(tokens: Seq[NCToken], maxLen: Int =
Integer.MAX_VALUE): Seq[Seq[NCToken]] =
- /**
- * Gets all combinations for sequence of mandatory tokens with
stop-words and without.
- *
- * Example:
- * 'A (stop), B, C(stop) -> [A, B, C]; [A, B]; [B, C], [B]
- * 'A, B(stop), C(stop) -> [A, B, C]; [A, B]; [A, C], [A].
- *
- * @param toks Tokens.
- */
- def permutations(toks: Seq[NCToken]): Seq[Seq[NCToken]] =
- def multiple(seq: Seq[Seq[Option[NCToken]]], t: NCToken):
Seq[Seq[Option[NCToken]]] =
- if seq.isEmpty then
- if t.isStopWord then IndexedSeq(IndexedSeq(Some(t)),
IndexedSeq(None)) else IndexedSeq(IndexedSeq(Some(t)))
- else
- (for (subSeq <- seq) yield subSeq :+ Some(t)) ++ (if
t.isStopWord then for (subSeq <- seq) yield subSeq :+ None else Seq.empty)
-
- var res: Seq[Seq[Option[NCToken]]] = Seq.empty
- for (t <- toks) res = multiple(res, t)
- res.map(_.flatten).filter(_.nonEmpty)
-
- tokenMix(tokens, stopWords = true, maxLen).
- flatMap(permutations).
- filter(_.nonEmpty).
- distinct.
- sortBy(seq => (-seq.length, seq.head.getStartCharIndex))
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/NCOpenNlpEntityParser.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/NCOpenNlpEntityParser.java
index 1ea0930..f92f8ba 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/NCOpenNlpEntityParser.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/NCOpenNlpEntityParser.java
@@ -17,7 +17,11 @@
package org.apache.nlpcraft.nlp.entity.parser.opennlp;
-import org.apache.nlpcraft.*;
+import org.apache.nlpcraft.NCEntity;
+import org.apache.nlpcraft.NCEntityParser;
+import org.apache.nlpcraft.NCModelConfig;
+import org.apache.nlpcraft.NCRequest;
+import org.apache.nlpcraft.NCToken;
import
org.apache.nlpcraft.nlp.entity.parser.opennlp.impl.NCOpenNlpEntityParserImpl;
import java.util.List;
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
index 9c09111..c28f03e 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
@@ -24,7 +24,10 @@ import org.apache.nlpcraft.NCRequest;
import org.apache.nlpcraft.NCToken;
import
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticEntityParserImpl;
-import java.util.*;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
/**
*
@@ -37,11 +40,10 @@ public class NCSemanticEntityParser implements
NCEntityParser {
* @param stemmer
* @param elems
*/
- public NCSemanticEntityParser(NCSemanticTextStemmer stemmer,
List<NCSemanticElement> elems) {
- Objects.requireNonNull(stemmer, "Stemmer cannot be null");
+ public NCSemanticEntityParser(List<NCSemanticElement> elems) {
Objects.requireNonNull(elems, "Elements cannot be null");
- impl = NCSemanticEntityParserImpl.apply(stemmer,
Collections.emptyMap(), elems);
+ impl = NCSemanticEntityParserImpl.apply(Collections.emptyMap(), elems);
}
/**
@@ -50,11 +52,10 @@ public class NCSemanticEntityParser implements
NCEntityParser {
* @param macros
* @param elems
*/
- public NCSemanticEntityParser(NCSemanticTextStemmer stemmer, Map<String,
String> macros, List<NCSemanticElement> elems) {
- Objects.requireNonNull(stemmer, "Stemmer cannot be null");
+ public NCSemanticEntityParser(Map<String, String> macros,
List<NCSemanticElement> elems) {
Objects.requireNonNull(elems, "Elements cannot be null");
- impl = NCSemanticEntityParserImpl.apply(stemmer, macros, elems);
+ impl = NCSemanticEntityParserImpl.apply(macros, elems);
}
/**
@@ -62,11 +63,10 @@ public class NCSemanticEntityParser implements
NCEntityParser {
* @param stemmer
* @param mdlSrc
*/
- public NCSemanticEntityParser(NCSemanticTextStemmer stemmer, String
mdlSrc) {
- Objects.requireNonNull(stemmer, "Stemmer cannot be null");
+ public NCSemanticEntityParser(String mdlSrc) {
Objects.requireNonNull(mdlSrc, "Source cannot be null");
- impl = NCSemanticEntityParserImpl.apply(stemmer, mdlSrc);
+ impl = NCSemanticEntityParserImpl.apply(mdlSrc);
}
@Override
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticTextStemmer.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticTextStemmer.java
deleted file mode 100644
index 5ef08d3..0000000
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticTextStemmer.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nlpcraft.nlp.entity.parser.semantic;
-
-/**
- *
- */
-public interface NCSemanticTextStemmer {
- /**
- *
- * @param text
- * @return
- */
- String stem(String text);
-}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/en/NCEnSemanticEntityParser.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/en/NCEnSemanticEntityParser.java
deleted file mode 100644
index ae21ccb..0000000
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/en/NCEnSemanticEntityParser.java
+++ /dev/null
@@ -1,38 +0,0 @@
-package org.apache.nlpcraft.nlp.entity.parser.semantic.en;
-
-import org.apache.nlpcraft.nlp.entity.parser.semantic.NCSemanticElement;
-import org.apache.nlpcraft.nlp.entity.parser.semantic.NCSemanticEntityParser;
-import
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.en.NCEnSemanticTextStemmer;
-
-import java.util.List;
-import java.util.Map;
-
-/**
- * TODO: Do we need it?
- */
-public class NCEnSemanticEntityParser extends NCSemanticEntityParser {
- /**
- *
- * @param elems
- */
- public NCEnSemanticEntityParser(List<NCSemanticElement> elems) {
- super(new NCEnSemanticTextStemmer(), elems);
- }
-
- /**
- *
- * @param macros
- * @param elems
- */
- public NCEnSemanticEntityParser(Map<String, String> macros,
List<NCSemanticElement> elems) {
- super(new NCEnSemanticTextStemmer(), macros, elems);
- }
-
- /**
- *
- * @param mdlSrc
- */
- public NCEnSemanticEntityParser(String mdlSrc) {
- super(new NCEnSemanticTextStemmer(), mdlSrc);
- }
-}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
index 1a3dfbe..8680193 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
@@ -22,8 +22,8 @@ import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.makro.NCMacroParser
import org.apache.nlpcraft.internal.util.NCUtils
import org.apache.nlpcraft.nlp.entity.parser.semantic.*
-import
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticSourceType.*
import
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticChunkKind.*
+import
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticSourceType.*
import java.io.*
import java.util.regex.*
@@ -32,19 +32,17 @@ import scala.collection.mutable
import scala.jdk.CollectionConverters.*
object NCSemanticEntityParserImpl:
- def apply(stemmer: NCSemanticTextStemmer, macros: Jmap[String, String],
elems: JList[NCSemanticElement]): NCSemanticEntityParserImpl =
- require(stemmer != null)
+ def apply(macros: Jmap[String, String], elems: JList[NCSemanticElement]):
NCSemanticEntityParserImpl =
require(elems != null)
new NCSemanticEntityParserImpl(
- stemmer, macros = if macros == null then null else
macros.asScala.toMap, elements = elems.asScala.toSeq
+ macros = if macros == null then null else macros.asScala.toMap,
elements = elems.asScala.toSeq
)
- def apply(stemmer: NCSemanticTextStemmer, mdlSrc: String):
NCSemanticEntityParserImpl =
- require(stemmer != null)
+ def apply(mdlSrc: String): NCSemanticEntityParserImpl =
require(mdlSrc != null)
- new NCSemanticEntityParserImpl(stemmer, mdlSrc = mdlSrc, typ =
NCSemanticSourceType(mdlSrc))
+ new NCSemanticEntityParserImpl(mdlSrc = mdlSrc, scrType =
NCSemanticSourceType(mdlSrc))
/**
* @param baseTokens Tokens.
@@ -52,6 +50,9 @@ object NCSemanticEntityParserImpl:
*/
private case class Piece(baseTokens: Seq[NCToken], variants:
Seq[Seq[NCToken]])
+ // TODO: error?
+ private def isStopWord(t: NCToken): Boolean = t.get[Boolean]("stopword")
+
/**
*
* 1. Prepares combination of tokens (sliding).
@@ -68,7 +69,7 @@ object NCSemanticEntityParserImpl:
*/
private def getPieces(toks: Seq[NCToken]): Seq[Piece] =
(for (n <- toks.size until 0 by -1) yield
toks.sliding(n)).flatten.map(p => p).map(combo => {
- val stops = combo.filter(s => s.isStopWord && s != combo.head && s
!= combo.last)
+ val stops = combo.filter(s => isStopWord(s) && s != combo.head &&
s != combo.last)
val slides =
mutable.ArrayBuffer.empty[mutable.ArrayBuffer[NCToken]]
for (stop <- stops)
@@ -111,31 +112,31 @@ import
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticEntityParse
/**
*
- * @param stemmer
* @param macros
* @param elements
*/
class NCSemanticEntityParserImpl(
- stemmer: NCSemanticTextStemmer,
macros: Map[String, String] = null,
elements: Seq[NCSemanticElement] = null,
mdlSrc: String = null,
- typ: NCSemanticSourceType = null
+ scrType: NCSemanticSourceType = null
) extends NCEntityParser with LazyLogging:
- require(stemmer != null)
- require(macros != null && elements != null || mdlSrc != null && typ !=
null)
+ require(macros != null && elements != null || mdlSrc != null && scrType !=
null)
@volatile private var h: NCSemanticSynonymsHolder = _
override def start(cfg: NCModelConfig): Unit =
val (macros, elements) =
if mdlSrc != null then
- val src = NCSemanticDataReader.read(new
BufferedInputStream(NCUtils.getStream(mdlSrc)), typ)
+ val src = NCSemanticDataReader.read(new
BufferedInputStream(NCUtils.getStream(mdlSrc)), scrType)
+
+ logger.trace(s"Loaded resource: $mdlSrc")
+
(src.macros, src.elements)
else
(this.macros, this.elements)
- h = NCSemanticSynonymsProcessor.prepare(cfg, stemmer, macros, elements)
+ h = NCSemanticSynonymsProcessor.prepare(cfg, macros, elements)
override def stop(): Unit = h = null
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
index 4a49dae..16e3a46 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
@@ -19,11 +19,11 @@ package org.apache.nlpcraft.nlp.entity.parser.semantic.impl
import com.fasterxml.jackson.databind.*
import com.fasterxml.jackson.dataformat.yaml.*
import com.fasterxml.jackson.module.scala.DefaultScalaModule
+import com.typesafe.scalalogging.LazyLogging
import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.makro.NCMacroParser
import org.apache.nlpcraft.nlp.entity.parser.semantic.*
import
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticChunkKind.*
-import com.typesafe.scalalogging.LazyLogging
import java.io.InputStream
import java.util
@@ -137,7 +137,7 @@ private[impl] object NCSemanticSynonymsProcessor extends
LazyLogging:
checkSynonyms(v.getSynonyms, elemId, Some(name))
private def startsAndEnds(fix: String, s: String): Boolean =
s.startsWith(fix) && s.endsWith(fix)
- private def mkChunk(stemmer: NCSemanticTextStemmer, chunk: String):
NCSemanticSynonymChunk =
+ private def mkChunk(p: NCTokenParser, chunk: String):
NCSemanticSynonymChunk =
def stripSuffix(fix: String, s: String): String = s.slice(fix.length,
s.length - fix.length)
// Regex synonym.
@@ -152,23 +152,21 @@ private[impl] object NCSemanticSynonymsProcessor extends
LazyLogging:
else
throw new NCException(s"Empty regex synonym detected
[chunk=$chunk]")
else
- NCSemanticSynonymChunk(kind = TEXT, text = chunk, stem =
stemmer.stem(chunk))
+ NCSemanticSynonymChunk(kind = TEXT, text = chunk, stem =
p.getStem(chunk))
/**
*
* @param cfg
- * @param stemmer
* @param macros
* @param elements
* @throws NCException
*/
def prepare(
cfg: NCModelConfig,
- stemmer: NCSemanticTextStemmer,
macros: Map[String, String],
elements: Seq[NCSemanticElement]
): NCSemanticSynonymsHolder =
- require(cfg != null && stemmer != null)
+ require(cfg != null)
checkElements(elements)
checkMacros(macros, elements)
@@ -188,8 +186,8 @@ private[impl] object NCSemanticSynonymsProcessor extends
LazyLogging:
def add(syns: Seq[NCSemanticSynonym]): Unit = buf ++=
syns.map(Holder(_, elemId))
def convert(syns: JList[String]): Seq[Seq[NCSemanticSynonymChunk]]
=
syns.asScala.flatMap(p.expand).
- map(t => cfg.getTokenizer.tokenize(cfg, t).asScala.map(w
=> mkChunk(stemmer, w.getText)).toSeq).toSeq
- def mkSpecChunk(id: String): NCSemanticSynonymChunk =
NCSemanticSynonymChunk(TEXT, id, stemmer.stem(id))
+ map(t => cfg.getTokenParser.tokenize(t).asScala.map(w =>
mkChunk(cfg.getTokenParser, w)).toSeq).toSeq
+ def mkSpecChunk(id: String): NCSemanticSynonymChunk =
NCSemanticSynonymChunk(TEXT, id, cfg.getTokenParser.getStem(id))
// TODO:
add(Seq(NCSemanticSynonym(Seq(mkSpecChunk(elemId)), isElementId =
true)))
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/en/NCEnSemanticTextStemmer.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/en/NCEnSemanticTextStemmer.java
deleted file mode 100644
index 4571053..0000000
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/en/NCEnSemanticTextStemmer.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nlpcraft.nlp.entity.parser.semantic.impl.en;
-
-import opennlp.tools.stemmer.PorterStemmer;
-import org.apache.nlpcraft.nlp.entity.parser.semantic.NCSemanticTextStemmer;
-
-public class NCEnSemanticTextStemmer implements NCSemanticTextStemmer {
- private final PorterStemmer s = new PorterStemmer();
-
- @Override
- public synchronized String stem(String text) {
- return s.stem(text);
- }
-}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnBracketsTokenEnricher.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnBracketsTokenEnricher.java
index 219018f..14ee3a2 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnBracketsTokenEnricher.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnBracketsTokenEnricher.java
@@ -17,7 +17,10 @@
package org.apache.nlpcraft.nlp.token.enricher.en;
-import org.apache.nlpcraft.*;
+import org.apache.nlpcraft.NCModelConfig;
+import org.apache.nlpcraft.NCRequest;
+import org.apache.nlpcraft.NCToken;
+import org.apache.nlpcraft.NCTokenEnricher;
import org.apache.nlpcraft.nlp.token.enricher.en.impl.NCEnBracketsImpl;
import java.util.List;
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnDictionaryTokenEnricher.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnDictionaryTokenEnricher.java
index f54d4e1..8c3275f 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnDictionaryTokenEnricher.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnDictionaryTokenEnricher.java
@@ -17,7 +17,10 @@
package org.apache.nlpcraft.nlp.token.enricher.en;
-import org.apache.nlpcraft.*;
+import org.apache.nlpcraft.NCModelConfig;
+import org.apache.nlpcraft.NCRequest;
+import org.apache.nlpcraft.NCToken;
+import org.apache.nlpcraft.NCTokenEnricher;
import org.apache.nlpcraft.nlp.token.enricher.en.impl.NCEnDictionaryImpl;
import java.util.List;
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnLanguageTokenEnricher.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnLanguageTokenEnricher.java
index b52350c..9ecbd90 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnLanguageTokenEnricher.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnLanguageTokenEnricher.java
@@ -17,7 +17,10 @@
package org.apache.nlpcraft.nlp.token.enricher.en;
-import org.apache.nlpcraft.*;
+import org.apache.nlpcraft.NCModelConfig;
+import org.apache.nlpcraft.NCRequest;
+import org.apache.nlpcraft.NCToken;
+import org.apache.nlpcraft.NCTokenEnricher;
import org.apache.nlpcraft.nlp.token.enricher.en.impl.NCEnLanguageWordsImpl;
import java.util.List;
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnQuotesTokenEnricher.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnQuotesTokenEnricher.java
index fe8516f..c38f29e 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnQuotesTokenEnricher.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnQuotesTokenEnricher.java
@@ -17,7 +17,10 @@
package org.apache.nlpcraft.nlp.token.enricher.en;
-import org.apache.nlpcraft.*;
+import org.apache.nlpcraft.NCModelConfig;
+import org.apache.nlpcraft.NCRequest;
+import org.apache.nlpcraft.NCToken;
+import org.apache.nlpcraft.NCTokenEnricher;
import org.apache.nlpcraft.nlp.token.enricher.en.impl.NCEnQuotesImpl;
import java.util.List;
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnSwearWordsTokenEnricher.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnStopWordsTokenEnricher.java
similarity index 71%
copy from
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnSwearWordsTokenEnricher.java
copy to
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnStopWordsTokenEnricher.java
index aca1786..e431a6a 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnSwearWordsTokenEnricher.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnStopWordsTokenEnricher.java
@@ -17,27 +17,26 @@
package org.apache.nlpcraft.nlp.token.enricher.en;
-import org.apache.nlpcraft.*;
-import org.apache.nlpcraft.nlp.token.enricher.en.impl.NCEnSwearWordsImpl;
+import org.apache.nlpcraft.NCModelConfig;
+import org.apache.nlpcraft.NCRequest;
+import org.apache.nlpcraft.NCToken;
+import org.apache.nlpcraft.NCTokenEnricher;
+import org.apache.nlpcraft.nlp.token.enricher.en.impl.NCEnStopWordsImpl;
import java.util.List;
-import java.util.Objects;
+import java.util.Set;
/**
* TODO: enriches with <code>dict:en</code> property.
*/
-public class NCEnSwearWordsTokenEnricher implements NCTokenEnricher {
- private final NCEnSwearWordsImpl impl;
+public class NCEnStopWordsTokenEnricher implements NCTokenEnricher {
+ private final NCEnStopWordsImpl impl;
/**
- * TODO: swear_words.txt
- *
- * @param mdlSrc
+ *
*/
- public NCEnSwearWordsTokenEnricher(String mdlSrc) {
- Objects.requireNonNull(mdlSrc, "Swear words model file cannot be
null.");
-
- impl = new NCEnSwearWordsImpl(mdlSrc);
+ public NCEnStopWordsTokenEnricher(Set<String> addStems, Set<String>
exclStems) {
+ impl = new NCEnStopWordsImpl(addStems, exclStems);
}
@Override
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnSwearWordsTokenEnricher.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnSwearWordsTokenEnricher.java
index aca1786..2de4d0b 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnSwearWordsTokenEnricher.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnSwearWordsTokenEnricher.java
@@ -17,7 +17,10 @@
package org.apache.nlpcraft.nlp.token.enricher.en;
-import org.apache.nlpcraft.*;
+import org.apache.nlpcraft.NCModelConfig;
+import org.apache.nlpcraft.NCRequest;
+import org.apache.nlpcraft.NCToken;
+import org.apache.nlpcraft.NCTokenEnricher;
import org.apache.nlpcraft.nlp.token.enricher.en.impl.NCEnSwearWordsImpl;
import java.util.List;
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/impl/NCEnBracketsImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/impl/NCEnBracketsImpl.scala
index 6b34033..80f6e62 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/impl/NCEnBracketsImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/impl/NCEnBracketsImpl.scala
@@ -46,5 +46,4 @@ class NCEnBracketsImpl extends NCTokenEnricher with
LazyLogging:
case _ => mark(t)
if ok && stack.isEmpty then map.foreach { (tok, b) =>
tok.put("brackets:en", b) }
- else
- logger.trace(s"Invalid brackets: ${req.getText}")
\ No newline at end of file
+ else logger.trace(s"Invalid brackets: ${req.getText}")
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/impl/NCEnStopWordGenerator.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/impl/NCEnStopWordGenerator.scala
similarity index 99%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/impl/NCEnStopWordGenerator.scala
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/impl/NCEnStopWordGenerator.scala
index 71cdd28..4b3ac8a 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/impl/NCEnStopWordGenerator.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/impl/NCEnStopWordGenerator.scala
@@ -1,4 +1,4 @@
-package org.apache.nlpcraft.nlp.token.parser.opennlp.en.impl
+package org.apache.nlpcraft.nlp.token.enricher.en.impl
import opennlp.tools.stemmer.PorterStemmer
import org.apache.nlpcraft.internal.util.NCUtils
@@ -9,6 +9,7 @@ import scala.collection.mutable
* Generates first word sequences.
*/
object NCEnStopWordGenerator:
+ // TODO: ?
private final lazy val stemmer = new PorterStemmer
// Output files.
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/impl/NCEnStopWordsFinder.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/impl/NCEnStopWordsImpl.scala
similarity index 77%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/impl/NCEnStopWordsFinder.scala
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/impl/NCEnStopWordsImpl.scala
index e7806c2..248fac8 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/impl/NCEnStopWordsFinder.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/impl/NCEnStopWordsImpl.scala
@@ -15,24 +15,21 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.nlp.token.parser.opennlp.en.impl
+package org.apache.nlpcraft.nlp.token.enricher.en.impl
import com.typesafe.scalalogging.LazyLogging
-import opennlp.tools.stemmer.PorterStemmer
import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.util.NCUtils
+import java.io.*
import java.util
import java.util.{List as JList, Set as JSet}
import scala.annotation.tailrec
-import scala.collection.{Seq, mutable}
+import scala.collection.{IndexedSeq, Seq, mutable}
import scala.concurrent.ExecutionContext
-import scala.jdk.CollectionConverters.SetHasAsScala
+import scala.jdk.CollectionConverters.*
-/**
- *
- */
-private[impl] object NCEnStopWordsFinder:
+object NCEnStopWordsImpl:
// Condition types.
type Wildcard = (String, String)
type Word = String
@@ -102,7 +99,7 @@ private[impl] object NCEnStopWordsFinder:
posOpt match
case Some(pos) =>
!excludes.getOrElse(pos, Set.empty).contains(s) &&
- (any.contains(s) || includes.getOrElse(pos,
Set.empty).contains(s))
+ (any.contains(s) || includes.getOrElse(pos,
Set.empty).contains(s))
case _ => any.contains(s)
/**
@@ -168,11 +165,73 @@ private[impl] object NCEnStopWordsFinder:
// Hash access.
stems.matches(toStemKey(toks), posOpt) ||
- lemmas.matches(toLemmaKey(toks), posOpt) ||
- origins.matches(toOriginalKey(toks), posOpt) ||
- // Scan access.
- wildcardsLemmas.matches(toLemmaKey(toks), posOpt) ||
- wildcardsOrigins.matches(toOriginalKey(toks), posOpt)
+ lemmas.matches(toLemmaKey(toks), posOpt) ||
+ origins.matches(toOriginalKey(toks), posOpt) ||
+ // Scan access.
+ wildcardsLemmas.matches(toLemmaKey(toks), posOpt) ||
+ wildcardsOrigins.matches(toOriginalKey(toks), posOpt)
+
+ /**
+ * Gets all sequential permutations of tokens in this NLP sentence.
+ * This method is like a 'tokenMix', but with all combinations of
stop-words (with and without)
+ *
+ * @param tokens Tokens.
+ * @param maxLen Maximum number of tokens in the sequence.
+ */
+ private def tokenMixWithStopWords(tokens: Seq[NCToken], maxLen: Int =
Integer.MAX_VALUE): Seq[Seq[NCToken]] =
+ /**
+ * Gets all combinations for sequence of mandatory tokens with
stop-words and without.
+ *
+ * Example:
+ * A (stop), B, C(stop) -> [A, B, C]; [A, B]; [B, C]; [B]
+ * A, B(stop), C(stop) -> [A, B, C]; [A, B]; [A, C]; [A].
+ *
+ * @param toks Tokens.
+ */
+ def permutations(toks: Seq[NCToken]): Seq[Seq[NCToken]] =
+ def multiple(seq: Seq[Seq[Option[NCToken]]], t: NCToken):
Seq[Seq[Option[NCToken]]] =
+ if seq.isEmpty then
+ if isStopWord(t) then IndexedSeq(IndexedSeq(Some(t)),
IndexedSeq(None)) else IndexedSeq(IndexedSeq(Some(t)))
+ else
+ (for (subSeq <- seq) yield subSeq :+ Some(t)) ++ (if
isStopWord(t) then for (subSeq <- seq) yield subSeq :+ None else Seq.empty)
+
+ var res: Seq[Seq[Option[NCToken]]] = Seq.empty
+ for (t <- toks) res = multiple(res, t)
+ res.map(_.flatten).filter(_.nonEmpty)
+
+ tokenMix(tokens, stopWords = true, maxLen).
+ flatMap(permutations).
+ filter(_.nonEmpty).
+ distinct.
+ sortBy(seq => (-seq.length, seq.head.getIndex))
+
+ /**
+ * Gets all sequential permutations of tokens in this NLP sentence.
+ *
+ * For example, if NLP sentence contains "a, b, c, d" tokens, then
+ * this function will return the sequence of following token sequences in
this order:
+ * "a b c d"
+ * "a b c"
+ * "b c d"
+ * "a b"
+ * "b c"
+ * "c d"
+ * "a"
+ * "b"
+ * "c"
+ * "d"
+ *
+ * NOTE: this method will not return any permutations with a quoted token.
+ *
+ * @param tokens Tokens.
+ * @param stopWords Whether or not to include tokens marked as stop words.
+ * @param maxLen Maximum number of tokens in the sequence.
+ */
+ private def tokenMix(tokens: Seq[NCToken], stopWords: Boolean = false,
maxLen: Int = Integer.MAX_VALUE): Seq[Seq[NCToken]] =
+ val toks = tokens.filter(t => stopWords || (!stopWords &&
!isStopWord(t)))
+
+ (for (n <- toks.length until 0 by -1 if n <= maxLen) yield
toks.sliding(n)).flatten
+
private def isQuote(t: NCToken): Boolean = Q_POS.contains(t.getPos)
private def toStemKey(toks: Seq[NCToken]): String =
toks.map(_.getStem).mkString(" ")
@@ -180,62 +239,72 @@ private[impl] object NCEnStopWordsFinder:
private def toValueKey(toks: Seq[NCToken]): String =
toks.map(_.getText.toLowerCase).mkString(" ")
private def toOriginalKey(toks: Seq[NCToken]): String =
toks.map(_.getText).mkString(" ")
-/**
- *
- * @param addStems
- * @param exclStems
- */
-private[impl] class NCEnStopWordsFinder(addStems: Set[String], exclStems:
Set[String]) extends LazyLogging:
- import NCEnStopWordsFinder.*
-
- require(addStems != null)
- require(exclStems != null)
-
- private val stemmer = new PorterStemmer
-
- private val percents = Set(
- "%",
- "pct",
- "pc",
- "percentage",
- "proportion",
- "interest",
- "rate",
- "percent"
- ).map(stemmer.stem)
+ // TODO: error?
+ private def isStopWord(t: NCToken): Boolean = t.get[Boolean]("stopword")
+import org.apache.nlpcraft.nlp.token.enricher.en.impl.NCEnStopWordsImpl.*
+
+class NCEnStopWordsImpl(addStemsSet: util.Set[String], exclStemsSet:
util.Set[String]) extends NCTokenEnricher with LazyLogging:
+ private val addStems: Set[String] = if addStemsSet == null then Set.empty
else addStemsSet.asScala.toSet
+ private val exclStems: Set[String] = if exclStemsSet == null then
Set.empty else exclStemsSet.asScala.toSet
+
+ @volatile private var percents: Set[String] = _
@volatile private var firstWords: Set[String] = _
@volatile private var nounWords: Set[String] = _
+ @volatile private var stopWords: StopWordHolder = _
+ @volatile private var exceptions: StopWordHolder = _
- // Stemmatization is done already by generator.
- NCUtils.execPar(
- () => firstWords = read("stopwords/first_words.txt.gz"),
- () => nounWords = read("stopwords/noun_words.txt.gz")
- )(ExecutionContext.Implicits.global)
+ private def read(path: String): Set[String] =
NCUtils.readTextGzipResource(path, "UTF-8", logger).toSet
- // Case sensitive.
- private val (stopWords, exceptions) =
+ override def start(cfg: NCModelConfig): Unit =
+ percents = Set(
+ "%",
+ "pct",
+ "pc",
+ "percentage",
+ "proportion",
+ "interest",
+ "rate",
+ "percent"
+ ).map(cfg.getTokenParser.getStem)
+
+ // Stemming is done already by the generator. TODO:
+ NCUtils.execPar(
+ () => firstWords = read("stopwords/first_words.txt.gz"),
+ () => nounWords = read("stopwords/noun_words.txt.gz")
+ )(ExecutionContext.Implicits.global)
+
+ // Case sensitive.
val m =
readStopWords(
+ cfg.getTokenParser,
NCUtils.readResource("stopwords/stop_words.txt", "UTF-8",
logger).
map(_.strip).filter(s => s.nonEmpty && !s.startsWith("#"))
)
- (m(false), m(true))
- private def read(path: String): Set[String] =
NCUtils.readTextGzipResource(path, "UTF-8", logger).toSet
+ stopWords = m(false)
+ exceptions = m(true)
+
+ override def stop(): Unit =
+ percents = null
+ firstWords = null
+ nounWords = null
+ stopWords = null
+ exceptions = null
/**
* Parses configuration template.
*
+ * @param p Token parser.
* @param lines Configuration file content.
* @return Holder and `is-exception` flag.
*/
- private def readStopWords(lines: Seq[String]): Map[Boolean,
StopWordHolder] =
+ private def readStopWords(p: NCTokenParser, lines: Seq[String]):
Map[Boolean, StopWordHolder] =
// 1. Prepares accumulation data structure.
enum WordForm:
case STEM, LEM, ORIG
- import WordForm.*
+ import WordForm._
class Condition[T]:
val any = mutable.HashSet.empty[T]
@@ -252,8 +321,8 @@ private[impl] class NCEnStopWordsFinder(addStems:
Set[String], exclStems: Set[St
case Some(set) => set.add(cond)
case _ =>
val set = mutable.HashSet.empty[T]
- set += cond
- m += pos -> set
+ set += cond
+ m += pos -> set
)
add(includes, incl = true)
@@ -267,7 +336,7 @@ private[impl] class NCEnStopWordsFinder(addStems:
Set[String], exclStems: Set[St
m += tuple._1 -> tuple._2
WordForm.values.foreach(f =>
add(f, mkT, isExc = true)
- add(f, mkT, isExc = false)
+ add(f, mkT, isExc = false)
)
m.toMap
@@ -327,7 +396,7 @@ private[impl] class NCEnStopWordsFinder(addStems:
Set[String], exclStems: Set[St
val (word, form) =
if isCase then (s, ORIG)
else
- if !hasPoses then (stemmer.stem(s), STEM) else
(stemmer.stem(s), LEM)
+ if !hasPoses then (p.getStem(s), STEM) else
(p.getStem(s), LEM)
mHash((isExc, form)).addCondition(word, poses)
else
val b = s.take(idxWild)
@@ -349,13 +418,13 @@ private[impl] class NCEnStopWordsFinder(addStems:
Set[String], exclStems: Set[St
val incl = toImmutable(m((isExc, form)).includes)
val excl = toImmutable(m((isExc, form)).excludes)
- mkInstance(any ++ excl.values.flatten, incl, excl)
+ mkInstance(any ++ excl.values.flatten, incl, excl)
end mkHolder
def mkHash(form: WordForm): HashHolder = mkHolder(mHash, form,
HashHolder.apply)
def mkScan(form: WordForm):
ScanHolder = mkHolder(mScan, form, ScanHolder.apply)
- isExc -> StopWordHolder(mkHash(STEM), mkHash(LEM), mkHash(ORIG),
mkScan(LEM), mkScan(ORIG))
+ isExc -> StopWordHolder(mkHash(STEM), mkHash(LEM),
mkHash(ORIG), mkScan(LEM), mkScan(ORIG))
).toMap
private def isVerb(pos: String): Boolean = pos.head == 'V'
@@ -379,8 +448,8 @@ private[impl] class NCEnStopWordsFinder(addStems:
Set[String], exclStems: Set[St
): Boolean =
var stop = true
- for ((tok, idx) <- ns.zipWithIndex if idx != lastIdx &&
!tok.isStopWord && !isException(Seq(tok)) &&
- stopPoses.contains(tok.getPos) && ns(idx + 1).isStopWord)
+ for ((tok, idx) <- ns.zipWithIndex if idx != lastIdx &&
!isStopWord(tok) && !isException(Seq(tok)) &&
+ stopPoses.contains(tok.getPos) && isStopWord(ns(idx + 1)))
stops += tok
stop = false
@@ -413,8 +482,8 @@ private[impl] class NCEnStopWordsFinder(addStems:
Set[String], exclStems: Set[St
val max = ns.size - 1
var stop = true
- for ((tok, idx) <- ns.zipWithIndex if idx != max &&
!tok.isStopWord && !exclStems.contains(tok.getStem) &&
- POSES.contains(tok.getPos) && ns(idx + 1).isStopWord)
+ for ((tok, idx) <- ns.zipWithIndex if idx != max &&
!isStopWord(tok) && !exclStems.contains(tok.getStem) &&
+ POSES.contains(tok.getPos) && isStopWord(ns(idx + 1)))
stops += tok
stop = false
@@ -422,11 +491,9 @@ private[impl] class NCEnStopWordsFinder(addStems:
Set[String], exclStems: Set[St
processCommonStops0(ns)
- /**
- *
- * @param toks
- */
- def find(toks: Seq[NCToken]): Seq[NCToken] =
+ override def enrich(req: NCRequest, cfg: NCModelConfig, toksList:
JList[NCToken]): Unit =
+ val toks = toksList.asScala
+
// Stop words and exceptions caches for this sentence.
val cacheSw = mutable.HashMap.empty[Seq[NCToken], Boolean]
val cacheEx = mutable.HashMap.empty[Seq[NCToken], Boolean]
@@ -462,7 +529,7 @@ private[impl] class NCEnStopWordsFinder(addStems:
Set[String], exclStems: Set[St
!isFirst && prev().getPos == "CD" &&
// 3. It's last word or any words after except numbers.
(isLast || next().getPos != "CD")
- ) ||
+ ) ||
// be, was, is etc. or has been etc.
isCommonVerbs("have", "be") ||
// be, was, is etc. or have done etc.
@@ -474,7 +541,7 @@ private[impl] class NCEnStopWordsFinder(addStems:
Set[String], exclStems: Set[St
// | Find all words from predefined list. |
// +--------------------------------------+
val buf = mutable.Buffer.empty[Seq[NCToken]]
- val mix = NCUtils.tokenMixWithStopWords(toks)
+ val mix = tokenMixWithStopWords(toks)
for (toks <- mix if !buf.exists(_.containsSlice(toks)) && isStop(toks)
&& !isException(toks))
toks.foreach(tok => stops += tok)
@@ -492,7 +559,7 @@ private[impl] class NCEnStopWordsFinder(addStems:
Set[String], exclStems: Set[St
val foundKeys = new mutable.HashSet[String]()
// All sentence first stop words + first non stop word.
- val startToks = toks.takeWhile(_.isStopWord) ++
toks.find(!_.isStopWord).map(p => p)
+ val startToks = toks.takeWhile(isStopWord) ++ toks.find(p =>
!isStopWord(p)).map(p => p)
for (startTok <- startToks; tup <- origToks.filter(_._1.head ==
startTok); key = tup._2 if firstWords.contains(key) && !isException(tup._1))
tup._1.foreach(tok => stops += tok)
foundKeys += key
@@ -574,4 +641,5 @@ private[impl] class NCEnStopWordsFinder(addStems:
Set[String], exclStems: Set[St
if ok && stack.isEmpty then
stops --= stops.intersect(set)
- stops.toSeq.sortBy(_.getStartCharIndex)
\ No newline at end of file
+ // TODO: name is important and language independent.
+ toks.foreach(t => t.put("stopword", stops.contains(t)))
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/impl/NCEnSwearWordsImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/impl/NCEnSwearWordsImpl.scala
index ea11dc0..c7e2534 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/impl/NCEnSwearWordsImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/impl/NCEnSwearWordsImpl.scala
@@ -18,7 +18,6 @@
package org.apache.nlpcraft.nlp.token.enricher.en.impl
import com.typesafe.scalalogging.LazyLogging
-import opennlp.tools.stemmer.PorterStemmer
import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.util.NCUtils
@@ -28,8 +27,7 @@ class NCEnSwearWordsImpl(res: String) extends NCTokenEnricher
with LazyLogging:
@volatile private var swearWords: Set[String] = _
override def start(cfg: NCModelConfig): Unit =
- val stemmer = new PorterStemmer
- swearWords = NCUtils.readTextStream(NCUtils.getStream(res),
"UTF-8").map(stemmer.stem).toSet
+ swearWords = NCUtils.readTextStream(NCUtils.getStream(res),
"UTF-8").map(cfg.getTokenParser.getStem).toSet
logger.trace(s"Loaded resource: $res")
override def stop(): Unit = swearWords = null
override def enrich(req: NCRequest, cfg: NCModelConfig, toks:
java.util.List[NCToken]): Unit =
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/NCEnOpenNlpTokenParser.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/NCEnOpenNlpTokenParser.java
index cb0a8b4..2c59def 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/NCEnOpenNlpTokenParser.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/NCEnOpenNlpTokenParser.java
@@ -17,11 +17,13 @@
package org.apache.nlpcraft.nlp.token.parser.opennlp.en;
-import org.apache.nlpcraft.*;
+import org.apache.nlpcraft.NCException;
+import org.apache.nlpcraft.NCModelConfig;
+import org.apache.nlpcraft.NCTokenParser;
import org.apache.nlpcraft.nlp.token.parser.opennlp.en.impl.NCEnOpenNlpImpl;
-import java.io.*;
-import java.util.*;
+import java.util.List;
+import java.util.Objects;
/*
* Models can be downloaded from the following resources:
@@ -53,52 +55,36 @@ public class NCEnOpenNlpTokenParser implements
NCTokenParser {
* @param lemmaDicSrc Local filesystem path, resources file path or URL
for OpenNLP lemmatizer dictionary.
* @throws NCException
*/
- public NCEnOpenNlpTokenParser(String posMdlSrc, String lemmaDicSrc) {
+ public NCEnOpenNlpTokenParser(String tokMdlSrc, String posMdlSrc, String
lemmaDicSrc) {
+ Objects.requireNonNull(tokMdlSrc, "Tokenizer model path cannot be
null.");
Objects.requireNonNull(posMdlSrc, "POS model path cannot be null.");
Objects.requireNonNull(lemmaDicSrc, "Lemmatizer model path cannot be
null.");
try {
- impl = new NCEnOpenNlpImpl(posMdlSrc, lemmaDicSrc);
+ impl = new NCEnOpenNlpImpl(tokMdlSrc, posMdlSrc, lemmaDicSrc);
}
catch (Exception e) {
throw new NCException("Failed to create OpenNLP token parser.", e);
}
}
- /**
- *
- * @return
- */
- public Set<String> getAdditionalStopWords() {
- return impl.getAdditionalStopWords();
- }
-
- /**
- *
- * @param addStopWords
- */
- public void setAdditionalStopWords(Set<String> addStopWords) {
- impl.setAdditionalStopWords(addStopWords);
+ @Override
+ public List<String> tokenize(String text) {
+ return impl.tokenize(text);
}
- /**
- *
- * @return
- */
- public Set<String> getExcludedStopWords() {
- return impl.getExcludedStopWords();
+ @Override
+ public String getStem(String s) {
+ return impl.getStem(s);
}
- /**
- *
- * @param exclStopWords
- */
- public void setExcludedStopWords(Set<String> exclStopWords) {
- impl.setExcludedStopWords(exclStopWords);
+ @Override
+ public List<String> getPoses(List<String> toks) {
+ return impl.getPoses(toks);
}
@Override
- public List<NCToken> parse(NCRequest req, NCModelConfig cfg) {
- return impl.parse(req, cfg);
+ public List<String> getLemmas(List<String> toks, List<String> poses) {
+ return impl.getLemmas(toks, poses);
}
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/impl/NCEnOpenNlpImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/impl/NCEnOpenNlpImpl.scala
index 1496de2..f7714a3 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/impl/NCEnOpenNlpImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/impl/NCEnOpenNlpImpl.scala
@@ -17,6 +17,7 @@
package org.apache.nlpcraft.nlp.token.parser.opennlp.en.impl
+import com.typesafe.scalalogging.LazyLogging
import opennlp.tools.lemmatizer.*
import opennlp.tools.postag.*
import opennlp.tools.stemmer.*
@@ -37,115 +38,76 @@ import scala.jdk.CollectionConverters.*
* @param posMdlSrc
* @param lemmaDicSrc
*/
-class NCEnOpenNlpImpl(posMdlSrc: String, lemmaDicSrc: String) extends
NCTokenParser :
- private val stemmer = new PorterStemmer
-
+class NCEnOpenNlpImpl(tokMdl: String, posMdlSrc: String, lemmaDicSrc: String)
extends NCTokenParser with LazyLogging:
+ @volatile private var stemmer: PorterStemmer = _
@volatile var tagger: POSTaggerME = _
@volatile var lemmatizer: DictionaryLemmatizer = _
- @volatile var swFinder: NCEnStopWordsFinder = _
-
- private var addStopWords: JSet[String] = _
- private var exclStopWords: JSet[String] = _
+ @volatile var tokenizer: TokenizerME = _
override def start(cfg: NCModelConfig): Unit =
NCUtils.execPar(
- () => tagger = new POSTaggerME(new
POSModel(NCUtils.getStream(posMdlSrc))),
- () => lemmatizer = new
DictionaryLemmatizer(NCUtils.getStream(lemmaDicSrc)),
- () => swFinder = new NCEnStopWordsFinder(stem(addStopWords),
stem(exclStopWords))
+ () => stemmer = new PorterStemmer,
+ () =>
+ tagger = new POSTaggerME(new
POSModel(NCUtils.getStream(posMdlSrc)))
+ logger.trace(s"Loaded resource: $posMdlSrc")
+ ,
+ () =>
+ lemmatizer = new
DictionaryLemmatizer(NCUtils.getStream(lemmaDicSrc))
+ logger.trace(s"Loaded resource: $lemmaDicSrc")
+ ,
+ () =>
+ tokenizer = new TokenizerME(new
TokenizerModel(NCUtils.getStream(tokMdl)))
+ logger.trace(s"Loaded resource: $tokMdl")
+
)(ExecutionContext.Implicits.global)
override def stop(): Unit =
- swFinder = null
lemmatizer = null
tagger = null
+ tokenizer = null
+ stemmer = null
/**
*
- * @param addStopWords
+ * @param set
*/
- def setAdditionalStopWords(addStopWords: JSet[String]): Unit =
this.addStopWords = addStopWords
+ private def stem(set: JSet[String]): Set[String] =
+ if set == null then Set.empty else set.asScala.toSet.map(stemmer.stem)
- /**
- *
- * @return
- */
- def getAdditionalStopWords: JSet[String] = addStopWords
+ override def tokenize(text: String): JList[String] =
+ this.synchronized { tokenizer.tokenizePos(text)
}.map(_.getCoveredText(text).toString).toSeq.asJava
- /**
- *
- * @param exclStopWords
- */
- def setExcludedStopWords(exclStopWords: JSet[String]): Unit =
this.exclStopWords = exclStopWords
+ override def getStem(s: String): String = this.synchronized {
stemmer.stem(s) }
- /**
- *
- * @return
- */
- def getExcludedStopWords: JSet[String] = exclStopWords
+ override def getPoses(toksList: JList[String]): JList[String] =
+ val toks = toksList.asScala.toArray
- /**
- *
- * @param set
- */
- private def stem(set: JSet[String]): Set[String] =
- if set == null then Set.empty else set.asScala.toSet.map(stemmer.stem)
+ this.synchronized { tagger.tag(toks) }.toSeq.asJava
- override def parse(req: NCRequest, cfg: NCModelConfig): JList[NCToken] =
- // OpenNLP classes are not thread-safe.
- this.synchronized {
- val words = req.getWords.asScala
- val wordsTxts = words.map(_.getText).toArray
- val posTags = tagger.tag(wordsTxts)
- var lemmas = lemmatizer.lemmatize(wordsTxts, posTags).toSeq
-
- require(words.length == posTags.length)
-
- // For some reasons lemmatizer (en-lemmatizer.dict) marks some
words with non-existent POS 'NNN'
- // Valid POS list:
https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
- val suspIdxs = lemmas.zip(posTags).zipWithIndex.flatMap {
- // "0" is flag that lemma cannot be obtained for some reasons.
- case ((lemma, pos), i) => Option.when(lemma == "O" && pos ==
"NN")(i)
- }
+ override def getLemmas(toksList: JList[String], posesList: JList[String]):
JList[String] =
+ require(toksList.size() == posesList.size())
+
+ val toks = toksList.asScala
+ val poses = posesList.asScala
+
+ var lemmas = this.synchronized { lemmatizer.lemmatize(toks.toArray,
poses.toArray).toSeq }
+
+ // For some reasons lemmatizer (en-lemmatizer.dict) marks some words
with non-existent POS 'NNN'
+ // Valid POS list:
https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
+ val suspIdxs = lemmas.zip(poses).zipWithIndex.flatMap {
+ // "0" is flag that lemma cannot be obtained for some reasons.
+ case ((lemma, pos), i) => Option.when(lemma == "O" && pos ==
"NN")(i)
+ }
- if suspIdxs.nonEmpty then
- val fixes: Map[Int, String] = lemmatizer.
- lemmatize(suspIdxs.map(i => wordsTxts(i)).toArray,
suspIdxs.map(_ => "NNN").toArray).
- zipWithIndex.
- flatMap {
- (lemma, i) => Option.when(lemma != "0")(suspIdxs(i) ->
lemma)
- }.toMap
- lemmas = lemmas.zipWithIndex.map {
- (lemma, idx) => fixes.getOrElse(idx, lemma)
- }
-
- val res: Seq[NCToken] =
words.zip(posTags).zip(lemmas).toIndexedSeq.zipWithIndex.map { case (((w, pos),
lemma), idx) =>
- new NCPropertyMapAdapter with NCToken:
- override def getText: String = w.getText
- override def getLemma: String = lemma
- override def getStem: String =
stemmer.stem(w.getText.toLowerCase)
- override def getPos: String = pos
- override def isStopWord: Boolean = false
- override def getStartCharIndex: Int = w.getStartCharIndex
- override def getEndCharIndex: Int = w.getEndCharIndex
- override def getLength: Int = w.getLength
- override def getIndex: Int = w.getIndex
+ if suspIdxs.nonEmpty then
+ val fixes: Map[Int, String] = lemmatizer.
+ lemmatize(suspIdxs.map(i => toks(i)).toArray, suspIdxs.map(_
=> "NNN").toArray).
+ zipWithIndex.
+ flatMap {
+ (lemma, i) => Option.when(lemma != "0")(suspIdxs(i) ->
lemma)
+ }.toMap
+ lemmas = lemmas.zipWithIndex.map {
+ (lemma, idx) => fixes.getOrElse(idx, lemma)
}
- val stops = swFinder.find(res)
-
- res.map(tok =>
- if stops.contains(tok) then
- new NCPropertyMapAdapter with NCToken:
- override def getText: String = tok.getText
- override def getLemma: String = tok.getLemma
- override def getStem: String = tok.getStem
- override def getPos: String = tok.getPos
- override def isStopWord: Boolean = true
- override def getStartCharIndex: Int =
tok.getStartCharIndex
- override def getEndCharIndex: Int = tok.getEndCharIndex
- override def getLength: Int = tok.getLength
- override def getIndex: Int = tok.getIndex
- else
- tok
- ).asJava
- }
\ No newline at end of file
+ lemmas.asJava
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/opennlp/NCOpenNlpTokenizer.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/opennlp/NCOpenNlpTokenizer.java
deleted file mode 100644
index bf63f0d..0000000
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/opennlp/NCOpenNlpTokenizer.java
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nlpcraft.nlp.tokenizer.opennlp;
-
-import org.apache.nlpcraft.NCException;
-import org.apache.nlpcraft.NCModelConfig;
-import org.apache.nlpcraft.NCTokenizer;
-import org.apache.nlpcraft.NCWord;
-import org.apache.nlpcraft.nlp.tokenizer.opennlp.impl.NCOpenNlpTokenizerImpl;
-
-import java.util.List;
-import java.util.Objects;
-
-/**
- *
- */
-public class NCOpenNlpTokenizer implements NCTokenizer {
- private final NCOpenNlpTokenizerImpl impl;
-
- /**
- *
- * @param tokMdl
- */
- public NCOpenNlpTokenizer(String tokMdl) {
- Objects.requireNonNull(tokMdl, "Tokenizer model source cannot be
null.");
-
- try {
- impl = new NCOpenNlpTokenizerImpl(tokMdl);
- }
- catch (Exception e) {
- throw new NCException("Failed to create OpenNLP tokenizer from: "
+ tokMdl, e);
- }
- }
-
- @Override
- public List<NCWord> tokenize(NCModelConfig cfg, String txt) {
- return impl.tokenize(cfg, txt);
- }
-
- @Override
- public void start(NCModelConfig cfg) {
- impl.start(cfg);
- }
-
- @Override
- public void stop() {
- impl.stop();
- }
-}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/opennlp/impl/NCOpenNlpTokenizerImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/opennlp/impl/NCOpenNlpTokenizerImpl.scala
deleted file mode 100644
index 49ac329..0000000
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/opennlp/impl/NCOpenNlpTokenizerImpl.scala
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nlpcraft.nlp.tokenizer.opennlp.impl
-
-import opennlp.tools.tokenize.*
-import org.apache.nlpcraft.*
-import org.apache.nlpcraft.internal.util.NCUtils
-
-import java.io.*
-import java.util
-import scala.jdk.CollectionConverters.*
-
-/**
- *
- * @param src
- */
-class NCOpenNlpTokenizerImpl(src: String) extends NCTokenizer:
- @volatile var tokenizer: TokenizerME = _
-
- override def start(cfg: NCModelConfig): Unit = tokenizer = new
TokenizerME(new TokenizerModel(NCUtils.getStream(src)))
- override def stop(): Unit = tokenizer = null
- override def tokenize(cfg: NCModelConfig, txt: String): util.List[NCWord] =
- this.synchronized { tokenizer.tokenizePos(txt) }.zipWithIndex.map {
(span, idx) =>
- new NCWord:
- override def getText: String =
span.getCoveredText(txt).toString
- override def getStartCharIndex: Int = span.getStart
- override def getEndCharIndex: Int = span.getEnd
- override def getLength: Int = span.length()
- override def getIndex: Int = idx
- }.toSeq.asJava
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/benchmark/NCBenchmarkAdapter.java
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/benchmark/NCBenchmarkAdapter.java
index f5096e5..c4d3ea5 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/benchmark/NCBenchmarkAdapter.java
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/benchmark/NCBenchmarkAdapter.java
@@ -36,7 +36,6 @@ import java.util.concurrent.TimeUnit;
@Fork(value = 1, jvmArgs = {"-Xms2G", "-Xmx2G"})
@Warmup(iterations = 5, time = 10)
@Measurement(iterations = 5, time = 5)
-@Disabled
public class NCBenchmarkAdapter {
@State(Scope.Thread)
public static class NCBenchmarkAdapterState {
@@ -50,7 +49,8 @@ public class NCBenchmarkAdapter {
* @param args
* @throws RunnerException
*/
- @Test
+ // @Test
+ // TODO:
public void benchmark() throws RunnerException {
new Runner(new
OptionsBuilder().include(this.getClass().getSimpleName()).build()).run();
}
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/benchmark/token/parser/opennlp/NCEnOpenNlpTokenParserBenchmark.java
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/benchmark/token/parser/opennlp/NCEnOpenNlpTokenParserBenchmark.java
index f345dde..c30f536 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/benchmark/token/parser/opennlp/NCEnOpenNlpTokenParserBenchmark.java
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/benchmark/token/parser/opennlp/NCEnOpenNlpTokenParserBenchmark.java
@@ -44,7 +44,7 @@ public class NCEnOpenNlpTokenParserBenchmark extends
NCBenchmarkAdapter {
@Benchmark
public void testParse(Blackhole bh, NCBenchmarkAdapterState state) {
- bh.consume(parser.parse(state.request, null));
+ bh.consume(parser.tokenize(state.request.getText()));
}
/**
@@ -52,6 +52,7 @@ public class NCEnOpenNlpTokenParserBenchmark extends
NCBenchmarkAdapter {
*/
private static NCEnOpenNlpTokenParser prepareParser() {
NCEnOpenNlpTokenParser p = new NCEnOpenNlpTokenParser(
+ "opennlp/en-token.bin",
"opennlp/en-pos-maxent.bin",
"opennlp/en-lemmatizer.dict"
);
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/opennlp/NCOpenNlpEntityParserSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/opennlp/NCOpenNlpEntityParserSpec.scala
index a9cfb14..842ba4f 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/opennlp/NCOpenNlpEntityParserSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/opennlp/NCOpenNlpEntityParserSpec.scala
@@ -22,7 +22,6 @@ import org.apache.nlpcraft.internal.util.NCUtils
import org.apache.nlpcraft.nlp.entity.parser.opennlp.NCOpenNlpEntityParser
import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
import org.apache.nlpcraft.nlp.util.*
-import org.apache.nlpcraft.nlp.util.NCTestConfig.*
import org.junit.jupiter.api.*
import java.util
@@ -53,8 +52,8 @@ class NCOpenNlpEntityParserSpec:
private def checkSingleEntity(txt: String, expected: String): Unit =
val req = NCTestRequest(txt)
- val toks = EN_PARSER.parse(req, EN_MDL_CFG)
- val resSeq = parsers.map(_.parse(req, EN_MDL_CFG,
toks).asScala.toSeq).filter(_.size == 1)
+ val toks = NCTestUtils.mkTokens(NCTestConfig.EN.getTokenParser,
req.txt)
+ val resSeq = parsers.map(_.parse(req, NCTestConfig.EN,
toks).asScala.toSeq).filter(_.size == 1)
require(resSeq.size == 1)
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserJsonSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserJsonSpec.scala
index 77a002b..e4c6323 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserJsonSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserJsonSpec.scala
@@ -20,10 +20,8 @@ package org.apache.nlpcraft.nlp.entity.parser.semantic
import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.util.NCUtils
import org.apache.nlpcraft.nlp.entity.parser.opennlp.NCOpenNlpEntityParser
-import org.apache.nlpcraft.nlp.entity.parser.semantic.impl.en.*
import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
import org.apache.nlpcraft.nlp.util.*
-import org.apache.nlpcraft.nlp.util.NCTestConfig.*
import org.junit.jupiter.api.*
import java.util
@@ -40,14 +38,14 @@ class NCSemanticEntityParserJsonSpec:
@BeforeEach
def start(): Unit =
- parser =
- NCTestUtils.makeAndStart(
- new NCSemanticEntityParser(new NCEnSemanticTextStemmer,
"models/alarm_model.json")
- )
+ parser = NCTestUtils.makeAndStart(new
NCSemanticEntityParser("models/alarm_model.json"))
private def checkSingleEntity(txt: String, expected: String): Unit =
val req = NCTestRequest(txt)
- val res = parser.parse(req, EN_MDL_CFG, EN_PARSER.parse(req,
EN_MDL_CFG)).asScala.toSeq
+ val res = parser.parse(
+ req,
+ NCTestConfig.EN,
NCTestUtils.mkTokens(NCTestConfig.EN.getTokenParser, req.txt)
+ ).asScala.toSeq
NCTestUtils.printEntities(txt, res)
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala
index 09fc3b4..2bdad65 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala
@@ -20,10 +20,8 @@ package org.apache.nlpcraft.nlp.entity.parser.semantic
import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.util.NCUtils
import org.apache.nlpcraft.nlp.entity.parser.opennlp.NCOpenNlpEntityParser
-import org.apache.nlpcraft.nlp.entity.parser.semantic.impl.en.*
import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
import org.apache.nlpcraft.nlp.util.*
-import org.apache.nlpcraft.nlp.util.NCTestConfig.*
import org.junit.jupiter.api.*
import java.util
@@ -56,16 +54,14 @@ class NCSemanticEntityParserSpec:
@BeforeEach
def start(): Unit =
parser =
- NCTestUtils.makeAndStart(
- new NCSemanticEntityParser(
- new NCEnSemanticTextStemmer,
- Seq(Element("testId", synonyms = Seq("test"))).asJava
- )
- )
+ NCTestUtils.makeAndStart(new
NCSemanticEntityParser(Seq(Element("testId", synonyms = Seq("test"))).asJava))
private def checkSingleEntity(txt: String, expected: String): Unit =
val req = NCTestRequest(txt)
- val res = parser.parse(req, EN_MDL_CFG, EN_PARSER.parse(req,
EN_MDL_CFG)).asScala.toSeq
+ val res = parser.parse(
+ req,
+ NCTestConfig.EN,
NCTestUtils.mkTokens(NCTestConfig.EN.getTokenParser, req.txt)
+ ).asScala.toSeq
require(res.size == 1)
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserYamlSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserYamlSpec.scala
index 273f7d1..36d4960 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserYamlSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserYamlSpec.scala
@@ -20,10 +20,8 @@ package org.apache.nlpcraft.nlp.entity.parser.semantic
import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.util.NCUtils
import org.apache.nlpcraft.nlp.entity.parser.opennlp.NCOpenNlpEntityParser
-import org.apache.nlpcraft.nlp.entity.parser.semantic.impl.en.*
import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
import org.apache.nlpcraft.nlp.util.*
-import org.apache.nlpcraft.nlp.util.NCTestConfig.*
import org.junit.jupiter.api.*
import java.util
@@ -39,14 +37,14 @@ class NCSemanticEntityParserYamlSpec:
@BeforeEach
def start(): Unit =
- parser =
- NCTestUtils.makeAndStart(
- new NCSemanticEntityParser(new NCEnSemanticTextStemmer,
"models/lightswitch_model.yaml")
- )
+ parser = NCTestUtils.makeAndStart(new
NCSemanticEntityParser("models/lightswitch_model.yaml"))
private def checkSingleEntity(txt: String, expected: String): Unit =
val req = NCTestRequest(txt)
- val res = parser.parse(req, EN_MDL_CFG, EN_PARSER.parse(req,
EN_MDL_CFG)).asScala.toSeq
+ val res = parser.parse(
+ req,
+ NCTestConfig.EN,
NCTestUtils.mkTokens(NCTestConfig.EN.getTokenParser, req.txt)
+ ).asScala.toSeq
NCTestUtils.printEntities(txt, res)
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnBracketsTokenEnricherSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnBracketsTokenEnricherSpec.scala
index a4d07b1..dc63f06 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnBracketsTokenEnricherSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnBracketsTokenEnricherSpec.scala
@@ -21,7 +21,6 @@ import org.apache.nlpcraft.*
import org.apache.nlpcraft.nlp.token.enricher.en.NCEnBracketsTokenEnricher
import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
import org.apache.nlpcraft.nlp.util.*
-import org.apache.nlpcraft.nlp.util.NCTestConfig.*
import org.junit.jupiter.api.*
import scala.jdk.CollectionConverters.*
@@ -42,8 +41,8 @@ class NCEnBracketsTokenEnricherSpec:
* @param brackets
*/
private def check(txt: String, brackets: Set[Integer]): Unit =
- val toks = EN_PARSER.parse(NCTestRequest(txt), EN_MDL_CFG)
- enricher.enrich(NCTestRequest(txt), EN_MDL_CFG, toks)
+ val toks = NCTestUtils.mkTokens(NCTestConfig.EN.getTokenParser, txt)
+ enricher.enrich(NCTestRequest(txt), NCTestConfig.EN, toks)
val seq = toks.asScala.toSeq
NCTestUtils.printTokens(seq)
seq.foreach (tok =>
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnDictionaryTokenEnricherSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnDictionaryTokenEnricherSpec.scala
index bd03637..2b8a20a 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnDictionaryTokenEnricherSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnDictionaryTokenEnricherSpec.scala
@@ -20,7 +20,6 @@ package org.apache.nlpcraft.nlp.token.enricher.en
import org.apache.nlpcraft.nlp.token.enricher.en.NCEnDictionaryTokenEnricher
import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
import org.apache.nlpcraft.nlp.util.*
-import org.apache.nlpcraft.nlp.util.NCTestConfig.*
import org.junit.jupiter.api.*
import scala.jdk.CollectionConverters.SeqHasAsJava
@@ -44,7 +43,7 @@ class NCEnDictionaryTokenEnricherSpec:
require(toks.head.getOpt[Boolean]("dict:en").isEmpty)
require(toks.last.getOpt[Boolean]("dict:en").isEmpty)
- enricher.enrich(null, EN_MDL_CFG, toks.asJava)
+ enricher.enrich(null, NCTestConfig.EN, toks.asJava)
NCTestUtils.printTokens(toks)
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnQuotesTokenEnricherSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnQuotesTokenEnricherSpec.scala
index c8ca5cb..c694f9a 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnQuotesTokenEnricherSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnQuotesTokenEnricherSpec.scala
@@ -20,7 +20,6 @@ package org.apache.nlpcraft.nlp.token.enricher.en
import org.apache.nlpcraft.NCToken
import org.apache.nlpcraft.nlp.token.enricher.en.NCEnQuotesTokenEnricher
import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
-import org.apache.nlpcraft.nlp.util.NCTestConfig.*
import org.apache.nlpcraft.nlp.util.*
import org.junit.jupiter.api.*
@@ -42,9 +41,9 @@ class NCEnQuotesTokenEnricherSpec:
* @param quotes
*/
private def check(txt: String, quotes: Set[Integer]): Unit =
- val toks = EN_PARSER.parse(NCTestRequest(txt), EN_MDL_CFG)
+ val toks = NCTestUtils.mkTokens(NCTestConfig.EN.getTokenParser, txt)
val toksSeq = toks.asScala.toSeq
- enricher.enrich(NCTestRequest(txt), EN_MDL_CFG, toks)
+ enricher.enrich(NCTestRequest(txt), NCTestConfig.EN, toks)
NCTestUtils.printTokens(toksSeq)
toksSeq.foreach (tok =>
require(!(tok.get[Boolean]("quoted:en") ^
quotes.contains(tok.getIndex)))
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/parser/opennlp/en/NCEnOpenNlpTokenParserSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/parser/opennlp/en/NCEnOpenNlpTokenParserSpec.scala
index 2311889..d4dd0cc 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/parser/opennlp/en/NCEnOpenNlpTokenParserSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/parser/opennlp/en/NCEnOpenNlpTokenParserSpec.scala
@@ -19,9 +19,9 @@ package org.apache.nlpcraft.nlp.token.parser.opennlp.en
import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.ascii.NCAsciiTable
+import org.apache.nlpcraft.nlp.token.enricher.en.{NCEnBracketsTokenEnricher,
NCEnStopWordsTokenEnricher}
import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
import org.apache.nlpcraft.nlp.util.*
-import org.apache.nlpcraft.nlp.util.NCTestConfig.*
import org.junit.jupiter.api.*
import java.util
@@ -31,57 +31,70 @@ import scala.jdk.CollectionConverters.*
*
*/
class NCEnOpenNlpTokenParserSpec:
+ private var enricher: NCEnStopWordsTokenEnricher = _
+
+ private def isStopWord(t: NCToken): Boolean = t.get[Boolean]("stopword")
+
private def test(txt: String, validate: Seq[NCToken] => _): Unit =
- val toks = EN_PARSER.parse(nlp.util.NCTestRequest(txt),
EN_MDL_CFG).asScala.toSeq
+ val toksList = NCTestUtils.mkTokens(NCTestConfig.EN.getTokenParser,
txt)
+ enricher.enrich(NCTestRequest(txt), NCTestConfig.EN, toksList)
+
+ val toks = toksList.asScala.toSeq
+
assert(toks.nonEmpty)
NCTestUtils.printTokens(toks)
validate(toks)
+ @BeforeEach
+ def start(): Unit = enricher =
+ NCTestUtils.makeAndStart(new NCEnStopWordsTokenEnricher(null, null))
+
+
@Test
def test(): Unit =
test(
"Test requests!",
toks =>
require(toks.length == 3);
- require(!toks.head.isStopWord);
- require(toks.last.isStopWord)
+ require(!isStopWord(toks.head));
+ require(isStopWord(toks.last))
)
test(
"Test requests !",
toks =>
require(toks.length == 3);
- require(!toks.head.isStopWord);
- require(toks.last.isStopWord)
+ require(!isStopWord(toks.head));
+ require(isStopWord(toks.last))
)
test(
// First and last are stop words,
// Third and fourth are not because quoted.
- // Note that "A ` A A` A" parsed as 5 tokens ("A", "`", ""A, "A`",
"A") because OpenNLP tokenizer logic,
+ // Note that "a ` a a` a" parsed as 5 tokens ("a", "`", ""a, "a`",
"a") because OpenNLP tokenizer logic,
// So we use spaces around quotes to simplify test.
- "A ` A A ` A",
+ "a ` a a ` a",
toks =>
require(toks.length == 6);
- require(toks.head.isStopWord);
- require(toks.last.isStopWord);
- require(toks.drop(1).reverse.drop(1).forall(!_.isStopWord))
+ require(isStopWord(toks.head));
+ require(isStopWord(toks.last));
+ require(toks.drop(1).reverse.drop(1).forall(p =>
!isStopWord(p)))
)
test(
// First and last are stop words,
// Third and fourth are not because brackets.
- "A ( A A ) A",
+ "a ( a a ) a",
toks =>
require(toks.length == 6);
- require(toks.head.isStopWord);
- require(toks.last.isStopWord);
- require(toks.drop(1).reverse.drop(1).forall(!_.isStopWord))
+ require(isStopWord(toks.head));
+ require(isStopWord(toks.last));
+ require(toks.drop(1).reverse.drop(1).forall(p =>
!isStopWord(p)))
)
test(
// Invalid brackets.
- "A ( A A A",
- toks => toks.filter(_.getText != "(").forall(_.isStopWord)
+ "a ( a a a",
+ toks => toks.filter(_.getText != "(").forall(isStopWord)
)
test(
// Nested brackets.
- "< < [ A ] > >",
- toks => require(!toks.find(_.getText == "A").get.isStopWord)
+ "< < [ a ] > >",
+ toks => require(!isStopWord(toks.find(_.getText == "a").get))
)
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestConfig.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestConfig.scala
index bd1d1b1..b5ff768 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestConfig.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestConfig.scala
@@ -17,35 +17,43 @@
package org.apache.nlpcraft.nlp.util
-import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
import org.apache.nlpcraft.*
-import org.apache.nlpcraft.nlp.tokenizer.opennlp.NCOpenNlpTokenizer
+import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
-import java.util
-import java.util.Optional
+import java.util.{Optional, ArrayList as JAList, List as JList}
+/***
+ *
+ */
object NCTestConfig:
- final val EN_TOKENIZER = new NCOpenNlpTokenizer(
- "opennlp/en-token.bin"
- )
- final val EN_PARSER: NCTokenParser = new NCEnOpenNlpTokenParser(
- "opennlp/en-pos-maxent.bin",
- "opennlp/en-lemmatizer.dict"
- )
-
- final val EN_MDL_CFG: NCModelConfig = new NCPropertyMapAdapter() with
NCModelConfig:
- override def getTokenizer: NCTokenizer = EN_TOKENIZER
- override def getTokenParsers: util.List[NCTokenParser] =
util.Collections.singletonList(EN_PARSER);
- override def getTokenEnrichers: util.List[NCTokenEnricher] = new
util.ArrayList[NCTokenEnricher]()
- override def getEntityEnrichers: util.List[NCEntityEnricher] = new
util.ArrayList[NCEntityEnricher]()
- override def getEntityParsers: util.List[NCEntityParser] = new
util.ArrayList[NCEntityParser]()
- override def getTokenValidators: util.List[NCTokenValidator] = new
util.ArrayList[NCTokenValidator]()
- override def getEntityValidators: util.List[NCEntityValidator] = new
util.ArrayList[NCEntityValidator]()
- override def getVariantsFilters: util.List[NCVariantsFilter] = new
util.ArrayList[NCVariantsFilter]()
+ final val EN: NCModelConfig = new NCPropertyMapAdapter() with
NCModelConfig:
+ private val p =
+ new NCEnOpenNlpTokenParser(
+ "opennlp/en-token.bin",
+ "opennlp/en-pos-maxent.bin",
+ "opennlp/en-lemmatizer.dict"
+ )
+
+ override def getTokenParser: NCTokenParser = p
+ override def getTokenEnrichers: JList[NCTokenEnricher] = new
JAList[NCTokenEnricher]()
+ override def getEntityEnrichers: JList[NCEntityEnricher] = new
JAList[NCEntityEnricher]()
+ override def getEntityParsers: JList[NCEntityParser] = new
JAList[NCEntityParser]()
+ override def getTokenValidators: JList[NCTokenValidator] = new
JAList[NCTokenValidator]()
+ override def getEntityValidators: JList[NCEntityValidator] = new
JAList[NCEntityValidator]()
+ override def getVariantValidators: JList[NCVariantsValidator] = new
JAList[NCVariantsValidator]()
override def getId: String = "test"
override def getName: String = "test"
override def getVersion: String = "1.0"
- EN_TOKENIZER.start(EN_MDL_CFG)
- EN_PARSER.start(EN_MDL_CFG)
+ // TODO: references?
+ EN.getTokenParser.start(EN)
+
+ start(EN.getTokenEnrichers)
+ start(EN.getEntityEnrichers)
+ start(EN.getEntityParsers)
+ start(EN.getTokenValidators)
+ start(EN.getEntityValidators)
+ start(EN.getVariantValidators)
+
+ private def start[T <: NCLifecycle](l: JList[T]): Unit = if l != null then
l.forEach(_.start(EN))
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestRequest.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestRequest.scala
index 7af1a71..0cbdc70 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestRequest.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestRequest.scala
@@ -46,7 +46,6 @@ case class NCTestRequest(
override def getReceiveTimestamp: Long = ts
override def getUserAgent: String = userAgent
override def getRequestData: util.Map[String, AnyRef] = data
- override def getWords: util.List[NCWord] =
EN_TOKENIZER.tokenize(EN_MDL_CFG, txt)
/**
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestToken.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestToken.scala
index d027e9c..a8e5584 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestToken.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestToken.scala
@@ -35,17 +35,10 @@ case class NCTestToken(
idx: Int,
lemma: String = null,
stem: String = null,
- pos: String = null,
- isStop: Boolean = false,
- start: Int = -1,
- end: Int = -1
+ pos: String = null
) extends NCPropertyMapAdapter with NCToken:
override def getText: String = txt
override def getLemma: String = lemma
override def getStem: String = stem
override def getPos: String = pos
- override def isStopWord: Boolean = isStop
- override def getStartCharIndex: Int = start
- override def getEndCharIndex: Int = end
- override def getLength: Int = end - start + 1
override def getIndex: Int = idx
\ No newline at end of file
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
index a69039d..ac01f0d 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
@@ -20,10 +20,10 @@ package org.apache.nlpcraft.nlp.util
import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.ascii.NCAsciiTable
import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
-import org.apache.nlpcraft.nlp.tokenizer.opennlp.NCOpenNlpTokenizer
+import java.util
import scala.jdk.CollectionConverters.*
-
+import scala.jdk.OptionConverters.RichOptional
/**
*
*/
@@ -32,7 +32,7 @@ object NCTestUtils:
* @param toks
*/
def printTokens(toks: Seq[NCToken]): Unit =
- val tbl = NCAsciiTable("Text", "Index", "POS", "Stem", "Lemma",
"Start", "End", "Length", "Stopword", "Properties")
+ val tbl = NCAsciiTable("Text", "Index", "POS", "Stem", "Lemma",
"Stopword", "Properties")
for (t <- toks)
tbl += (
@@ -41,10 +41,10 @@ object NCTestUtils:
t.getPos,
t.getStem,
t.getLemma,
- t.getStartCharIndex,
- t.getEndCharIndex,
- t.getLength,
- t.isStopWord,
+ t.getOpt[Boolean]("stopword").toScala match
+ case Some(b) => b.toString
+ case None => "undef."
+ ,
t.keysSet().asScala.map(p =>
s"$p=${t.get[Any](p)}").mkString("[", ", ", "]")
)
@@ -67,6 +67,20 @@ object NCTestUtils:
tbl.print(s"Request: $req")
+ def mkTokens(p: NCTokenParser, txt: String): util.List[NCToken] =
+ val toks = p.tokenize(txt)
+ val poses = p.getPoses(toks)
+ val lemmas = p.getLemmas(toks, poses)
+
+ toks.asScala.zip(poses.asScala).zip(lemmas.asScala).zipWithIndex.map {
case (((t, pos), lemma), idx) =>
+ new NCPropertyMapAdapter with NCToken:
+ override def getText: String = t
+ override def getLemma: String = lemma
+ override def getStem: String = p.getStem(t)
+ override def getPos: String = pos
+ override def getIndex: Int = idx
+ }.asJava
+
/**
*
* @param make
@@ -79,7 +93,9 @@ object NCTestUtils:
val start = now()
val t = make
val started = now()
+
+ t.start(NCTestConfig.EN)
- t.start(NCTestConfig.EN_MDL_CFG)
println(s"'${t.getClass.getSimpleName}' created in ${started -
start}ms and started in ${now() - started}ms.")
+
t
\ No newline at end of file