This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-468
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-468 by this push:
new 9a8fb4d WIP.
9a8fb4d is described below
commit 9a8fb4d7b7edd528e261d1a2aeabdf1783826b5f
Author: Sergey Kamov <[email protected]>
AuthorDate: Tue Oct 12 20:31:06 2021 +0300
WIP.
---
.../org/apache/nlpcraft/model/NCConversation.java | 2 +-
.../org/apache/nlpcraft/model/NCModelConfig.java | 221 ++++++++++-----------
.../model/builders/NCModelConfigBuilder.java | 30 +--
.../NCDefaultStopWordsDetector.java} | 15 +-
.../NCDefaultSwearWordsDetector.java} | 15 +-
.../NCFileWordsDetector.java} | 21 +-
.../impl/{ => ner}/opennlp/NCOpenNlpNerParser.java | 8 +-
.../ner/{ => synonyms}/NCSynonymsNerElement.java | 2 +-
.../ner/{ => synonyms}/NCSynonymsNerParser.java | 2 +-
.../ner/{ => synonyms}/NCSynonymsNerValue.java | 2 +-
.../{ => synonyms}/NCSynonymsNerValueLoader.java | 2 +-
.../builders/NCSynonymsNerElementBuilder.java | 8 +-
.../builders/NCSynonymsNerParserBuilder.java | 6 +-
.../{opennlp => tokenizer}/NCOpenNlpTokenizer.java | 2 +-
.../apache/nlpcraft/model/nlp/NCNlpNerParser.java | 2 +-
.../nlp/{NCNlpNerToken.java => NCNlpToken.java} | 6 +-
.../org/apache/nlpcraft/model/nlp/NCNlpWord.java | 10 +
...{NCNlpRichWord.java => NCNlpWordsDetector.java} | 24 +--
.../src/test/java/org/apache/nlpcraft/NCSpec.java | 23 ++-
19 files changed, 195 insertions(+), 206 deletions(-)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCConversation.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCConversation.java
index 6e2ba33..f4f28dd 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCConversation.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCConversation.java
@@ -17,7 +17,7 @@
package org.apache.nlpcraft.model;
-import org.apache.nlpcraft.model.impl.ner.NCSynonymsNerElement;
+import org.apache.nlpcraft.model.impl.ner.synonyms.NCSynonymsNerElement;
import java.util.List;
import java.util.function.Predicate;
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelConfig.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelConfig.java
index e13af8a..0dd105c 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelConfig.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelConfig.java
@@ -17,10 +17,14 @@
package org.apache.nlpcraft.model;
-import org.apache.nlpcraft.model.impl.ner.NCSynonymsNerElement;
-import org.apache.nlpcraft.model.impl.opennlp.NCOpenNlpTokenizer;
+import org.apache.nlpcraft.model.impl.detectors.NCDefaultStopWordsDetector;
+import org.apache.nlpcraft.model.impl.detectors.NCDefaultSwearWordsDetector;
+import org.apache.nlpcraft.model.impl.detectors.NCFileWordsDetector;
+import org.apache.nlpcraft.model.impl.ner.synonyms.NCSynonymsNerElement;
+import org.apache.nlpcraft.model.impl.tokenizer.NCOpenNlpTokenizer;
import org.apache.nlpcraft.model.nlp.NCNlpNerParser;
import org.apache.nlpcraft.model.nlp.NCNlpTokenizer;
+import org.apache.nlpcraft.model.nlp.NCNlpWordsDetector;
import java.time.Duration;
import java.util.Collections;
@@ -371,6 +375,8 @@ public interface NCModelConfig {
}
/**
+ * TODO: drop it, or change it to a boolean like 'isStopWordsAllowed' and
'isSwearWordsAllowed'.
+ *
* Gets maximum number of suspicious words until automatic rejection. A
suspicious word is a word
* that is defined by the model that should not appear in a valid user
input under no circumstances.
* A typical example of suspicious words would be words "sex" or "porn"
when processing
@@ -563,36 +569,6 @@ public interface NCModelConfig {
return DFLT_IS_NO_NOUNS_ALLOWED;
}
-
-
- /**
- * Whether to allow the user input with no user token detected. If {@code
false} such user
- * input will be automatically rejected. Note that this property only
applies to user-defined
- * token (i.e. model element). Even if there are no user defined tokens,
the user input may still
- * contain system token like <code>nlpcraft:city</code> or
<code>nlpcraft:date</code>. In many cases models
- * should be build to allow user input without user tokens. However, set
it to {@code false} if presence
- * of at least one user token is mandatory.
- * <p>
- * <b>Default</b>
- * <br>
- * If not provided by the model the default value {@link
#DFLT_IS_NO_USER_TOKENS_ALLOWED} will be used.
- * <p>
- * <b>JSON</b>
- * <br>
- * If using JSON/YAML model presentation this is set by
<code>noUserTokensAllowed</code> property:
- * <pre class="brush: js">
- * {
- * "noUserTokensAllowed": false
- * }
- * </pre>
- *
- * @return Whether to allow the user input with no user token detected.
- */
- // TODO? do we need it?
- default boolean isNoUserTokensAllowed() {
- return DFLT_IS_NO_USER_TOKENS_ALLOWED;
- }
-
/**
* Gets optional user defined model metadata that can be set by the
developer and accessed later.
* By default, it returns an empty map. Note that this metadata is mutable
and can be
@@ -618,88 +594,6 @@ public interface NCModelConfig {
}
/**
- * Gets an optional list of stopwords to add to the built-in ones.
- * <p>
- * Stopword is an individual word (i.e. sequence of characters excluding
whitespaces) that contribute no
- * semantic meaning to the sentence. For example, 'the', 'wow', or 'hm'
provide no semantic meaning to the
- * sentence and can be safely excluded from semantic analysis.
- * <p>
- * NLPCraft comes with a carefully selected list of English stopwords
which should be sufficient
- * for a majority of use cases. However, you can add additional stopwords
to this list. The typical
- * use for user-defined stopwords are jargon parasite words that are
specific to the model's domain.
- * <p>
- * <b>JSON</b>
- * <br>
- * If using JSON/YAML model presentation this is set by
<code>additionalStopwords</code> property:
- * <pre class="brush: js">
- * {
- * "additionalStopwords": [
- * "stopword1",
- * "stopword2"
- * ]
- * }
- * </pre>
- *
- * @return Potentially empty list of additional stopwords.
- */
- default Set<String> getAdditionalStopWords() {
- return Collections.emptySet();
- }
-
- /**
- * Gets an optional list of stopwords to exclude from the built-in list of
stopwords.
- * <p>
- * Just like you can add additional stopwords via {@link
#getAdditionalStopWords()} you can exclude
- * certain words from the list of stopwords. This can be useful in rare
cases when default built-in
- * stopword has specific meaning of your model. In order to process them
you need to exclude them
- * from the list of stopwords.
- * <p>
- * <b>JSON</b>
- * <br>
- * If using JSON/YAML model presentation this is set by
<code>excludedStopwords</code> property:
- * <pre class="brush: js">
- * {
- * "excludedStopwords": [
- * "excludedStopword1",
- * "excludedStopword2"
- * ]
- * }
- * </pre>
- *
- * @return Potentially empty list of excluded stopwords.
- */
- default Set<String> getExcludedStopWords() {
- return Collections.emptySet();
- }
-
- /**
- * Gets an optional list of suspicious words. A suspicious word is a word
that generally should not appear in user
- * sentence when used with this model. For example, if a particular model
is for children oriented book search,
- * the words "sex" and "porn" should probably NOT appear in the user input
and can be automatically rejected
- * when added here and model's metadata {@code MAX_SUSPICIOUS_WORDS}
property set to zero.
- * <p>
- * Note that by setting model's metadata {@code MAX_SUSPICIOUS_WORDS}
property to non-zero value you can
- * adjust the sensitivity of suspicious words auto-rejection logic.
- * <p>
- * <b>JSON</b>
- * <br>
- * If using JSON/YAML model presentation this is set by
<code>suspiciousWords</code> property:
- * <pre class="brush: js">
- * {
- * "suspiciousWords": [
- * "sex",
- * "porn"
- * ]
- * }
- * </pre>
- *
- * @return Potentially empty list of suspicious words in their lemma form.
- */
- default Set<String> getSuspiciousWords() {
- return Collections.emptySet();
- }
-
- /**
* Gets optional user-defined model element parsers for custom NER
implementations. Note that order of the parsers
* is important as they will be invoked in the same order they are
returned.
* <p>
@@ -804,4 +698,103 @@ public interface NCModelConfig {
default boolean isStopWordsAllowed() {
return DFLT_IS_STOPWORDS_ALLOWED;
}
+
+ // TODO: dropped - getSuspiciousWords.
+ // TODO: dropped - getAdditionalStopWords, getExcludedStopWords - just
override getStopWordsDetector if necessary.
+
+ default NCNlpWordsDetector getStopWordsDetector() {
+ return new NCDefaultStopWordsDetector();
+ }
+
+ default NCNlpWordsDetector getSwearWordsDetector() {
+ return new NCDefaultSwearWordsDetector();
+ }
+
+ default NCNlpWordsDetector getSuspiciousWordsDetector() {
+ return null;
+ }
+
+
+// /**
+// * Gets an optional list of suspicious words. A suspicious word is a
word that generally should not appear in user
+// * sentence when used with this model. For example, if a particular
model is for children oriented book search,
+// * the words "sex" and "porn" should probably NOT appear in the user
input and can be automatically rejected
+// * when added here and model's metadata {@code MAX_SUSPICIOUS_WORDS}
property set to zero.
+// * <p>
+// * Note that by setting model's metadata {@code MAX_SUSPICIOUS_WORDS}
property to non-zero value you can
+// * adjust the sensitivity of suspicious words auto-rejection logic.
+// * <p>
+// * <b>JSON</b>
+// * <br>
+// * If using JSON/YAML model presentation this is set by
<code>suspiciousWords</code> property:
+// * <pre class="brush: js">
+// * {
+// * "suspiciousWords": [
+// * "sex",
+// * "porn"
+// * ]
+// * }
+// * </pre>
+// *
+// * @return Potentially empty list of suspicious words in their lemma
form.
+// */
+// default Set<String> getSuspiciousWords() {
+// return Collections.emptySet();
+// }
+//
+//
+// /**
+// * Gets an optional list of stopwords to add to the built-in ones.
+// * <p>
+// * Stopword is an individual word (i.e. sequence of characters excluding
whitespaces) that contribute no
+// * semantic meaning to the sentence. For example, 'the', 'wow', or 'hm'
provide no semantic meaning to the
+// * sentence and can be safely excluded from semantic analysis.
+// * <p>
+// * NLPCraft comes with a carefully selected list of English stopwords
which should be sufficient
+// * for a majority of use cases. However, you can add additional
stopwords to this list. The typical
+// * use for user-defined stopwords are jargon parasite words that are
specific to the model's domain.
+// * <p>
+// * <b>JSON</b>
+// * <br>
+// * If using JSON/YAML model presentation this is set by
<code>additionalStopwords</code> property:
+// * <pre class="brush: js">
+// * {
+// * "additionalStopwords": [
+// * "stopword1",
+// * "stopword2"
+// * ]
+// * }
+// * </pre>
+// *
+// * @return Potentially empty list of additional stopwords.
+// */
+// default Set<String> getAdditionalStopWords() {
+// return Collections.emptySet();
+// }
+//
+// /**
+// * Gets an optional list of stopwords to exclude from the built-in list
of stopwords.
+// * <p>
+// * Just like you can add additional stopwords via {@link
#getAdditionalStopWords()} you can exclude
+// * certain words from the list of stopwords. This can be useful in rare
cases when default built-in
+// * stopword has specific meaning of your model. In order to process them
you need to exclude them
+// * from the list of stopwords.
+// * <p>
+// * <b>JSON</b>
+// * <br>
+// * If using JSON/YAML model presentation this is set by
<code>excludedStopwords</code> property:
+// * <pre class="brush: js">
+// * {
+// * "excludedStopwords": [
+// * "excludedStopword1",
+// * "excludedStopword2"
+// * ]
+// * }
+// * </pre>
+// *
+// * @return Potentially empty list of excluded stopwords.
+// */
+// default Set<String> getExcludedStopWords() {
+// return Collections.emptySet();
+// }
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/builders/NCModelConfigBuilder.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/builders/NCModelConfigBuilder.java
index d7e5e41..82cfd26 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/builders/NCModelConfigBuilder.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/builders/NCModelConfigBuilder.java
@@ -18,14 +18,12 @@
package org.apache.nlpcraft.model.builders;
import org.apache.nlpcraft.model.NCModelConfig;
+import org.apache.nlpcraft.model.nlp.NCNlpWordsDetector;
import org.apache.nlpcraft.model.nlp.NCNlpNerParser;
import org.apache.nlpcraft.model.nlp.NCNlpTokenizer;
-import org.apache.nlpcraft.model.nlp.NCNlpWord;
import java.util.List;
import java.util.Map;
-import java.util.Set;
-import java.util.function.Function;
// Mandatory withOnContext or any of withIntentsXXX methods.
// All other - optional.
@@ -73,38 +71,20 @@ public class NCModelConfigBuilder {
public NCModelConfigBuilder withNoNounsAllowed(boolean noNounsAllowed) {
return this;
}
- // TODO? do we need it?
- public NCModelConfigBuilder withNoUserTokensAllowed(boolean
noUserTokensAllowed) {
- return this;
- }
public NCModelConfigBuilder withConversationTimeout(long
conversationTimeout) { return this; }
public NCModelConfigBuilder withConversationDepth(int conversationDepth) {
return this; }
public NCModelConfigBuilder withMetadata(Map<String, Object> meta) {
return this;
}
- // 2. Words - for built stop/swear EN detection. (Suspicious via
dictionary)
- public NCModelConfigBuilder withAdditionalStopWords(Set<String>
additionalStopWords) {
- return this;
- }
- public NCModelConfigBuilder withExcludedStopWords(Set<String>
excludedStopWords) {
- return this;
- }
- public NCModelConfigBuilder withSuspiciousWords(Set<String>
suspiciousWords) {
- return this;
- }
-
- // TODO: discucc - Alternative - 3 custom words free implementation
support - discuss it.
- // We can provie all logic via these components for DE etc.
- // Function<List<NCNlpWord>, List<NCNlpWord>> filter
- // input - all sentence's words, output - detected stop/swear words. Empty
result - not found.
- public NCModelConfigBuilder withStopWordsFilter(Function<List<NCNlpWord>,
List<NCNlpWord>> finder) {
+ // 2. Three custom words detectors. Free implementation supported.
+ public NCModelConfigBuilder withStopWordsDetector(NCNlpWordsDetector
detector) {
return this;
}
- public NCModelConfigBuilder withSwearWordsFilter(Function<List<NCNlpWord>,
List<NCNlpWord>> finder) {
+ public NCModelConfigBuilder withSwearWordsDetector(NCNlpWordsDetector
detectorr) {
return this;
}
- public NCModelConfigBuilder
withSuspiciousWordsFilter(Function<List<NCNlpWord>, List<NCNlpWord>> finder) {
+ public NCModelConfigBuilder withSuspiciousWordsDetector(NCNlpWordsDetector
detector) {
return this;
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpTokenizer.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/detectors/NCDefaultStopWordsDetector.java
similarity index 71%
copy from
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpTokenizer.java
copy to
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/detectors/NCDefaultStopWordsDetector.java
index 72ed9cd..baa99bb 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpTokenizer.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/detectors/NCDefaultStopWordsDetector.java
@@ -15,18 +15,19 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.model.impl.opennlp;
+package org.apache.nlpcraft.model.impl.detectors;
-import org.apache.nlpcraft.model.NCRequest;
-import org.apache.nlpcraft.model.nlp.NCNlpTokenizer;
import org.apache.nlpcraft.model.nlp.NCNlpWord;
+import org.apache.nlpcraft.model.nlp.NCNlpWordsDetector;
+import java.io.File;
+import java.net.URL;
+import java.util.Collections;
import java.util.List;
-// Implementation by default. Stanford in another module. Can be provided by
user.
-public class NCOpenNlpTokenizer implements NCNlpTokenizer {
+public class NCDefaultStopWordsDetector implements NCNlpWordsDetector {
@Override
- public List<NCNlpWord> tokenize(NCRequest req) {
- return null;
+ public List<NCNlpWord> detect(List<NCNlpWord> sen) {
+ return Collections.emptyList();
}
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpTokenizer.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/detectors/NCDefaultSwearWordsDetector.java
similarity index 71%
copy from
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpTokenizer.java
copy to
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/detectors/NCDefaultSwearWordsDetector.java
index 72ed9cd..1669e65 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpTokenizer.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/detectors/NCDefaultSwearWordsDetector.java
@@ -15,18 +15,19 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.model.impl.opennlp;
+package org.apache.nlpcraft.model.impl.detectors;
-import org.apache.nlpcraft.model.NCRequest;
-import org.apache.nlpcraft.model.nlp.NCNlpTokenizer;
import org.apache.nlpcraft.model.nlp.NCNlpWord;
+import org.apache.nlpcraft.model.nlp.NCNlpWordsDetector;
+import java.io.File;
+import java.net.URL;
+import java.util.Collections;
import java.util.List;
-// Implementation by default. Stanford in another module. Can be provided by
user.
-public class NCOpenNlpTokenizer implements NCNlpTokenizer {
+public class NCDefaultSwearWordsDetector implements NCNlpWordsDetector {
@Override
- public List<NCNlpWord> tokenize(NCRequest req) {
- return null;
+ public List<NCNlpWord> detect(List<NCNlpWord> sen) {
+ return Collections.emptyList();
}
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpTokenizer.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/detectors/NCFileWordsDetector.java
similarity index 67%
copy from
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpTokenizer.java
copy to
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/detectors/NCFileWordsDetector.java
index 72ed9cd..3043843 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpTokenizer.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/detectors/NCFileWordsDetector.java
@@ -15,18 +15,25 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.model.impl.opennlp;
+package org.apache.nlpcraft.model.impl.detectors;
-import org.apache.nlpcraft.model.NCRequest;
-import org.apache.nlpcraft.model.nlp.NCNlpTokenizer;
import org.apache.nlpcraft.model.nlp.NCNlpWord;
+import org.apache.nlpcraft.model.nlp.NCNlpWordsDetector;
+import java.io.File;
+import java.net.URL;
+import java.util.Collections;
import java.util.List;
-// Implementation by default. Stanford in another module. Can be provided by
user.
-public class NCOpenNlpTokenizer implements NCNlpTokenizer {
+public class NCFileWordsDetector implements NCNlpWordsDetector {
+ public NCFileWordsDetector(File data) {
+ }
+
+ public NCFileWordsDetector(URL data) {
+ }
+
@Override
- public List<NCNlpWord> tokenize(NCRequest req) {
- return null;
+ public List<NCNlpWord> detect(List<NCNlpWord> sen) {
+ return Collections.emptyList();
}
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpNerParser.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/opennlp/NCOpenNlpNerParser.java
similarity index 80%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpNerParser.java
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/opennlp/NCOpenNlpNerParser.java
index 3650fab..d2fbb5b 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpNerParser.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/opennlp/NCOpenNlpNerParser.java
@@ -15,20 +15,20 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.model.impl.opennlp;
+package org.apache.nlpcraft.model.impl.ner.opennlp;
import org.apache.nlpcraft.model.NCModelConfig;
import org.apache.nlpcraft.model.NCRequest;
import org.apache.nlpcraft.model.nlp.NCNlpNerParser;
-import org.apache.nlpcraft.model.nlp.NCNlpNerToken;
-import org.apache.nlpcraft.model.nlp.NCNlpRichWord;
+import org.apache.nlpcraft.model.nlp.NCNlpToken;
+import org.apache.nlpcraft.model.nlp.NCNlpWord;
import java.util.List;
// Implementation by default for opennlp NERs. Stanford in another module.
public class NCOpenNlpNerParser implements NCNlpNerParser {
@Override
- public List<NCNlpNerToken> parse(NCRequest req, NCModelConfig cfg,
List<NCNlpRichWord> words, List<NCNlpNerToken> elements) {
+ public List<NCNlpToken> parse(NCRequest req, NCModelConfig cfg,
List<NCNlpWord> words, List<NCNlpToken> elements) {
return null;
}
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/NCSynonymsNerElement.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/NCSynonymsNerElement.java
similarity index 99%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/NCSynonymsNerElement.java
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/NCSynonymsNerElement.java
index d16ac7d..0c34c05 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/NCSynonymsNerElement.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/NCSynonymsNerElement.java
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.model.impl.ner;
+package org.apache.nlpcraft.model.impl.ner.synonyms;
import org.apache.nlpcraft.model.NCConversation;
import org.apache.nlpcraft.model.NCToken;
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/NCSynonymsNerParser.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/NCSynonymsNerParser.java
similarity index 99%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/NCSynonymsNerParser.java
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/NCSynonymsNerParser.java
index 458c5f5..89214c0 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/NCSynonymsNerParser.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/NCSynonymsNerParser.java
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.model.impl.ner;
+package org.apache.nlpcraft.model.impl.ner.synonyms;
import org.apache.nlpcraft.model.NCModel;
import org.apache.nlpcraft.model.nlp.NCNlpNerParser;
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/NCSynonymsNerValue.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/NCSynonymsNerValue.java
similarity index 96%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/NCSynonymsNerValue.java
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/NCSynonymsNerValue.java
index e0b8e8a..075b7e5 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/NCSynonymsNerValue.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/NCSynonymsNerValue.java
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.model.impl.ner;
+package org.apache.nlpcraft.model.impl.ner.synonyms;
import java.util.List;
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/NCSynonymsNerValueLoader.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/NCSynonymsNerValueLoader.java
similarity index 98%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/NCSynonymsNerValueLoader.java
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/NCSynonymsNerValueLoader.java
index e0ebe5a..0744e7a 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/NCSynonymsNerValueLoader.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/NCSynonymsNerValueLoader.java
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.model.impl.ner;
+package org.apache.nlpcraft.model.impl.ner.synonyms;
import java.util.Set;
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/builders/NCSynonymsNerElementBuilder.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/builders/NCSynonymsNerElementBuilder.java
similarity index 86%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/builders/NCSynonymsNerElementBuilder.java
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/builders/NCSynonymsNerElementBuilder.java
index e3b3410..836d9aa 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/builders/NCSynonymsNerElementBuilder.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/builders/NCSynonymsNerElementBuilder.java
@@ -15,11 +15,11 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.model.impl.ner.builders;
+package org.apache.nlpcraft.model.impl.ner.synonyms.builders;
-import org.apache.nlpcraft.model.impl.ner.NCSynonymsNerValue;
-import org.apache.nlpcraft.model.impl.ner.NCSynonymsNerElement;
-import org.apache.nlpcraft.model.impl.ner.NCSynonymsNerValueLoader;
+import org.apache.nlpcraft.model.impl.ner.synonyms.NCSynonymsNerValue;
+import org.apache.nlpcraft.model.impl.ner.synonyms.NCSynonymsNerElement;
+import org.apache.nlpcraft.model.impl.ner.synonyms.NCSynonymsNerValueLoader;
import java.util.List;
import java.util.Map;
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/builders/NCSynonymsNerParserBuilder.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/builders/NCSynonymsNerParserBuilder.java
similarity index 89%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/builders/NCSynonymsNerParserBuilder.java
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/builders/NCSynonymsNerParserBuilder.java
index 6272b01..bf95154 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/builders/NCSynonymsNerParserBuilder.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/builders/NCSynonymsNerParserBuilder.java
@@ -15,10 +15,10 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.model.impl.ner.builders;
+package org.apache.nlpcraft.model.impl.ner.synonyms.builders;
-import org.apache.nlpcraft.model.impl.ner.NCSynonymsNerElement;
-import org.apache.nlpcraft.model.impl.ner.NCSynonymsNerParser;
+import org.apache.nlpcraft.model.impl.ner.synonyms.NCSynonymsNerElement;
+import org.apache.nlpcraft.model.impl.ner.synonyms.NCSynonymsNerParser;
import java.io.File;
import java.util.List;
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpTokenizer.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/tokenizer/NCOpenNlpTokenizer.java
similarity index 96%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpTokenizer.java
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/tokenizer/NCOpenNlpTokenizer.java
index 72ed9cd..7cccfd8 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpTokenizer.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/tokenizer/NCOpenNlpTokenizer.java
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.model.impl.opennlp;
+package org.apache.nlpcraft.model.impl.tokenizer;
import org.apache.nlpcraft.model.NCRequest;
import org.apache.nlpcraft.model.nlp.NCNlpTokenizer;
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpNerParser.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpNerParser.java
index 5f46915..37c8ab9 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpNerParser.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpNerParser.java
@@ -61,5 +61,5 @@ public interface NCNlpNerParser {
* @return List of custom elements. List can be empty or {@code null} if
no model elements detected.
* @see NCModel#getParsers()
*/
- List<NCNlpNerToken> parse(NCRequest req, NCModelConfig cfg,
List<NCNlpRichWord> words, List<NCNlpNerToken> toks);
+ List<NCNlpToken> parse(NCRequest req, NCModelConfig cfg, List<NCNlpWord>
words, List<NCNlpToken> toks);
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpNerToken.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpToken.java
similarity index 96%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpNerToken.java
rename to nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpToken.java
index 2fc3950..0d182e0 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpNerToken.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpToken.java
@@ -20,14 +20,14 @@ package org.apache.nlpcraft.model.nlp;
import org.apache.nlpcraft.model.NCModel;
import org.apache.nlpcraft.model.NCModelConfig;
import org.apache.nlpcraft.model.NCRequest;
-import org.apache.nlpcraft.model.impl.ner.NCSynonymsNerElement;
+import org.apache.nlpcraft.model.impl.ner.synonyms.NCSynonymsNerElement;
import java.util.Collections;
import java.util.List;
import java.util.Map;
// NCNlpNerTokensParser parsing result.
-public interface NCNlpNerToken {
+public interface NCNlpToken {
/**
* Gets ID of the detected model element. Note that it <b>must
correspond</b> to one of the elements
* defined in the model. In other words, the parser doesn't define a new
model element but rather
@@ -45,7 +45,7 @@ public interface NCNlpNerToken {
*
* @return List of NLP custom words that comprise detected custom model
element.
*/
- List<NCNlpRichWord> getWords();
+ List<NCNlpWord> getWords();
/**
* TODO: javadoc
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpWord.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpWord.java
index dca1f5d..982c23c 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpWord.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpWord.java
@@ -27,4 +27,14 @@ public interface NCNlpWord {
int getStart();
int getEnd();
int getLength();
+
+ default boolean isStopWord() {
+ return false;
+ }
+ default boolean isSwearWord() {
+ return false;
+ }
+ default boolean isSuspiciousWord() {
+ return false;
+ }
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpRichWord.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpWordsDetector.java
similarity index 70%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpRichWord.java
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpWordsDetector.java
index 1396825..794821c 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpRichWord.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpWordsDetector.java
@@ -17,20 +17,16 @@
package org.apache.nlpcraft.model.nlp;
-/**
- * Extended word data, enriched by NLP.
- * It is argument for NCNlpNerTokensParser.
- */
-public interface NCNlpRichWord extends NCNlpWord {
- boolean isStopWord();
- boolean isSwearWord();
-
- // TODO: add
- boolean isSuspiciousWord();
+import org.apache.nlpcraft.model.nlp.NCNlpWord;
- // TODO: drop
-// boolean isBracketed();
-// boolean isQuoted();
-// boolean isKnownWord();
+import java.util.List;
+public interface NCNlpWordsDetector {
+ /**
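+ * Finds the words matching some criteria in the given list of words.
+ *
+ * @param sen Words of the sentence to search in.
+ * @return Detected words (presumably an empty list if none are found - TODO confirm).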
+ */
+ List<NCNlpWord> detect(List<NCNlpWord> sen);
}
diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/NCSpec.java
b/nlpcraft/src/test/java/org/apache/nlpcraft/NCSpec.java
index 69e2f75..d8b9eaf 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/NCSpec.java
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/NCSpec.java
@@ -24,15 +24,15 @@ import org.apache.nlpcraft.model.NCRejection;
import org.apache.nlpcraft.model.NCModelConfig;
import org.apache.nlpcraft.model.NCResult;
import org.apache.nlpcraft.model.builders.NCModelConfigBuilder;
-import org.apache.nlpcraft.model.impl.ner.NCSynonymsNerValue;
+import org.apache.nlpcraft.model.impl.ner.synonyms.NCSynonymsNerValue;
import org.apache.nlpcraft.model.builders.NCModelBuilder;
-import org.apache.nlpcraft.model.impl.ner.NCSynonymsNerElement;
-import org.apache.nlpcraft.model.impl.ner.NCSynonymsNerParser;
-import org.apache.nlpcraft.model.impl.ner.NCSynonymsNerValueLoader;
-import org.apache.nlpcraft.model.impl.ner.builders.NCSynonymsNerElementBuilder;
-import org.apache.nlpcraft.model.impl.ner.builders.NCSynonymsNerParserBuilder;
-import org.apache.nlpcraft.model.impl.opennlp.NCOpenNlpNerParser;
-import org.apache.nlpcraft.model.impl.opennlp.NCOpenNlpTokenizer;
+import org.apache.nlpcraft.model.impl.ner.synonyms.NCSynonymsNerElement;
+import org.apache.nlpcraft.model.impl.ner.synonyms.NCSynonymsNerParser;
+import org.apache.nlpcraft.model.impl.ner.synonyms.NCSynonymsNerValueLoader;
+import
org.apache.nlpcraft.model.impl.ner.synonyms.builders.NCSynonymsNerElementBuilder;
+import
org.apache.nlpcraft.model.impl.ner.synonyms.builders.NCSynonymsNerParserBuilder;
+import org.apache.nlpcraft.model.impl.ner.opennlp.NCOpenNlpNerParser;
+import org.apache.nlpcraft.model.impl.tokenizer.NCOpenNlpTokenizer;
import org.junit.jupiter.api.Test;
import java.io.File;
@@ -87,9 +87,10 @@ public class NCSpec {
// Common.
withId("modleId").
withName("name").
- withSwearWordsAllowed(true).
- // Stopwords etc.
- withAdditionalStopWords(new HashSet<>(Arrays.asList("x1",
"x2"))).
+ // TODO:
+// withSwearWordsAllowed(true).
+// // Stopwords etc.
+// withAdditionalStopWords(new HashSet<>(Arrays.asList("x1",
"x2"))).
// Nlp tokenizer.
withTokenizer(new NCOpenNlpTokenizer()).
// NERs.