[incubator-nlpcraft] branch NLPCRAFT-468 updated: WIP.

sergeykamov Tue, 12 Oct 2021 10:31:27 -0700

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-468
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git



The following commit(s) were added to refs/heads/NLPCRAFT-468 by this push:
     new 9a8fb4d  WIP.
9a8fb4d is described below

commit 9a8fb4d7b7edd528e261d1a2aeabdf1783826b5f
Author: Sergey Kamov <[email protected]>
AuthorDate: Tue Oct 12 20:31:06 2021 +0300

    WIP.
---
 .../org/apache/nlpcraft/model/NCConversation.java  |   2 +-
 .../org/apache/nlpcraft/model/NCModelConfig.java   | 221 ++++++++++-----------
 .../model/builders/NCModelConfigBuilder.java       |  30 +--
 .../NCDefaultStopWordsDetector.java}               |  15 +-
 .../NCDefaultSwearWordsDetector.java}              |  15 +-
 .../NCFileWordsDetector.java}                      |  21 +-
 .../impl/{ => ner}/opennlp/NCOpenNlpNerParser.java |   8 +-
 .../ner/{ => synonyms}/NCSynonymsNerElement.java   |   2 +-
 .../ner/{ => synonyms}/NCSynonymsNerParser.java    |   2 +-
 .../ner/{ => synonyms}/NCSynonymsNerValue.java     |   2 +-
 .../{ => synonyms}/NCSynonymsNerValueLoader.java   |   2 +-
 .../builders/NCSynonymsNerElementBuilder.java      |   8 +-
 .../builders/NCSynonymsNerParserBuilder.java       |   6 +-
 .../{opennlp => tokenizer}/NCOpenNlpTokenizer.java |   2 +-
 .../apache/nlpcraft/model/nlp/NCNlpNerParser.java  |   2 +-
 .../nlp/{NCNlpNerToken.java => NCNlpToken.java}    |   6 +-
 .../org/apache/nlpcraft/model/nlp/NCNlpWord.java   |  10 +
 ...{NCNlpRichWord.java => NCNlpWordsDetector.java} |  24 +--
 .../src/test/java/org/apache/nlpcraft/NCSpec.java  |  23 ++-
 19 files changed, 195 insertions(+), 206 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCConversation.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCConversation.java
index 6e2ba33..f4f28dd 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCConversation.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCConversation.java
@@ -17,7 +17,7 @@
 
 package org.apache.nlpcraft.model;
 
-import org.apache.nlpcraft.model.impl.ner.NCSynonymsNerElement;
+import org.apache.nlpcraft.model.impl.ner.synonyms.NCSynonymsNerElement;
 
 import java.util.List;
 import java.util.function.Predicate;
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelConfig.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelConfig.java
index e13af8a..0dd105c 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelConfig.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelConfig.java
@@ -17,10 +17,14 @@
 
 package org.apache.nlpcraft.model;
 
-import org.apache.nlpcraft.model.impl.ner.NCSynonymsNerElement;
-import org.apache.nlpcraft.model.impl.opennlp.NCOpenNlpTokenizer;
+import org.apache.nlpcraft.model.impl.detectors.NCDefaultStopWordsDetector;
+import org.apache.nlpcraft.model.impl.detectors.NCDefaultSwearWordsDetector;
+import org.apache.nlpcraft.model.impl.detectors.NCFileWordsDetector;
+import org.apache.nlpcraft.model.impl.ner.synonyms.NCSynonymsNerElement;
+import org.apache.nlpcraft.model.impl.tokenizer.NCOpenNlpTokenizer;
 import org.apache.nlpcraft.model.nlp.NCNlpNerParser;
 import org.apache.nlpcraft.model.nlp.NCNlpTokenizer;
+import org.apache.nlpcraft.model.nlp.NCNlpWordsDetector;
 
 import java.time.Duration;
 import java.util.Collections;
@@ -371,6 +375,8 @@ public interface NCModelConfig {
     }
 
     /**
+     * TODO: drop it. or change to boolean like 'isStopWordsAllowed' and 'isSwearWordsAllowed'.
+     *
      * Gets maximum number of suspicious words until automatic rejection. A suspicious word is a word
      * that is defined by the model that should not appear in a valid user input under no circumstances.
      * A typical example of suspicious words would be words "sex" or "porn" when processing
@@ -563,36 +569,6 @@ public interface NCModelConfig {
         return DFLT_IS_NO_NOUNS_ALLOWED;
     }
 
-
-
-    /**
-     * Whether to allow the user input with no user token detected. If {@code 
false} such user
-     * input will be automatically rejected. Note that this property only 
applies to user-defined
-     * token (i.e. model element). Even if there are no user defined tokens, 
the user input may still
-     * contain system token like <code>nlpcraft:city</code> or 
<code>nlpcraft:date</code>. In many cases models
-     * should be build to allow user input without user tokens. However, set 
it to {@code false} if presence
-     * of at least one user token is mandatory.
-     * <p>
-     * <b>Default</b>
-     * <br>
-     * If not provided by the model the default value {@link 
#DFLT_IS_NO_USER_TOKENS_ALLOWED} will be used.
-     * <p>
-     * <b>JSON</b>
-     * <br>
-     * If using JSON/YAML model presentation this is set by 
<code>noUserTokensAllowed</code> property:
-     * <pre class="brush: js">
-     * {
-     *      "noUserTokensAllowed": false
-     * }
-     * </pre>
-     *
-     * @return Whether to allow the user input with no user token detected.
-     */
-    // TODO? do we need it?
-    default boolean isNoUserTokensAllowed() {
-        return DFLT_IS_NO_USER_TOKENS_ALLOWED;
-    }
-
     /**
      * Gets optional user defined model metadata that can be set by the 
developer and accessed later.
      * By default, it returns an empty map. Note that this metadata is mutable 
and can be
@@ -618,88 +594,6 @@ public interface NCModelConfig {
     }
 
     /**
-     * Gets an optional list of stopwords to add to the built-in ones.
-     * <p>
-     * Stopword is an individual word (i.e. sequence of characters excluding 
whitespaces) that contribute no
-     * semantic meaning to the sentence. For example, 'the', 'wow', or 'hm' 
provide no semantic meaning to the
-     * sentence and can be safely excluded from semantic analysis.
-     * <p>
-     * NLPCraft comes with a carefully selected list of English stopwords 
which should be sufficient
-     * for a majority of use cases. However, you can add additional stopwords 
to this list. The typical
-     * use for user-defined stopwords are jargon parasite words that are 
specific to the model's domain.
-     * <p>
-     * <b>JSON</b>
-     * <br>
-     * If using JSON/YAML model presentation this is set by 
<code>additionalStopwords</code> property:
-     * <pre class="brush: js">
-     * {
-     *      "additionalStopwords": [
-     *          "stopword1",
-     *          "stopword2"
-     *      ]
-     * }
-     * </pre>
-     *
-     * @return Potentially empty list of additional stopwords.
-     */
-    default Set<String> getAdditionalStopWords() {
-        return Collections.emptySet();
-    }
-
-    /**
-     * Gets an optional list of stopwords to exclude from the built-in list of 
stopwords.
-     * <p>
-     * Just like you can add additional stopwords via {@link 
#getAdditionalStopWords()} you can exclude
-     * certain words from the list of stopwords. This can be useful in rare 
cases when default built-in
-     * stopword has specific meaning of your model. In order to process them 
you need to exclude them
-     * from the list of stopwords.
-     * <p>
-     * <b>JSON</b>
-     * <br>
-     * If using JSON/YAML model presentation this is set by 
<code>excludedStopwords</code> property:
-     * <pre class="brush: js">
-     * {
-     *      "excludedStopwords": [
-     *          "excludedStopword1",
-     *          "excludedStopword2"
-     *      ]
-     * }
-     * </pre>
-     *
-     * @return Potentially empty list of excluded stopwords.
-     */
-    default Set<String> getExcludedStopWords() {
-        return Collections.emptySet();
-    }
-
-    /**
-     * Gets an optional list of suspicious words. A suspicious word is a word 
that generally should not appear in user
-     * sentence when used with this model. For example, if a particular model 
is for children oriented book search,
-     * the words "sex" and "porn" should probably NOT appear in the user input 
and can be automatically rejected
-     * when added here and model's metadata {@code MAX_SUSPICIOUS_WORDS} 
property set to zero.
-     * <p>
-     * Note that by setting model's metadata {@code MAX_SUSPICIOUS_WORDS} 
property to non-zero value you can
-     * adjust the sensitivity of suspicious words auto-rejection logic.
-     * <p>
-     * <b>JSON</b>
-     * <br>
-     * If using JSON/YAML model presentation this is set by 
<code>suspiciousWords</code> property:
-     * <pre class="brush: js">
-     * {
-     *      "suspiciousWords": [
-     *          "sex",
-     *          "porn"
-     *      ]
-     * }
-     * </pre>
-     *
-     * @return Potentially empty list of suspicious words in their lemma form.
-     */
-    default Set<String> getSuspiciousWords() {
-        return Collections.emptySet();
-    }
-
-    /**
      * Gets optional user-defined model element parsers for custom NER 
implementations. Note that order of the parsers
      * is important as they will be invoked in the same order they are 
returned.
      * <p>
@@ -804,4 +698,103 @@ public interface NCModelConfig {
     default boolean isStopWordsAllowed() {
         return DFLT_IS_STOPWORDS_ALLOWED;
     }
+
+    // TODO: dropped. getSuspiciousWords,
+    //  TODO: dropped - getAdditionalStopWords, getExcludedStopWords - just override if necessary getStopWordsDetector.
+
+    default NCNlpWordsDetector getStopWordsDetector() {
+        return new NCDefaultStopWordsDetector();
+    }
+
+    default NCNlpWordsDetector getSwearWordsDetector() {
+        return new NCDefaultSwearWordsDetector();
+    }
+
+    default NCNlpWordsDetector getSuspiciousWordsDetector() {
+        return null;
+    }
+
+
+//    /**
+//     * Gets an optional list of suspicious words. A suspicious word is a 
word that generally should not appear in user
+//     * sentence when used with this model. For example, if a particular 
model is for children oriented book search,
+//     * the words "sex" and "porn" should probably NOT appear in the user 
input and can be automatically rejected
+//     * when added here and model's metadata {@code MAX_SUSPICIOUS_WORDS} 
property set to zero.
+//     * <p>
+//     * Note that by setting model's metadata {@code MAX_SUSPICIOUS_WORDS} 
property to non-zero value you can
+//     * adjust the sensitivity of suspicious words auto-rejection logic.
+//     * <p>
+//     * <b>JSON</b>
+//     * <br>
+//     * If using JSON/YAML model presentation this is set by 
<code>suspiciousWords</code> property:
+//     * <pre class="brush: js">
+//     * {
+//     *      "suspiciousWords": [
+//     *          "sex",
+//     *          "porn"
+//     *      ]
+//     * }
+//     * </pre>
+//     *
+//     * @return Potentially empty list of suspicious words in their lemma 
form.
+//     */
+//    default Set<String> getSuspiciousWords() {
+//        return Collections.emptySet();
+//    }
+//
+//
+//    /**
+//     * Gets an optional list of stopwords to add to the built-in ones.
+//     * <p>
+//     * Stopword is an individual word (i.e. sequence of characters excluding 
whitespaces) that contribute no
+//     * semantic meaning to the sentence. For example, 'the', 'wow', or 'hm' 
provide no semantic meaning to the
+//     * sentence and can be safely excluded from semantic analysis.
+//     * <p>
+//     * NLPCraft comes with a carefully selected list of English stopwords 
which should be sufficient
+//     * for a majority of use cases. However, you can add additional 
stopwords to this list. The typical
+//     * use for user-defined stopwords are jargon parasite words that are 
specific to the model's domain.
+//     * <p>
+//     * <b>JSON</b>
+//     * <br>
+//     * If using JSON/YAML model presentation this is set by 
<code>additionalStopwords</code> property:
+//     * <pre class="brush: js">
+//     * {
+//     *      "additionalStopwords": [
+//     *          "stopword1",
+//     *          "stopword2"
+//     *      ]
+//     * }
+//     * </pre>
+//     *
+//     * @return Potentially empty list of additional stopwords.
+//     */
+//    default Set<String> getAdditionalStopWords() {
+//        return Collections.emptySet();
+//    }
+//
+//    /**
+//     * Gets an optional list of stopwords to exclude from the built-in list 
of stopwords.
+//     * <p>
+//     * Just like you can add additional stopwords via {@link 
#getAdditionalStopWords()} you can exclude
+//     * certain words from the list of stopwords. This can be useful in rare 
cases when default built-in
+//     * stopword has specific meaning of your model. In order to process them 
you need to exclude them
+//     * from the list of stopwords.
+//     * <p>
+//     * <b>JSON</b>
+//     * <br>
+//     * If using JSON/YAML model presentation this is set by 
<code>excludedStopwords</code> property:
+//     * <pre class="brush: js">
+//     * {
+//     *      "excludedStopwords": [
+//     *          "excludedStopword1",
+//     *          "excludedStopword2"
+//     *      ]
+//     * }
+//     * </pre>
+//     *
+//     * @return Potentially empty list of excluded stopwords.
+//     */
+//    default Set<String> getExcludedStopWords() {
+//        return Collections.emptySet();
+//    }
 }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/builders/NCModelConfigBuilder.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/builders/NCModelConfigBuilder.java
index d7e5e41..82cfd26 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/builders/NCModelConfigBuilder.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/builders/NCModelConfigBuilder.java
@@ -18,14 +18,12 @@
 package org.apache.nlpcraft.model.builders;
 
 import org.apache.nlpcraft.model.NCModelConfig;
+import org.apache.nlpcraft.model.nlp.NCNlpWordsDetector;
 import org.apache.nlpcraft.model.nlp.NCNlpNerParser;
 import org.apache.nlpcraft.model.nlp.NCNlpTokenizer;
-import org.apache.nlpcraft.model.nlp.NCNlpWord;
 
 import java.util.List;
 import java.util.Map;
-import java.util.Set;
-import java.util.function.Function;
 
 // Mandatory withOnContext or any of withIntentsXXX methods.
 // All other - optional.
@@ -73,38 +71,20 @@ public class NCModelConfigBuilder {
     public NCModelConfigBuilder withNoNounsAllowed(boolean noNounsAllowed) {
         return this;
     }
-    // TODO? do we need it?
-    public NCModelConfigBuilder withNoUserTokensAllowed(boolean noUserTokensAllowed) {
-        return this;
-    }
     public NCModelConfigBuilder withConversationTimeout(long conversationTimeout) { return this; }
     public NCModelConfigBuilder withConversationDepth(int conversationDepth) { return this; }
     public NCModelConfigBuilder withMetadata(Map<String, Object> meta) {
         return this;
     }
 
-    // 2. Words - for built stop/swear EN detection. (Suspicious via 
dictionary)
-    public NCModelConfigBuilder withAdditionalStopWords(Set<String> 
additionalStopWords) {
-        return this;
-    }
-    public NCModelConfigBuilder withExcludedStopWords(Set<String> 
excludedStopWords) {
-        return this;
-    }
-    public NCModelConfigBuilder withSuspiciousWords(Set<String> 
suspiciousWords) {
-        return this;
-    }
-
-    // TODO: discucc - Alternative - 3 custom words free implementation 
support - discuss it.
-    // We can provie all logic via these components for DE etc.
-    // Function<List<NCNlpWord>, List<NCNlpWord>> filter
-    // input - all sentence's words, output - detected stop/swear words. Empty 
result - not found.
-    public NCModelConfigBuilder withStopWordsFilter(Function<List<NCNlpWord>, 
List<NCNlpWord>> finder) {
+    // 2. 3 custom words detector. Free implementation supported.
+    public NCModelConfigBuilder withStopWordsDetector(NCNlpWordsDetector detector) {
         return this;
     }
-    public NCModelConfigBuilder withSwearWordsFilter(Function<List<NCNlpWord>, 
List<NCNlpWord>> finder) {
+    public NCModelConfigBuilder withSwearWordsDetector(NCNlpWordsDetector detectorr) {
         return this;
     }
-    public NCModelConfigBuilder 
withSuspiciousWordsFilter(Function<List<NCNlpWord>, List<NCNlpWord>> finder) {
+    public NCModelConfigBuilder withSuspiciousWordsDetector(NCNlpWordsDetector detector) {
         return this;
     }
 
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpTokenizer.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/detectors/NCDefaultStopWordsDetector.java
similarity index 71%
copy from 
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpTokenizer.java
copy to 
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/detectors/NCDefaultStopWordsDetector.java
index 72ed9cd..baa99bb 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpTokenizer.java
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/detectors/NCDefaultStopWordsDetector.java
@@ -15,18 +15,19 @@
  * limitations under the License.
  */
 
-package org.apache.nlpcraft.model.impl.opennlp;
+package org.apache.nlpcraft.model.impl.detectors;
 
-import org.apache.nlpcraft.model.NCRequest;
-import org.apache.nlpcraft.model.nlp.NCNlpTokenizer;
 import org.apache.nlpcraft.model.nlp.NCNlpWord;
+import org.apache.nlpcraft.model.nlp.NCNlpWordsDetector;
 
+import java.io.File;
+import java.net.URL;
+import java.util.Collections;
 import java.util.List;
 
-// Implementation by default. Stanford in another module. Can  be provided by 
user.
-public class NCOpenNlpTokenizer implements NCNlpTokenizer {
+public class NCDefaultStopWordsDetector implements NCNlpWordsDetector  {
     @Override
-    public List<NCNlpWord> tokenize(NCRequest req) {
-        return null;
+    public List<NCNlpWord> detect(List<NCNlpWord> sen) {
+        return Collections.emptyList();
     }
 }
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpTokenizer.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/detectors/NCDefaultSwearWordsDetector.java
similarity index 71%
copy from 
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpTokenizer.java
copy to 
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/detectors/NCDefaultSwearWordsDetector.java
index 72ed9cd..1669e65 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpTokenizer.java
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/detectors/NCDefaultSwearWordsDetector.java
@@ -15,18 +15,19 @@
  * limitations under the License.
  */
 
-package org.apache.nlpcraft.model.impl.opennlp;
+package org.apache.nlpcraft.model.impl.detectors;
 
-import org.apache.nlpcraft.model.NCRequest;
-import org.apache.nlpcraft.model.nlp.NCNlpTokenizer;
 import org.apache.nlpcraft.model.nlp.NCNlpWord;
+import org.apache.nlpcraft.model.nlp.NCNlpWordsDetector;
 
+import java.io.File;
+import java.net.URL;
+import java.util.Collections;
 import java.util.List;
 
-// Implementation by default. Stanford in another module. Can  be provided by 
user.
-public class NCOpenNlpTokenizer implements NCNlpTokenizer {
+public class NCDefaultSwearWordsDetector implements NCNlpWordsDetector  {
     @Override
-    public List<NCNlpWord> tokenize(NCRequest req) {
-        return null;
+    public List<NCNlpWord> detect(List<NCNlpWord> sen) {
+        return Collections.emptyList();
     }
 }
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpTokenizer.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/detectors/NCFileWordsDetector.java
similarity index 67%
copy from 
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpTokenizer.java
copy to 
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/detectors/NCFileWordsDetector.java
index 72ed9cd..3043843 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpTokenizer.java
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/detectors/NCFileWordsDetector.java
@@ -15,18 +15,25 @@
  * limitations under the License.
  */
 
-package org.apache.nlpcraft.model.impl.opennlp;
+package org.apache.nlpcraft.model.impl.detectors;
 
-import org.apache.nlpcraft.model.NCRequest;
-import org.apache.nlpcraft.model.nlp.NCNlpTokenizer;
 import org.apache.nlpcraft.model.nlp.NCNlpWord;
+import org.apache.nlpcraft.model.nlp.NCNlpWordsDetector;
 
+import java.io.File;
+import java.net.URL;
+import java.util.Collections;
 import java.util.List;
 
-// Implementation by default. Stanford in another module. Can  be provided by 
user.
-public class NCOpenNlpTokenizer implements NCNlpTokenizer {
+public class NCFileWordsDetector implements NCNlpWordsDetector  {
+    public NCFileWordsDetector(File data) {
+    }
+
+    public NCFileWordsDetector(URL data) {
+    }
+
     @Override
-    public List<NCNlpWord> tokenize(NCRequest req) {
-        return null;
+    public List<NCNlpWord> detect(List<NCNlpWord> sen) {
+        return Collections.emptyList();
     }
 }
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpNerParser.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/opennlp/NCOpenNlpNerParser.java
similarity index 80%
rename from 
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpNerParser.java
rename to 
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/opennlp/NCOpenNlpNerParser.java
index 3650fab..d2fbb5b 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpNerParser.java
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/opennlp/NCOpenNlpNerParser.java
@@ -15,20 +15,20 @@
  * limitations under the License.
  */
 
-package org.apache.nlpcraft.model.impl.opennlp;
+package org.apache.nlpcraft.model.impl.ner.opennlp;
 
 import org.apache.nlpcraft.model.NCModelConfig;
 import org.apache.nlpcraft.model.NCRequest;
 import org.apache.nlpcraft.model.nlp.NCNlpNerParser;
-import org.apache.nlpcraft.model.nlp.NCNlpNerToken;
-import org.apache.nlpcraft.model.nlp.NCNlpRichWord;
+import org.apache.nlpcraft.model.nlp.NCNlpToken;
+import org.apache.nlpcraft.model.nlp.NCNlpWord;
 
 import java.util.List;
 
 // Implementation by default for opennlp NERs. Stanford in another module.
 public class NCOpenNlpNerParser implements NCNlpNerParser {
     @Override
-    public List<NCNlpNerToken> parse(NCRequest req, NCModelConfig cfg, List<NCNlpRichWord> words, List<NCNlpNerToken> elements) {
+    public List<NCNlpToken> parse(NCRequest req, NCModelConfig cfg, List<NCNlpWord> words, List<NCNlpToken> elements) {
         return null;
     }
 }
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/NCSynonymsNerElement.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/NCSynonymsNerElement.java
similarity index 99%
rename from 
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/NCSynonymsNerElement.java
rename to 
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/NCSynonymsNerElement.java
index d16ac7d..0c34c05 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/NCSynonymsNerElement.java
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/NCSynonymsNerElement.java
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.nlpcraft.model.impl.ner;
+package org.apache.nlpcraft.model.impl.ner.synonyms;
 
 import org.apache.nlpcraft.model.NCConversation;
 import org.apache.nlpcraft.model.NCToken;
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/NCSynonymsNerParser.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/NCSynonymsNerParser.java
similarity index 99%
rename from 
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/NCSynonymsNerParser.java
rename to 
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/NCSynonymsNerParser.java
index 458c5f5..89214c0 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/NCSynonymsNerParser.java
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/NCSynonymsNerParser.java
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.nlpcraft.model.impl.ner;
+package org.apache.nlpcraft.model.impl.ner.synonyms;
 
 import org.apache.nlpcraft.model.NCModel;
 import org.apache.nlpcraft.model.nlp.NCNlpNerParser;
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/NCSynonymsNerValue.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/NCSynonymsNerValue.java
similarity index 96%
rename from 
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/NCSynonymsNerValue.java
rename to 
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/NCSynonymsNerValue.java
index e0b8e8a..075b7e5 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/NCSynonymsNerValue.java
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/NCSynonymsNerValue.java
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.nlpcraft.model.impl.ner;
+package org.apache.nlpcraft.model.impl.ner.synonyms;
 
 import java.util.List;
 
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/NCSynonymsNerValueLoader.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/NCSynonymsNerValueLoader.java
similarity index 98%
rename from 
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/NCSynonymsNerValueLoader.java
rename to 
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/NCSynonymsNerValueLoader.java
index e0ebe5a..0744e7a 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/NCSynonymsNerValueLoader.java
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/NCSynonymsNerValueLoader.java
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.nlpcraft.model.impl.ner;
+package org.apache.nlpcraft.model.impl.ner.synonyms;
 
 import java.util.Set;
 
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/builders/NCSynonymsNerElementBuilder.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/builders/NCSynonymsNerElementBuilder.java
similarity index 86%
rename from 
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/builders/NCSynonymsNerElementBuilder.java
rename to 
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/builders/NCSynonymsNerElementBuilder.java
index e3b3410..836d9aa 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/builders/NCSynonymsNerElementBuilder.java
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/builders/NCSynonymsNerElementBuilder.java
@@ -15,11 +15,11 @@
  * limitations under the License.
  */
 
-package org.apache.nlpcraft.model.impl.ner.builders;
+package org.apache.nlpcraft.model.impl.ner.synonyms.builders;
 
-import org.apache.nlpcraft.model.impl.ner.NCSynonymsNerValue;
-import org.apache.nlpcraft.model.impl.ner.NCSynonymsNerElement;
-import org.apache.nlpcraft.model.impl.ner.NCSynonymsNerValueLoader;
+import org.apache.nlpcraft.model.impl.ner.synonyms.NCSynonymsNerValue;
+import org.apache.nlpcraft.model.impl.ner.synonyms.NCSynonymsNerElement;
+import org.apache.nlpcraft.model.impl.ner.synonyms.NCSynonymsNerValueLoader;
 
 import java.util.List;
 import java.util.Map;
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/builders/NCSynonymsNerParserBuilder.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/builders/NCSynonymsNerParserBuilder.java
similarity index 89%
rename from 
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/builders/NCSynonymsNerParserBuilder.java
rename to 
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/builders/NCSynonymsNerParserBuilder.java
index 6272b01..bf95154 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/builders/NCSynonymsNerParserBuilder.java
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/ner/synonyms/builders/NCSynonymsNerParserBuilder.java
@@ -15,10 +15,10 @@
  * limitations under the License.
  */
 
-package org.apache.nlpcraft.model.impl.ner.builders;
+package org.apache.nlpcraft.model.impl.ner.synonyms.builders;
 
-import org.apache.nlpcraft.model.impl.ner.NCSynonymsNerElement;
-import org.apache.nlpcraft.model.impl.ner.NCSynonymsNerParser;
+import org.apache.nlpcraft.model.impl.ner.synonyms.NCSynonymsNerElement;
+import org.apache.nlpcraft.model.impl.ner.synonyms.NCSynonymsNerParser;
 
 import java.io.File;
 import java.util.List;
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpTokenizer.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/tokenizer/NCOpenNlpTokenizer.java
similarity index 96%
rename from 
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpTokenizer.java
rename to 
nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/tokenizer/NCOpenNlpTokenizer.java
index 72ed9cd..7cccfd8 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/opennlp/NCOpenNlpTokenizer.java
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/tokenizer/NCOpenNlpTokenizer.java
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.nlpcraft.model.impl.opennlp;
+package org.apache.nlpcraft.model.impl.tokenizer;
 
 import org.apache.nlpcraft.model.NCRequest;
 import org.apache.nlpcraft.model.nlp.NCNlpTokenizer;
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpNerParser.java 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpNerParser.java
index 5f46915..37c8ab9 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpNerParser.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpNerParser.java
@@ -61,5 +61,5 @@ public interface NCNlpNerParser {
      * @return List of custom elements. List can be empty or {@code null} if 
no model elements detected.
      * @see NCModel#getParsers()
      */
-    List<NCNlpNerToken> parse(NCRequest req, NCModelConfig cfg, List<NCNlpRichWord> words, List<NCNlpNerToken> toks);
+    List<NCNlpToken> parse(NCRequest req, NCModelConfig cfg, List<NCNlpWord> words, List<NCNlpToken> toks);
 }
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpNerToken.java 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpToken.java
similarity index 96%
rename from 
nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpNerToken.java
rename to nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpToken.java
index 2fc3950..0d182e0 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpNerToken.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpToken.java
@@ -20,14 +20,14 @@ package org.apache.nlpcraft.model.nlp;
 import org.apache.nlpcraft.model.NCModel;
 import org.apache.nlpcraft.model.NCModelConfig;
 import org.apache.nlpcraft.model.NCRequest;
-import org.apache.nlpcraft.model.impl.ner.NCSynonymsNerElement;
+import org.apache.nlpcraft.model.impl.ner.synonyms.NCSynonymsNerElement;
 
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
 
 // NCNlpNerTokensParser parsing result.
-public interface NCNlpNerToken {
+public interface NCNlpToken {
     /**
      * Gets ID of the detected model element. Note that it <b>must 
correspond</b> to one of the elements
      * defined in the model. In other words, the parser doesn't define a new 
model element but rather
@@ -45,7 +45,7 @@ public interface NCNlpNerToken {
      *
      * @return List of NLP custom words that comprise detected custom model element.
      */
-    List<NCNlpRichWord> getWords();
+    List<NCNlpWord> getWords();
 
     /**
      * TODO: javadoc
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpWord.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpWord.java
index dca1f5d..982c23c 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpWord.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpWord.java
@@ -27,4 +27,14 @@ public interface NCNlpWord {
     int getStart();
     int getEnd();
     int getLength();
+
+    default boolean isStopWord() {
+        return false;
+    }
+    default boolean isSwearWord() {
+        return false;
+    }
+    default boolean isSuspiciousWord() {
+        return false;
+    }
 }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpRichWord.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpWordsDetector.java
similarity index 70%
rename from nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpRichWord.java
rename to nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpWordsDetector.java
index 1396825..794821c 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpRichWord.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/nlp/NCNlpWordsDetector.java
@@ -17,20 +17,16 @@
 
 package org.apache.nlpcraft.model.nlp;
 
-/**
- * Extended word data, enriched by NLP.
- * It is argument for NCNlpNerTokensParser.
- */
-public interface NCNlpRichWord extends NCNlpWord {
-    boolean isStopWord();
-    boolean isSwearWord();
-
-    // TODO: add
-    boolean isSuspiciousWord();
+import org.apache.nlpcraft.model.nlp.NCNlpWord;
 
-    // TODO: drop
-//    boolean isBracketed();
-//    boolean isQuoted();
-//    boolean isKnownWord();
+import java.util.List;
 
+public interface NCNlpWordsDetector {
+    /**
+     * Finds words by some criteria in given words list.
+     *
+     * @param sen
+     * @return
+     */
+    List<NCNlpWord> detect(List<NCNlpWord> sen);
 }
diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/NCSpec.java b/nlpcraft/src/test/java/org/apache/nlpcraft/NCSpec.java
index 69e2f75..d8b9eaf 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/NCSpec.java
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/NCSpec.java
@@ -24,15 +24,15 @@ import org.apache.nlpcraft.model.NCRejection;
 import org.apache.nlpcraft.model.NCModelConfig;
 import org.apache.nlpcraft.model.NCResult;
 import org.apache.nlpcraft.model.builders.NCModelConfigBuilder;
-import org.apache.nlpcraft.model.impl.ner.NCSynonymsNerValue;
+import org.apache.nlpcraft.model.impl.ner.synonyms.NCSynonymsNerValue;
 import org.apache.nlpcraft.model.builders.NCModelBuilder;
-import org.apache.nlpcraft.model.impl.ner.NCSynonymsNerElement;
-import org.apache.nlpcraft.model.impl.ner.NCSynonymsNerParser;
-import org.apache.nlpcraft.model.impl.ner.NCSynonymsNerValueLoader;
-import org.apache.nlpcraft.model.impl.ner.builders.NCSynonymsNerElementBuilder;
-import org.apache.nlpcraft.model.impl.ner.builders.NCSynonymsNerParserBuilder;
-import org.apache.nlpcraft.model.impl.opennlp.NCOpenNlpNerParser;
-import org.apache.nlpcraft.model.impl.opennlp.NCOpenNlpTokenizer;
+import org.apache.nlpcraft.model.impl.ner.synonyms.NCSynonymsNerElement;
+import org.apache.nlpcraft.model.impl.ner.synonyms.NCSynonymsNerParser;
+import org.apache.nlpcraft.model.impl.ner.synonyms.NCSynonymsNerValueLoader;
+import org.apache.nlpcraft.model.impl.ner.synonyms.builders.NCSynonymsNerElementBuilder;
+import org.apache.nlpcraft.model.impl.ner.synonyms.builders.NCSynonymsNerParserBuilder;
+import org.apache.nlpcraft.model.impl.ner.opennlp.NCOpenNlpNerParser;
+import org.apache.nlpcraft.model.impl.tokenizer.NCOpenNlpTokenizer;
 import org.junit.jupiter.api.Test;
 
 import java.io.File;
@@ -87,9 +87,10 @@ public class NCSpec {
                 // Common.
                 withId("modleId").
                 withName("name").
-                withSwearWordsAllowed(true).
-                // Stopwords etc.
-                withAdditionalStopWords(new HashSet<>(Arrays.asList("x1", "x2"))).
+                // TODO:
+//                withSwearWordsAllowed(true).
+//                // Stopwords etc.
+//                withAdditionalStopWords(new HashSet<>(Arrays.asList("x1", "x2"))).
                 // Nlp tokenizer.
                 withTokenizer(new NCOpenNlpTokenizer()).
                 // NERs.

[incubator-nlpcraft] branch NLPCRAFT-468 updated: WIP.

Reply via email to