This is an automated email from the ASF dual-hosted git repository.
aradzinski pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/master by this push:
new 33ad4d9 WIP.
33ad4d9 is described below
commit 33ad4d9c6fff2bd5220358dfede138afe467e67d
Author: Aaron Radzinski <[email protected]>
AuthorDate: Tue Dec 7 18:22:42 2021 -0800
WIP.
---
.../main/scala/org/apache/nlpcraft/NCEntity.java | 121 +-----------
.../{NCEntityParser.java => NCEntityEnricher.java} | 25 ++-
.../scala/org/apache/nlpcraft/NCEntityParser.java | 3 +-
.../scala/org/apache/nlpcraft/NCModelConfig.java | 217 ++-------------------
.../{NCToken.java => NCParameterized.java} | 65 +++---
.../main/scala/org/apache/nlpcraft/NCToken.java | 8 +-
.../{NCEntityParser.java => NCTokenEnricher.java} | 21 +-
7 files changed, 105 insertions(+), 355 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntity.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntity.java
index 35aac20..6f2899e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntity.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntity.java
@@ -22,109 +22,25 @@ import java.util.List;
/**
*
*/
-public interface NCEntity {
- /**
- * Gets ID of the request this entity is part of.
- *
- * @return ID of the request this entity is part of.
- */
- String getRequestId();
-
+public interface NCEntity extends NCParameterized {
/**
*
* @return
*/
- String getId();
-
- /**
- * Gets the optional parent ID of the model element this entity
represents. This only available
- * for user-defined model elements - built-in entities do not have parents
and this will return {@code null}.
- *
- * @return ID of the entity's element immediate parent or {@code null} if
not available.
- * @see NCElement#getParentId()
- * @see #getAncestors()
- */
- String getParentId();
-
- /**
- * Gets the list of all parent IDs from this entity up to the root. This
only available
- * for user-defined model elements = built-in entities do not have parents
and will return an empty list.
- *
- * @return List, potentially empty but never {@code null}, of all parent
IDs from this entity up to the root.
- * @see #getParentId()
- */
- List<String> getAncestors();
+ List<NCToken> getTokens();
/**
- * Tests whether this entity is a child of given entity ID. It is
equivalent to:
- * <pre class="brush: java">
- * return getAncestors().contains(tokId);
- * </pre>
- *
- * @param tokId Ancestor entity ID.
- * @return <code>true</code> this entity is a child of given entity ID,
<code>false</code> otherwise.
- */
- default boolean isChildOf(String tokId) {
- return getAncestors().contains(tokId);
- }
-
- /**
- * Gets the value if this entity was detected via element's value (or its
synonyms). Otherwise,
- * returns {@code null}. Only applicable for user-defined model elements -
built-in entities
- * do not have values, and it will return {@code null}.
- *
- * @return Value for the user-defined model element or {@code null}, if
not available.
- * @see NCElement#getValues()
- */
- String getValue();
-
- /**
- * Gets the list of groups this entity belongs to. Note that, by default,
if not specified explicitly,
- * entity always belongs to one group with ID equal to entity ID.
- *
- * @return entity groups list. Never {@code null} - but can be empty.
- * @see NCElement#getGroups()
- */
- List<String> getGroups();
-
- /**
- * Tests whether this entity belongs to the given group. It is equivalent
to:
- * <pre class="brush: java">
- * return getGroups().contains(grp);
- * </pre>
- *
- * @param grp Group to test.
- * @return <code>True</code> if this entity belongs to the group
<code>grp</code>, {@code false} otherwise.
- */
- default boolean isMemberOf(String grp) {
- return getGroups().contains(grp);
- }
-
- /**
- * Gets start character index of this entity in the original text.
- *
- * @return Start character index of this entity.
- */
- int getStartCharIndex();
-
- /**
- * Gets end character index of this entity in the original text.
+ * Gets ID of the request this entity is part of.
*
- * @return End character index of this entity.
+ * @return ID of the request this entity is part of.
*/
- int getEndCharIndex();
+ String getRequestId();
/**
*
- * @return Whether this entity is a stopword.
- */
- boolean isStopWord();
-
- /**
- *
- * @return Original user input text for this entity.
+ * @return
*/
- String getOriginalText();
+ String getId();
/**
*
@@ -133,29 +49,6 @@ public interface NCEntity {
int getIndex();
/**
- *
- * @return Normalized user input text for this entity.
- */
- String getNormalizedText();
- /**
- *
- * @return Lemma of this entity, i.e. a canonical form of this word.
- */
- String getLemma();
-
- /**
- *
- * @return Stem of this entity.
- */
- String getStem();
-
- /**
- *
- * @return Penn Treebank POS tag for this entity.
- */
- String getPos();
-
- /**
* A shortcut method that gets internal globally unique system ID of the
entity.
* <p>
* This method is equivalent to:
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityParser.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityEnricher.java
similarity index 57%
copy from nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityParser.java
copy to nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityEnricher.java
index fedf377..ccc49f2 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityParser.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityEnricher.java
@@ -15,21 +15,30 @@
* limitations under the License.
*/
-package org.apache.nlpcraft;
+package org.apache.nlpcraft;/*
+ _________ ______________
+ __ ____/_______________ __ \__ /_____ _____ __
+ _ / _ __ \_ ___/_ /_/ /_ /_ __ `/_ / / /
+ / /___ / /_/ /(__ )_ ____/_ / / /_/ /_ /_/ /
+ \____/ \____//____/ /_/ /_/ \__,_/ _\__, /
+ /____/
-import java.util.List;
+ 2D ASCII JVM GAME ENGINE FOR SCALA3
+ (C) 2021 Rowan Games, Inc.
+ All rights reserved.
+*/
+
+import java.util.*;
/**
*
*/
-public interface NCEntityParser {
+public interface NCEntityEnricher {
/**
*
* @param req
- * @param cfg
- * @param toks
- * @param ents List of already parsed entities prio to this step. Can be
empty but never {@code null}.
- * @return
+ * @param cfg
+ * @param ents
*/
- List<NCEntity> parse(NCRequest req, NCModelConfig cfg, List<NCToken> toks,
List<NCEntity> ents);
+ void enrich(NCRequest req, NCModelConfig cfg, List<NCEntity> ents);
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityParser.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityParser.java
index fedf377..d58dbc8 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityParser.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityParser.java
@@ -28,8 +28,7 @@ public interface NCEntityParser {
* @param req
* @param cfg
* @param toks
- * @param ents List of already parsed entities prio to this step. Can be
empty but never {@code null}.
* @return
*/
- List<NCEntity> parse(NCRequest req, NCModelConfig cfg, List<NCToken> toks,
List<NCEntity> ents);
+ List<NCEntity> parse(NCRequest req, NCModelConfig cfg, List<NCToken> toks);
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfig.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfig.java
index fa6f4b9..998cb16 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfig.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfig.java
@@ -22,17 +22,7 @@ import java.util.*;
/**
*
*/
-public interface NCModelConfig {
- /**
- * Default value for {@link #getMinWords()} method.
- */
- int DFLT_MIN_WORDS = 1;
-
- /**
- * Default value for {@link #getMaxWords()} method.
- */
- int DFLT_MAX_WORDS = 50;
-
+public interface NCModelConfig extends NCParameterized {
/**
* Default value for {@link #getMinTokens()} method.
*/
@@ -54,15 +44,22 @@ public interface NCModelConfig {
boolean DFLT_IS_NOT_LATIN_CHARSET_ALLOWED = false;
/**
- * Default value for {@link #isSwearWordsAllowed()} method.
+ *
+ * @return
+ */
+ NCTokenParser getTokenParser();
+
+ /**
+ *
+ * @return
*/
- boolean DFLT_IS_SWEAR_WORDS_ALLOWED = false;
+ List<NCTokenEnricher> getTokenEnrichers();
/**
*
* @return
*/
- NCTokenParser getTokenParser();
+ List<NCEntityEnricher> getEntityEnrichers();
/**
*
@@ -72,52 +69,20 @@ public interface NCModelConfig {
/**
* Gets unique, <i>immutable</i> ID of this model.
- * <p>
- * Note that <b>model IDs are immutable</b> while name and version
- * can be changed freely. Changing model ID is equal to creating a
completely new model.
- * Model IDs (unlike name and version) are not exposed to the end user and
only serve a
- * technical purpose. ID's max length is 32 characters.
- * <p>
- * <b>JSON</b>
- * <br>
- * If using JSON/YAML model presentation this is set by <code>id</code>
property:
- * <pre class="brush: js">
- * {
- * "id": "my.model.id"
- * }
- * </pre>
*
* @return Unique, <i>immutable</i> ID of this model.
*/
String getId();
/**
- * Gets descriptive name of this model. Name's max length is 64 characters.
- * <p>
- * <b>JSON</b>
- * <br>
- * If using JSON/YAML model presentation this is set by <code>name</code>
property:
- * <pre class="brush: js">
- * {
- * "name": "My Model"
- * }
- * </pre>
+ * Gets descriptive name of this model.
*
* @return Descriptive name for this model.
*/
String getName();
/**
- * Gets the version of this model using semantic versioning. Version's max
length is 16 characters.
- * <p>
- * <b>JSON</b>
- * <br>
- * If using JSON/YAML model presentation this is set by
<code>version</code> property:
- * <pre class="brush: js">
- * {
- * "version": "1.0.0"
- * }
- * </pre>
+ * Gets the version of this model using semantic versioning.
*
* @return A version compatible with (<a
href="http://www.semver.org">www.semver.org</a>) specification.
*/
@@ -126,15 +91,6 @@ public interface NCModelConfig {
/**
* Gets optional short model description. This can be displayed by the
management tools.
* Default implementation returns <code>null</code>.
- * <p>
- * <b>JSON</b>
- * <br>
- * If using JSON/YAML model presentation this is set by
<code>description</code> property:
- * <pre class="brush: js">
- * {
- * "description": "Model description..."
- * }
- * </pre>
*
* @return Optional short model description. Can return <code>null</code>.
*/
@@ -153,169 +109,36 @@ public interface NCModelConfig {
}
/**
- * Gets minimum word count (<i>including</i> stopwords) below which user
input will be automatically
- * rejected as too short. In almost all cases this value should be greater
than or equal to one.
- * <p>
- * <b>Default</b>
- * <br>
- * If not provided by the model the default value {@link #DFLT_MIN_WORDS}
will be used.
- * <p>
- * <b>JSON</b>
- * <br>
- * If using JSON/YAML model presentation this is set by
<code>minWords</code> property:
- * <pre class="brush: js">
- * {
- * "minWords": 2
- * }
- * </pre>
*
- * @return Minimum word count (<i>including</i> stopwords) below which
user input will be automatically
- * rejected as too short.
- */
- default int getMinWords() {
- return DFLT_MIN_WORDS;
- }
-
- /**
- * Gets maximum word count (<i>including</i> stopwords) above which user
input will be automatically
- * rejected as too long. In almost all cases this value should be greater
than or equal to one.
- * <p>
- * <b>Default</b>
- * <br>
- * If not provided by the model the default value {@link #DFLT_MAX_WORDS}
will be used.
- * <p>
- * <b>JSON</b>
- * <br>
- * If using JSON/YAML model presentation this is set by
<code>maxWords</code> property:
- * <pre class="brush: js">
- * {
- * "maxWords": 50
- * }
- * </pre>
- *
- * @return Maximum word count (<i>including</i> stopwords) above which
user input will be automatically
- * rejected as too long.
- */
- default int getMaxWords() {
- return DFLT_MAX_WORDS;
- }
-
- /**
- * Gets minimum number of all tokens (system and user defined) below which
user input will be
- * automatically rejected as too short. In almost all cases this value
should be greater than or equal to one.
- * <p>
- * <b>Default</b>
- * <br>
- * If not provided by the model the default value {@link #DFLT_MIN_TOKENS}
will be used.
- * <p>
- * <b>JSON</b>
- * <br>
- * If using JSON/YAML model presentation this is set by
<code>minTokens</code> property:
- * <pre class="brush: js">
- * {
- * "minTokens": 1
- * }
- * </pre>
- *
- * @return Minimum number of all tokens.
+ * @return
*/
default int getMinTokens() {
return DFLT_MIN_TOKENS;
}
/**
- * Gets maximum number of all tokens (system and user defined) above which
user input will be
- * automatically rejected as too long. Note that sentences with large
number of token can result
- * in significant processing delay and substantial memory consumption.
- * <p>
- * <b>Default</b>
- * <br>
- * If not provided by the model the default value {@link #DFLT_MAX_TOKENS}
will be used.
- * <p>
- * <b>JSON</b>
- * <br>
- * If using JSON/YAML model presentation this is set by
<code>maxTokens</code> property:
- * <pre class="brush: js">
- * {
- * "maxTokens": 100
- * }
- * </pre>
*
- * @return Maximum number of all tokens.
+ * @return
*/
default int getMaxTokens() {
return DFLT_MAX_TOKENS;
}
+ int getMaxStopWords();
+
/**
- * Gets minimum word count (<i>excluding</i> stopwords) below which user
input will be automatically rejected
- * as ambiguous sentence.
- * <p>
- * <b>Default</b>
- * <br>
- * If not provided by the model the default value {@link
#DFLT_MIN_NON_STOPWORDS} will be used.
- * <p>
- * <b>JSON</b>
- * <br>
- * If using JSON/YAML model presentation this is set by
<code>minNonStopwords</code> property:
- * <pre class="brush: js">
- * {
- * "minNonStopwords": 2
- * }
- * </pre>
*
- * @return Minimum word count (<i>excluding</i> stopwords) below which
user input will be automatically
- * rejected as too short.
+ * @return
*/
- default int getMinNonStopwords() {
+ default int getMinNonStopWords() {
return DFLT_MIN_NON_STOPWORDS;
}
/**
- * Whether to allow non-Latin charset in user input. Currently, only
- * Latin charset is supported. However, model can choose whether to
automatically reject user
- * input with characters outside of Latin charset. If {@code false} such
user input will be automatically
- * rejected.
- * <p>
- * <b>Default</b>
- * <br>
- * If not provided by the model the default value {@link
#DFLT_IS_NOT_LATIN_CHARSET_ALLOWED} will be used.
- * <p>
- * <b>JSON</b>
- * <br>
- * If using JSON/YAML model presentation this is set by
<code>nonLatinCharsetAllowed</code> property:
- * <pre class="brush: js">
- * {
- * "nonLatinCharsetAllowed": false
- * }
- * </pre>
*
- * @return Whether to allow non-Latin charset in user input.
+ * @return
*/
default boolean isNotLatinCharsetAllowed() {
return DFLT_IS_NOT_LATIN_CHARSET_ALLOWED;
}
-
- /**
- * Whether to allow known swear words in user input. If {@code false} -
user input with
- * detected known swear words will be automatically rejected.
- * <p>
- * <b>Default</b>
- * <br>
- * If not provided by the model the default value {@link
#DFLT_IS_SWEAR_WORDS_ALLOWED} will be used.
- * <p>
- * <b>JSON</b>
- * <br>
- * If using JSON/YAML model presentation this is set by
<code>swearWordsAllowed</code> property:
- * <pre class="brush: js">
- * {
- * "swearWordsAllowed": false
- * }
- * </pre>
- *
- * @return Whether to allow known swear words in user input.
- */
- default boolean isSwearWordsAllowed() {
- return DFLT_IS_SWEAR_WORDS_ALLOWED;
- }
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCParameterized.java
similarity index 50%
copy from nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
copy to nlpcraft/src/main/scala/org/apache/nlpcraft/NCParameterized.java
index 14927df..854fbba 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCParameterized.java
@@ -15,57 +15,68 @@
* limitations under the License.
*/
-package org.apache.nlpcraft;
+package org.apache.nlpcraft;/*
+ _________ ______________
+ __ ____/_______________ __ \__ /_____ _____ __
+ _ / _ __ \_ ___/_ /_/ /_ /_ __ `/_ / / /
+ / /___ / /_/ /(__ )_ ____/_ / / /_/ /_ /_/ /
+ \____/ \____//____/ /_/ /_/ \__,_/ _\__, /
+ /____/
+
+ 2D ASCII JVM GAME ENGINE FOR SCALA3
+ (C) 2021 Rowan Games, Inc.
+ All rights reserved.
+*/
+
+import java.util.Optional;
/**
*
*/
-public interface NCToken {
- /**
- *
- * @return
- */
- String getOriginalText();
-
+public interface NCParameterized {
/**
- *
+ *
+ * @param key
+ * @param <T>
* @return
*/
- String getNormalizedText();
+ <T> T get(String key);
/**
- *
+ *
+ * @param key
+ * @param <T>
* @return
*/
- String getLemma();
+ <T> Optional<T> getOpt(String key);
/**
- *
- * @return
- */
- String getStem();
-
- /**
- *
- * @return
+ *
+ * @param key
+ * @param obj
*/
- String getPos();
+ void put(String key, Object obj);
/**
- *
+ *
+ * @param key
+ * @param obj
+ * @param <T>
* @return
*/
- int getStartCharIndex();
+ <T> T putIfAbsent(String key, T obj);
/**
- *
+ *
+ * @param key
* @return
*/
- int getEndCharIndex();
+ boolean contains(String key);
/**
- *
+ *
+ * @param key
* @return
*/
- int getLength();
+ boolean remove(String key);
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
index 14927df..bcb0aa5 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
@@ -20,7 +20,7 @@ package org.apache.nlpcraft;
/**
*
*/
-public interface NCToken {
+public interface NCToken extends NCParameterized {
/**
*
* @return
@@ -55,6 +55,12 @@ public interface NCToken {
*
* @return
*/
+ boolean isStopWord();
+
+ /**
+ *
+ * @return
+ */
int getStartCharIndex();
/**
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityParser.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenEnricher.java
similarity index 60%
copy from nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityParser.java
copy to nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenEnricher.java
index fedf377..a26abe7 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityParser.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenEnricher.java
@@ -15,21 +15,30 @@
* limitations under the License.
*/
-package org.apache.nlpcraft;
+package org.apache.nlpcraft;/*
+ _________ ______________
+ __ ____/_______________ __ \__ /_____ _____ __
+ _ / _ __ \_ ___/_ /_/ /_ /_ __ `/_ / / /
+ / /___ / /_/ /(__ )_ ____/_ / / /_/ /_ /_/ /
+ \____/ \____//____/ /_/ /_/ \__,_/ _\__, /
+ /____/
-import java.util.List;
+ 2D ASCII JVM GAME ENGINE FOR SCALA3
+ (C) 2021 Rowan Games, Inc.
+ All rights reserved.
+*/
+
+import java.util.*;
/**
*
*/
-public interface NCEntityParser {
+public interface NCTokenEnricher {
/**
*
* @param req
* @param cfg
* @param toks
- * @param ents List of already parsed entities prio to this step. Can be
empty but never {@code null}.
- * @return
*/
- List<NCEntity> parse(NCRequest req, NCModelConfig cfg, List<NCToken> toks,
List<NCEntity> ents);
+ void enrich(NCRequest req, NCModelConfig cfg, List<NCToken> toks);
}