This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-472
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-472 by this push:
new e1446ea WIP.
new 9741585 Merge remote-tracking branch 'origin/NLPCRAFT-472' into NLPCRAFT-472
e1446ea is described below
commit e1446ea8d88bdf4b09434062e753278866acba95
Author: Sergey Kamov <[email protected]>
AuthorDate: Thu Dec 30 00:01:00 2021 +0300
WIP.
---
.../main/scala/org/apache/nlpcraft/NCRequest.java | 8 +-
.../entity/parser/semantic/NCSemanticElement.java | 269 +--------------------
...anticValue.java => NCSemanticElementValue.java} | 9 +-
.../parser/semantic/NCSemanticEntityParser.java | 36 ++-
.../{NCStemmer.java => NCSemanticTextStemmer.java} | 9 +-
.../parser/semantic/NCSemanticValueLoader.java | 24 --
.../semantic/impl/NCSemanticEntityParserImpl.scala | 12 +-
.../en/NCENSemanticTextStemmer.java} | 9 +-
.../parser/opennlp/impl/en/NCEnOpenNlpImpl.scala | 26 +-
.../semantic/NCSemanticEntityParserSpec.scala | 3 +-
.../apache/nlpcraft/nlp/util/NCTestRequest.scala | 11 +-
11 files changed, 96 insertions(+), 320 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCRequest.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCRequest.java
index 2b181ae..72faad8 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCRequest.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCRequest.java
@@ -17,7 +17,7 @@
package org.apache.nlpcraft;
-import java.util.Map;
+import java.util.*;
/**
* Information about the user request.
@@ -52,6 +52,12 @@ public interface NCRequest {
String getText();
/**
+ *
+ * @return
+ */
+ List<NCWord> getWords();
+
+ /**
* Gets UTC/GMT timestamp in millis when user input was received.
*
* @return UTC/GMT timestamp in ms when user input was received.
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElement.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElement.java
index cd59b38..b3599af 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElement.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElement.java
@@ -19,301 +19,54 @@ package org.apache.nlpcraft.nlp.entity.parser.semantic;
import java.util.Collections;
import java.util.List;
-import java.util.Map;
-import java.util.Optional;
+/**
+ *
+ */
public interface NCSemanticElement {
/**
- * Gets unique ID of this element.
- * <p>
- * This unique ID should be human-readable for simpler debugging and
testing of the model.
- * Although element ID could be any arbitrary string it is highly
recommended having
- * element ID as a lower case string starting with some model prefix,
followed by colon and
- * then the element's name. For example, some built-in NLPCraft IDs are:
<code>nlpcraft:date</code>,
- * <code>nlpcraft:city</code>.
- * <p>
- * Few important notes:
- * <ul>
- * <li>Element IDs starting with <code>nlpcraft:</code> are reserved
for built-in NLPCraft IDs.</li>
- * <li>
- * Element ID is an implicit synonym for that element.
- * Thus element ID can be used in the user input directly to
clearly
- * disambiguate the element in the input sentence instead of
relying on synonyms or other
- * ways of detection.
- * </li>
- * </ul>
- * <p>
- * <b>JSON</b>
- * <br>
- * If using JSON/YAML model presentation this is set by <code>id</code>
property:
- * <pre class="brush: js, highlight: [3]">
- * "elements": [
- * {
- * "id": "phone:act",
- * "description": "Phone action.",
- * "synonyms": [
- * "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
- * ]
- * }
- * ]
- * </pre>
*
- * @see NCToken#getId()
- * @return Unique ID of this element.
+ * @return
*/
String getId();
/**
- * Gets the list of groups this element belongs to.
- * <p>
- * Model element can belong to one or more groups. By default, the element
belongs to a single group whose group
- * ID is equal to its {@link #getId() ID}. The proper grouping of the
model elements is required for operation
- * of Short-Term-Memory (STM) in {@link NCConversation conversation} (if
and when conversation
- * is used). Specifically, a token (i.e. found model element) that is part
of the group set will override
- * other tokens from the same set or its superset. In other words, tokens
with a smaller group set
- * (more specific token) will override the tokens from a larger group set
(more generic tokens).
- * <p>
- * Note that built-in tokens (including from 3rd party token providers)
belong to a single group whose group
- * ID is equal to their IDs.
- * <p>
- * <b>JSON</b>
- * <br>
- * If using JSON/YAML model presentation this is set by
<code>groups</code> property:
- * <pre class="brush: js, highlight: [5]">
- * "elements": [
- * {
- * "id": "phone:act",
- * "description": "Phone action.",
- * "groups": ["group1", "group2"]
- * "synonyms": [
- * "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
- * ]
- * }
- * ]
- * </pre>
*
- * @return List of groups this element belongs to. By default, the model
element belongs to one group
- * with ID equal to the element {@link #getId() ID}.
- * @see NCConversation
- * @see #getId()
+ * @return
*/
default List<String> getGroups() {
return Collections.singletonList(getId());
}
/**
- * Shortcut method to test if this element is a member of given group. It
is equivalent to:
- * <pre class="brush: java">
- * return getGroups().contains(grp);
- * </pre>
*
- * @param grp Token group to test.
- * @return {@code True} if this element belongs to the given group, {@code
false} otherwise.
+ * @return
*/
default boolean isMemberOf(String grp) {
return getGroups().contains(grp);
}
/**
- * Gets optional user-defined element's metadata. When a {@link NCToken
token} for this element
- * is detected in the input this metadata is merged into {@link
NCToken#getMetadata()} method returned metadata.
- * <p>
- * <b>JSON</b>
- * <br>
- * If using JSON/YAML model presentation this is set by
<code>description</code> property:
- * <pre class="brush: js, highlight: [8,9,10,11,12]">
- * "elements": [
- * {
- * "id": "phone:act",
- * "description": "Phone action.",
- * "synonyms": [
- * "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
- * ],
- * "metadata": {
- * "str": "val1",
- * "num": 100,
- * "bool": false
- * }
- * }
- * ]
- * </pre>
*
- * @return Element's metadata or empty collection if none provided.
Default implementation return empty collection.
- */
- default Map<String, Object> getMetadata() {
- return Collections.emptyMap();
- }
-
- /**
- * Gets optional element description.
- * <p>
- * <b>JSON</b>
- * <br>
- * If using JSON/YAML model presentation this is set by
<code>description</code> property:
- * <pre class="brush: js, highlight: [4]">
- * "elements": [
- * {
- * "id": "phone:act",
- * "description": "Phone action.",
- * "synonyms": [
- * "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
- * ]
- * }
- * ]
- * </pre>
- *
- * @return Optional element description. Default implementation returns
{@code null}.
+ * @return
*/
default String getDescription() {
return null;
}
/**
- * Gets optional map of {@link NCValue values} for this element.
- * <p>
- * Each element can generally be recognized either by one of its synonyms
or values. Elements and their values
- * are analogous to types and instances of that type in programming
languages. Each value
- * has a name and optional set of its own synonyms by which that value,
and ultimately its element, can be
- * recognized by. Note that value name itself acts as an implicit synonym
even when no additional synonyms added
- * for that value.
- * <p>
- * Consider this example. A model element {@code x:car} can have:
- * <ul>
- * <li>
- * Set of general synonyms:
- * <code>{transportation|transport|_}
{vehicle|car|sedan|auto|automobile|suv|crossover|coupe|truck}</code>
- * </li>
- * <li>Set of values:
- * <ul>
- * <li>{@code mercedes} with synonyms {@code (mercedes,
mercedes-benz, mb, benz)}</li>
- * <li>{@code bmw} with synonyms {@code (bmw, bimmer)}</li>
- * <li>{@code chevrolet} with synonyms {@code (chevy,
chevrolet)}</li>
- * </ul>
- * </li>
- * </ul>
- * With that setup {@code x:car} element will be recognized by any of the
following input sub-string:
- * <ul>
- * <li>{@code transport car}</li>
- * <li>{@code benz}</li>
- * <li>{@code automobile}</li>
- * <li>{@code transport vehicle}</li>
- * <li>{@code sedan}</li>
- * <li>{@code chevy}</li>
- * <li>{@code bimmer}</li>
- * <li>{@code x:car}</li>
- * </ul>
- * <p>
- * <b>JSON</b>
- * <br>
- * If using JSON/YAML model presentation this is set by
<code>values</code> property:
- * <pre class="brush: js, highlight: [8,9,10,11,12,13]">
- * "elements": [
- * {
- * "id": "phone:act",
- * "description": "Phone action.",
- * "synonyms": [
- * "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
- * ],
- * "values": [
- * {
- * "name": "name1",
- * "synonyms": ["syn1", "syn2"]
- * }
- * ]
- * }
- * ]
- * </pre>
*
- * @return Map of value's name and its synonyms or {@code null} if not
defined.
+ * @return
*/
- default List<NCSemanticValue> getValues() {
+ default List<NCSemanticElementValue> getValues() {
return Collections.emptyList();
}
/**
- * Gets optional ID of the immediate parent element. Parent ID allows
model elements to form into hierarchy.
- * <p>
- * <b>JSON</b>
- * <br>
- * If using JSON/YAML model presentation this is set by
<code>parentId</code> property:
- * <pre class="brush: js, highlight: [5]">
- * "elements": [
- * {
- * "id": "phone:act",
- * "description": "Phone action.",
- * "parentId": "parent",
- * "synonyms": [
- * "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
- * ]
- * }
- * ]
- * </pre>
- *
- * @return Optional parent element ID, or {@code null} if not specified.
Default implementation returns
- * {@code null}.
- */
- default String getParentId() {
- return null;
- }
-
- /**
- * Gets the list of synonyms by which this model element will be
recognized by. Read more about
- * many forms of synonyms in <a target=_
href="https://nlpcraft.apache.org/data-model.html">Data Model</a> section
- * and review <a target=_
href="https://github.com/apache/incubator-nlpcraft/tree/master/nlpcraft-examples">examples</a>.
- * <p>
- * <b>JSON</b>
- * <br>
- * If using JSON/YAML model presentation this is set by
<code>synonyms</code> property:
- * <pre class="brush: js, highlight: [5,6,7]">
- * "elements": [
- * {
- * "id": "phone:act",
- * "description": "Phone action.",
- * "synonyms": [
- * "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
- * ]
- * }
- * ]
- * </pre>
*
- * @return List of synonyms for this element. List is generally optional
since element's ID acts
- * as an implicit synonym. Default implementation returns an empty
list.
+ * @return
*/
default List<String> getSynonyms() {
return Collections.emptyList();
}
-
- /**
- * Gets optional dynamic value loader. This loader will be used
additionally to any
- * values defined in {@link #getValues()} method. Default implementation
returns {@code null}.
- * <p>
- * <b>JSON</b>
- * <br>
- * If using JSON/YAML model presentation this is set by
<code>valueLoader</code> property with value
- * of a fully qualified class name implementing {@link NCValueLoader}
interface. Note that
- * only one instance of the value loader will be created per model and
given class name:
- * <pre class="brush: js, highlight: [14]">
- * "elements": [
- * {
- * "id": "phone:act",
- * "description": "Phone action.",
- * "synonyms": [
- * "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
- * ],
- * "values": [
- * {
- * "name": "name1",
- * "synonyms": ["syn1", "syn2"]
- * }
- * ],
- * "valueLoader": "my.package.ValueLoader"
- * }
- * ]
- * </pre>
- *
- * @return Optional instance of dynamic value loader.
- */
- default Optional<NCSemanticValueLoader> getValueLoader() {
- return Optional.empty();
- }
-}
+}
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValue.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElementValue.java
similarity index 90%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValue.java
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElementValue.java
index 5f45a79..a7106c8 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValue.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElementValue.java
@@ -19,6 +19,13 @@ package org.apache.nlpcraft.nlp.entity.parser.semantic;
import java.util.List;
-public interface NCSemanticValue {
+/**
+ *
+ */
+public interface NCSemanticElementValue {
+ /**
+ *
+ * @return
+ */
List<String> getSynonyms();
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
index 09e83fd..717b1bd 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
@@ -27,23 +27,47 @@ import
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticEntityParse
import java.io.File;
import java.util.*;
+/**
+ *
+ */
public class NCSemanticEntityParser implements NCEntityParser {
private final NCSemanticEntityParserImpl impl;
- public NCSemanticEntityParser(NCStemmer stemmer, List<NCSemanticElement>
elems) {
+ /**
+ *
+ * @param stemmer
+ * @param elems
+ */
+ public NCSemanticEntityParser(NCSemanticTextStemmer stemmer,
List<NCSemanticElement> elems) {
impl = NCSemanticEntityParserImpl.apply(stemmer,
Collections.emptyMap(), elems);
}
- public NCSemanticEntityParser(NCStemmer stemmer, Map<String, String>
macros, List<NCSemanticElement> elems) {
+ /**
+ *
+ * @param stemmer
+ * @param macros
+ * @param elems
+ */
+ public NCSemanticEntityParser(NCSemanticTextStemmer stemmer, Map<String,
String> macros, List<NCSemanticElement> elems) {
impl = NCSemanticEntityParserImpl.apply(stemmer, macros, elems);
}
- public NCSemanticEntityParser(NCStemmer stemmer, File elemsFile,
Map<String, NCSemanticValueLoader> valsLoaders) {
- impl = NCSemanticEntityParserImpl.apply(stemmer, elemsFile,
valsLoaders);
+ /**
+ *
+ * @param stemmer
+ * @param elemsFile
+ */
+ public NCSemanticEntityParser(NCSemanticTextStemmer stemmer, File
elemsFile) {
+ impl = NCSemanticEntityParserImpl.apply(stemmer, elemsFile);
}
- public NCSemanticEntityParser(NCStemmer stemmer, String elemsSrc,
Map<String, NCSemanticValueLoader> valsLoaders) {
- impl = NCSemanticEntityParserImpl.apply(stemmer, elemsSrc,
valsLoaders);
+ /**
+ *
+ * @param stemmer
+ * @param elemsSrc
+ */
+ public NCSemanticEntityParser(NCSemanticTextStemmer stemmer, String
elemsSrc) {
+ impl = NCSemanticEntityParserImpl.apply(stemmer, elemsSrc);
}
@Override
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCStemmer.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticTextStemmer.java
similarity index 87%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCStemmer.java
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticTextStemmer.java
index bffc30d..5ef08d3 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCStemmer.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticTextStemmer.java
@@ -20,6 +20,11 @@ package org.apache.nlpcraft.nlp.entity.parser.semantic;
/**
*
*/
-public interface NCStemmer {
- String stem(String word);
+public interface NCSemanticTextStemmer {
+ /**
+ *
+ * @param text
+ * @return
+ */
+ String stem(String text);
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValueLoader.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValueLoader.java
deleted file mode 100644
index 83e83fd..0000000
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValueLoader.java
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nlpcraft.nlp.entity.parser.semantic;
-
-import java.util.Set;
-
-public interface NCSemanticValueLoader {
- Set<NCSemanticValue> load();
-}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
index 1148690..e33c3b1 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
@@ -31,11 +31,11 @@ import scala.collection.mutable
import scala.jdk.CollectionConverters.*
object NCSemanticEntityParserImpl:
- def apply(stemmer: NCStemmer, macros: Jmap[String, String], elems:
JList[NCSemanticElement]): NCSemanticEntityParserImpl =
+ def apply(stemmer: NCSemanticTextStemmer, macros: Jmap[String, String],
elems: JList[NCSemanticElement]): NCSemanticEntityParserImpl =
new NCSemanticEntityParserImpl(stemmer, macros.asScala.toMap,
elems.asScala.toSeq)
- def apply(stemmer: NCStemmer, elemsFile: File, valsLoaders: Jmap[String,
NCSemanticValueLoader]): NCSemanticEntityParserImpl =
+ def apply(stemmer: NCSemanticTextStemmer, elemsFile: File):
NCSemanticEntityParserImpl =
new NCSemanticEntityParserImpl(stemmer, null, null)
- def apply(stemmer: NCStemmer, elemsSrc: String, valsLoaders: Jmap[String,
NCSemanticValueLoader]): NCSemanticEntityParserImpl =
+ def apply(stemmer: NCSemanticTextStemmer, elemsSrc: String):
NCSemanticEntityParserImpl =
new NCSemanticEntityParserImpl(stemmer, null, null)
private final val SUSP_SYNS_CHARS = Seq("?", "*", "+")
@@ -48,7 +48,7 @@ object NCSemanticEntityParserImpl:
private case class Piece(main: Seq[NCToken], extra: Seq[Seq[NCToken]])
private def startsAndEnds(fix: String, s: String): Boolean =
s.startsWith(fix) && s.endsWith(fix)
- private def mkChunk(stemmer: NCStemmer, chunk: String): NCSynonymChunk =
+ private def mkChunk(stemmer: NCSemanticTextStemmer, chunk: String):
NCSynonymChunk =
def stripSuffix(fix: String, s: String): String = s.slice(fix.length,
s.length - fix.length)
// Regex synonym.
@@ -126,7 +126,7 @@ object NCSemanticEntityParserImpl:
import
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticEntityParserImpl.*
class NCSemanticEntityParserImpl(
- stemmer: NCStemmer,
+ stemmer: NCSemanticTextStemmer,
macros: Map[String, String],
elements: Seq[NCSemanticElement]
) extends NCEntityParser with LazyLogging:
@@ -163,6 +163,8 @@ class NCSemanticEntityParserImpl(
// TODO:
toSeq.map(chunks => NCSynonym(false, false, null,
chunks))
)
+
+ // TODO: values, elementID
)
sortedSyns =
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCEnStemmer.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/en/NCENSemanticTextStemmer.java
similarity index 76%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCEnStemmer.java
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/en/NCENSemanticTextStemmer.java
index f1e77ca..15f0274 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCEnStemmer.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/en/NCENSemanticTextStemmer.java
@@ -15,15 +15,16 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.nlp.entity.parser.semantic;
+package org.apache.nlpcraft.nlp.entity.parser.semantic.impl.en;
import opennlp.tools.stemmer.PorterStemmer;
+import org.apache.nlpcraft.nlp.entity.parser.semantic.NCSemanticTextStemmer;
-public class NCEnStemmer implements NCStemmer {
+public class NCENSemanticTextStemmer implements NCSemanticTextStemmer {
private final PorterStemmer s = new PorterStemmer();
@Override
- public String stem(String word) {
- return s.stem(word);
+ public String stem(String text) {
+ return s.stem(text);
}
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/en/NCEnOpenNlpImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/en/NCEnOpenNlpImpl.scala
index 9e818f4..b13d520 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/en/NCEnOpenNlpImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/en/NCEnOpenNlpImpl.scala
@@ -118,15 +118,13 @@ class NCEnOpenNlpImpl(
override def parse(req: NCRequest, cfg: NCModelConfig): JList[NCToken] =
// OpenNLP classes are not thread-safe.
this.synchronized {
- val sen = req.getText
+ val words = req.getWords.asScala
- val holders = cfg.getTokenizer.tokenize(cfg, sen).asScala
+ val wordsTxts = words.map(_.getText).toArray
+ val posTags = tagger.tag(wordsTxts)
+ var lemmas = lemmatizer.lemmatize(wordsTxts, posTags).toSeq
- val words = holders.map(_.getText).toArray
- val posTags = tagger.tag(words)
- var lemmas = lemmatizer.lemmatize(words, posTags).toSeq
-
- require(holders.length == posTags.length)
+ require(words.length == posTags.length)
// For some reasons lemmatizer (en-lemmatizer.dict) marks some
words with non-existent POS 'NNN'
// Valid POS list:
https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
@@ -137,7 +135,7 @@ class NCEnOpenNlpImpl(
if suspIdxs.nonEmpty then
val fixes: Map[Int, String] = lemmatizer.
- lemmatize(suspIdxs.map(i => words(i)).toArray,
suspIdxs.map(_ => "NNN").toArray).
+ lemmatize(suspIdxs.map(i => wordsTxts(i)).toArray,
suspIdxs.map(_ => "NNN").toArray).
zipWithIndex.
flatMap {
(lemma, i) => Option.when(lemma != "0")(suspIdxs(i) ->
lemma)
@@ -146,16 +144,16 @@ class NCEnOpenNlpImpl(
(lemma, idx) => fixes.getOrElse(idx, lemma)
}
- val res: Seq[NCToken] =
holders.zip(posTags).zip(lemmas).toIndexedSeq.zipWithIndex.map { case (((h,
pos), lemma), idx) =>
+ val res: Seq[NCToken] =
words.zip(posTags).zip(lemmas).toIndexedSeq.zipWithIndex.map { case (((w, pos),
lemma), idx) =>
new NCPropertyMapAdapter with NCToken:
- override def getText: String = h.getText
+ override def getText: String = w.getText
override def getLemma: String = lemma
- override def getStem: String =
stemmer.stem(h.getText.toLowerCase)
+ override def getStem: String =
stemmer.stem(w.getText.toLowerCase)
override def getPos: String = pos
override def isStopWord: Boolean = false
- override def getStartCharIndex: Int = h.getStartCharIndex
- override def getEndCharIndex: Int = h.getEndCharIndex
- override def getLength: Int = h.getLength
+ override def getStartCharIndex: Int = w.getStartCharIndex
+ override def getEndCharIndex: Int = w.getEndCharIndex
+ override def getLength: Int = w.getLength
override def getIndex: Int = idx
}
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala
index 041d900..e7d0381 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala
@@ -20,6 +20,7 @@ package org.apache.nlpcraft.nlp.entity.parser.semantic
import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.util.NCUtils
import org.apache.nlpcraft.nlp.entity.parser.opennlp.NCOpenNlpEntityParser
+import
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.en.NCENSemanticTextStemmer
import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
import org.apache.nlpcraft.nlp.util.*
import org.apache.nlpcraft.nlp.util.NCTestConfig.*
@@ -42,7 +43,7 @@ class NCSemanticEntityParserSpec:
parser =
NCTestUtils.makeAndStart(
new NCSemanticEntityParser(
- new NCEnStemmer,
+ new NCENSemanticTextStemmer,
Seq(
new NCSemanticElement {
override def getId: String = "testId"
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestRequest.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestRequest.scala
index ab07b91..7af1a71 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestRequest.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestRequest.scala
@@ -17,9 +17,10 @@
package org.apache.nlpcraft.nlp.util
-import org.apache.nlpcraft.NCRequest
-
+import org.apache.nlpcraft.*
+import org.apache.nlpcraft.nlp.util.NCTestConfig.*
import java.util
+import java.util.List
/**
* Request test implementation.
@@ -45,14 +46,16 @@ case class NCTestRequest(
override def getReceiveTimestamp: Long = ts
override def getUserAgent: String = userAgent
override def getRequestData: util.Map[String, AnyRef] = data
+ override def getWords: util.List[NCWord] =
EN_TOKENIZER.tokenize(EN_MDL_CFG, txt)
+
/**
* Java side helper.
*/
object NCTestRequest:
/**
- *
+ *
* @param txt
* @return
*/
- def apply(txt: String): NCTestRequest = new NCTestRequest(txt)
+ def apply(txt: String): NCTestRequest = new NCTestRequest(txt)
\ No newline at end of file