[incubator-nlpcraft] branch NLPCRAFT-472 updated: WIP.

sergeykamov Wed, 29 Dec 2021 13:01:16 -0800

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-472
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git



The following commit(s) were added to refs/heads/NLPCRAFT-472 by this push:
     new e1446ea  WIP.
     new 9741585  Merge remote-tracking branch 'origin/NLPCRAFT-472' into NLPCRAFT-472
e1446ea is described below

commit e1446ea8d88bdf4b09434062e753278866acba95
Author: Sergey Kamov <[email protected]>
AuthorDate: Thu Dec 30 00:01:00 2021 +0300

    WIP.
---
 .../main/scala/org/apache/nlpcraft/NCRequest.java  |   8 +-
 .../entity/parser/semantic/NCSemanticElement.java  | 269 +--------------------
 ...anticValue.java => NCSemanticElementValue.java} |   9 +-
 .../parser/semantic/NCSemanticEntityParser.java    |  36 ++-
 .../{NCStemmer.java => NCSemanticTextStemmer.java} |   9 +-
 .../parser/semantic/NCSemanticValueLoader.java     |  24 --
 .../semantic/impl/NCSemanticEntityParserImpl.scala |  12 +-
 .../en/NCENSemanticTextStemmer.java}               |   9 +-
 .../parser/opennlp/impl/en/NCEnOpenNlpImpl.scala   |  26 +-
 .../semantic/NCSemanticEntityParserSpec.scala      |   3 +-
 .../apache/nlpcraft/nlp/util/NCTestRequest.scala   |  11 +-
 11 files changed, 96 insertions(+), 320 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCRequest.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCRequest.java
index 2b181ae..72faad8 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCRequest.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCRequest.java
@@ -17,7 +17,7 @@
 
 package org.apache.nlpcraft;
 
-import java.util.Map;
+import java.util.*;
 
 /**
  * Information about the user request.
@@ -52,6 +52,12 @@ public interface NCRequest {
     String getText();
 
     /**
+     *
+     * @return
+     */
+    List<NCWord> getWords();
+
+    /**
      * Gets UTC/GMT timestamp in millis when user input was received.
      *
      * @return UTC/GMT timestamp in ms when user input was received.
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElement.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElement.java
index cd59b38..b3599af 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElement.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElement.java
@@ -19,301 +19,54 @@ package org.apache.nlpcraft.nlp.entity.parser.semantic;
 
 import java.util.Collections;
 import java.util.List;
-import java.util.Map;
-import java.util.Optional;
 
+/**
+ *
+ */
 public interface NCSemanticElement {
     /**
-     * Gets unique ID of this element.
-     * <p>
-     * This unique ID should be human-readable for simpler debugging and 
testing of the model.
-     * Although element ID could be any arbitrary string it is highly 
recommended having
-     * element ID as a lower case string starting with some model prefix, 
followed by colon and
-     * then the element's name. For example, some built-in NLPCraft IDs are: 
<code>nlpcraft:date</code>,
-     * <code>nlpcraft:city</code>.
-     * <p>
-     * Few important notes:
-     * <ul>
-     *      <li>Element IDs starting with <code>nlpcraft:</code> are reserved 
for built-in NLPCraft IDs.</li>
-     *      <li>
-     *          Element ID is an implicit synonym for that element.
-     *          Thus element ID can be used in the user input directly to 
clearly
-     *          disambiguate the element in the input sentence instead of 
relying on synonyms or other
-     *          ways of detection.
-     *      </li>
-     * </ul>
-     * <p>
-     * <b>JSON</b>
-     * <br>
-     * If using JSON/YAML model presentation this is set by <code>id</code> 
property:
-     * <pre class="brush: js, highlight: [3]">
-     *     "elements": [
-     *         {
-     *             "id": "phone:act",
-     *             "description": "Phone action.",
-     *             "synonyms": [
-     *                 "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
-     *             ]
-     *         }
-     *     ]
-     * </pre>
      *
-     * @see NCToken#getId()
-     * @return Unique ID of this element.
+     * @return
      */
     String getId();
 
     /**
-     * Gets the list of groups this element belongs to.
-     * <p>
-     * Model element can belong to one or more groups. By default, the element 
belongs to a single group whose group
-     * ID is equal to its {@link #getId() ID}. The proper grouping of the 
model elements is required for operation
-     * of Short-Term-Memory (STM) in {@link NCConversation conversation} (if 
and when conversation
-     * is used). Specifically, a token (i.e. found model element) that is part 
of the group set will override
-     * other tokens from the same set or its superset. In other words, tokens 
with a smaller group set
-     * (more specific token) will override the tokens from a larger group set 
(more generic tokens).
-     * <p>
-     * Note that built-in tokens (including from 3rd party token providers) 
belong to a single group whose group
-     * ID is equal to their IDs.
-     * <p>
-     * <b>JSON</b>
-     * <br>
-     * If using JSON/YAML model presentation this is set by 
<code>groups</code> property:
-     * <pre class="brush: js, highlight: [5]">
-     *     "elements": [
-     *         {
-     *             "id": "phone:act",
-     *             "description": "Phone action.",
-     *             "groups": ["group1", "group2"]
-     *             "synonyms": [
-     *                 "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
-     *             ]
-     *         }
-     *     ]
-     * </pre>
      *
-     * @return List of groups this element belongs to. By default, the model 
element belongs to one group
-     *      with ID equal to the element {@link #getId() ID}.
-     * @see NCConversation
-     * @see #getId()
+     * @return
      */
     default List<String> getGroups() {
         return Collections.singletonList(getId());
     }
 
     /**
-     * Shortcut method to test if this element is a member of given group. It 
is equivalent to:
-     * <pre class="brush: java">
-     *     return getGroups().contains(grp);
-     * </pre>
      *
-     * @param grp Token group to test.
-     * @return {@code True} if this element belongs to the given group, {@code 
false} otherwise.
+     * @return
      */
     default boolean isMemberOf(String grp) {
         return getGroups().contains(grp);
     }
 
     /**
-     * Gets optional user-defined element's metadata. When a {@link NCToken 
token} for this element
-     * is detected in the input this metadata is merged into {@link 
NCToken#getMetadata()} method returned metadata.
-     * <p>
-     * <b>JSON</b>
-     * <br>
-     * If using JSON/YAML model presentation this is set by 
<code>description</code> property:
-     * <pre class="brush: js, highlight: [8,9,10,11,12]">
-     *     "elements": [
-     *         {
-     *             "id": "phone:act",
-     *             "description": "Phone action.",
-     *             "synonyms": [
-     *                 "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
-     *             ],
-     *             "metadata": {
-     *                 "str": "val1",
-     *                 "num": 100,
-     *                 "bool": false
-     *             }
-     *         }
-     *     ]
-     * </pre>
      *
-     * @return Element's metadata or empty collection if none provided. 
Default implementation return empty collection.
-     */
-    default Map<String, Object> getMetadata() {
-        return Collections.emptyMap();
-    }
-
-    /**
-     * Gets optional element description.
-     * <p>
-     * <b>JSON</b>
-     * <br>
-     * If using JSON/YAML model presentation this is set by 
<code>description</code> property:
-     * <pre class="brush: js, highlight: [4]">
-     *     "elements": [
-     *         {
-     *             "id": "phone:act",
-     *             "description": "Phone action.",
-     *             "synonyms": [
-     *                 "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
-     *             ]
-     *         }
-     *     ]
-     * </pre>
-     *
-     * @return Optional element description. Default implementation returns 
{@code null}.
+     * @return
      */
     default String getDescription() {
         return null;
     }
 
     /**
-     * Gets optional map of {@link NCValue values} for this element.
-     * <p>
-     * Each element can generally be recognized either by one of its synonyms 
or values. Elements and their values
-     * are analogous to types and instances of that type in programming 
languages. Each value
-     * has a name and optional set of its own synonyms by which that value, 
and ultimately its element, can be
-     * recognized by. Note that value name itself acts as an implicit synonym 
even when no additional synonyms added
-     * for that value.
-     * <p>
-     * Consider this example. A model element {@code x:car} can have:
-     * <ul>
-     *      <li>
-     *          Set of general synonyms:
-     *          <code>{transportation|transport|_} 
{vehicle|car|sedan|auto|automobile|suv|crossover|coupe|truck}</code>
-     *      </li>
-     *      <li>Set of values:
-     *          <ul>
-     *              <li>{@code mercedes} with synonyms {@code (mercedes, 
mercedes-benz, mb, benz)}</li>
-     *              <li>{@code bmw} with synonyms {@code (bmw, bimmer)}</li>
-     *              <li>{@code chevrolet} with synonyms {@code (chevy, 
chevrolet)}</li>
-     *          </ul>
-     *      </li>
-     * </ul>
-     * With that setup {@code x:car} element will be recognized by any of the 
following input sub-string:
-     * <ul>
-     *      <li>{@code transport car}</li>
-     *      <li>{@code benz}</li>
-     *      <li>{@code automobile}</li>
-     *      <li>{@code transport vehicle}</li>
-     *      <li>{@code sedan}</li>
-     *      <li>{@code chevy}</li>
-     *      <li>{@code bimmer}</li>
-     *      <li>{@code x:car}</li>
-     * </ul>
-     * <p>
-     * <b>JSON</b>
-     * <br>
-     * If using JSON/YAML model presentation this is set by 
<code>values</code> property:
-     * <pre class="brush: js, highlight: [8,9,10,11,12,13]">
-     *     "elements": [
-     *         {
-     *             "id": "phone:act",
-     *             "description": "Phone action.",
-     *             "synonyms": [
-     *                 "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
-     *             ],
-     *             "values": [
-     *                  {
-     *                      "name": "name1",
-     *                      "synonyms": ["syn1", "syn2"]
-     *                  }
-     *             ]
-     *         }
-     *     ]
-     * </pre>
      *
-     * @return Map of value's name and its synonyms or {@code null} if not 
defined.
+     * @return
      */
-    default List<NCSemanticValue> getValues() {
+    default List<NCSemanticElementValue> getValues() {
         return Collections.emptyList();
     }
 
     /**
-     * Gets optional ID of the immediate parent element. Parent ID allows 
model elements to form into hierarchy.
-     * <p>
-     * <b>JSON</b>
-     * <br>
-     * If using JSON/YAML model presentation this is set by 
<code>parentId</code> property:
-     * <pre class="brush: js, highlight: [5]">
-     *     "elements": [
-     *         {
-     *             "id": "phone:act",
-     *             "description": "Phone action.",
-     *             "parentId": "parent",
-     *             "synonyms": [
-     *                 "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
-     *             ]
-     *         }
-     *     ]
-     * </pre>
-     *
-     * @return Optional parent element ID, or {@code null} if not specified. 
Default implementation returns
-     *      {@code null}.
-     */
-    default String getParentId() {
-        return null;
-    }
-
-    /**
-     * Gets the list of synonyms by which this model element will be 
recognized by. Read more about
-     * many forms of synonyms in <a target=_ 
href="https://nlpcraft.apache.org/data-model.html";>Data Model</a> section
-     * and review <a target=_ 
href="https://github.com/apache/incubator-nlpcraft/tree/master/nlpcraft-examples";>examples</a>.
-     * <p>
-     * <b>JSON</b>
-     * <br>
-     * If using JSON/YAML model presentation this is set by 
<code>synonyms</code> property:
-     * <pre class="brush: js, highlight: [5,6,7]">
-     *     "elements": [
-     *         {
-     *             "id": "phone:act",
-     *             "description": "Phone action.",
-     *             "synonyms": [
-     *                 "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
-     *             ]
-     *         }
-     *     ]
-     * </pre>
      *
-     * @return List of synonyms for this element. List is generally optional 
since element's ID acts
-     *      as an implicit synonym. Default implementation returns an empty 
list.
+     * @return
      */
     default List<String> getSynonyms() {
         return Collections.emptyList();
     }
-
-    /**
-     * Gets optional dynamic value loader. This loader will be used 
additionally to any
-     * values defined in {@link #getValues()} method. Default implementation 
returns {@code null}.
-     * <p>
-     * <b>JSON</b>
-     * <br>
-     * If using JSON/YAML model presentation this is set by 
<code>valueLoader</code> property with value
-     * of a fully qualified class name implementing {@link NCValueLoader} 
interface. Note that
-     * only one instance of the value loader will be created per model and 
given class name:
-     * <pre class="brush: js, highlight: [14]">
-     *     "elements": [
-     *         {
-     *             "id": "phone:act",
-     *             "description": "Phone action.",
-     *             "synonyms": [
-     *                 "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
-     *             ],
-     *             "values": [
-     *                  {
-     *                      "name": "name1",
-     *                      "synonyms": ["syn1", "syn2"]
-     *                  }
-     *             ],
-     *             "valueLoader": "my.package.ValueLoader"
-     *         }
-     *     ]
-     * </pre>
-     *
-     * @return Optional instance of dynamic value loader.
-     */
-    default Optional<NCSemanticValueLoader> getValueLoader() {
-        return Optional.empty();
-    }
-}
+}
\ No newline at end of file
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValue.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElementValue.java
similarity index 90%
rename from 
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValue.java
rename to 
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElementValue.java
index 5f45a79..a7106c8 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValue.java
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElementValue.java
@@ -19,6 +19,13 @@ package org.apache.nlpcraft.nlp.entity.parser.semantic;
 
 import java.util.List;
 
-public interface NCSemanticValue {
+/**
+ *
+ */
+public interface NCSemanticElementValue {
+    /**
+     *
+     * @return
+     */
     List<String> getSynonyms();
 }
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
index 09e83fd..717b1bd 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
@@ -27,23 +27,47 @@ import 
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticEntityParse
 import java.io.File;
 import java.util.*;
 
+/**
+ *
+ */
 public class NCSemanticEntityParser implements NCEntityParser {
     private final NCSemanticEntityParserImpl impl;
 
-    public NCSemanticEntityParser(NCStemmer stemmer, List<NCSemanticElement> 
elems) {
+    /**
+     *
+     * @param stemmer
+     * @param elems
+     */
+    public NCSemanticEntityParser(NCSemanticTextStemmer stemmer, 
List<NCSemanticElement> elems) {
         impl = NCSemanticEntityParserImpl.apply(stemmer, 
Collections.emptyMap(), elems);
     }
 
-    public NCSemanticEntityParser(NCStemmer stemmer, Map<String, String> 
macros, List<NCSemanticElement> elems) {
+    /**
+     *
+     * @param stemmer
+     * @param macros
+     * @param elems
+     */
+    public NCSemanticEntityParser(NCSemanticTextStemmer stemmer, Map<String, 
String> macros, List<NCSemanticElement> elems) {
         impl = NCSemanticEntityParserImpl.apply(stemmer, macros, elems);
     }
 
-    public NCSemanticEntityParser(NCStemmer stemmer, File elemsFile, 
Map<String, NCSemanticValueLoader> valsLoaders) {
-        impl = NCSemanticEntityParserImpl.apply(stemmer, elemsFile, 
valsLoaders);
+    /**
+     *
+     * @param stemmer
+     * @param elemsFile
+     */
+    public NCSemanticEntityParser(NCSemanticTextStemmer stemmer, File 
elemsFile) {
+        impl = NCSemanticEntityParserImpl.apply(stemmer, elemsFile);
     }
 
-    public NCSemanticEntityParser(NCStemmer stemmer, String elemsSrc, 
Map<String, NCSemanticValueLoader> valsLoaders) {
-        impl = NCSemanticEntityParserImpl.apply(stemmer, elemsSrc, 
valsLoaders);
+    /**
+     *
+     * @param stemmer
+     * @param elemsSrc
+     */
+    public NCSemanticEntityParser(NCSemanticTextStemmer stemmer, String 
elemsSrc) {
+        impl = NCSemanticEntityParserImpl.apply(stemmer, elemsSrc);
     }
 
     @Override
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCStemmer.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticTextStemmer.java
similarity index 87%
rename from 
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCStemmer.java
rename to 
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticTextStemmer.java
index bffc30d..5ef08d3 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCStemmer.java
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticTextStemmer.java
@@ -20,6 +20,11 @@ package org.apache.nlpcraft.nlp.entity.parser.semantic;
 /**
  *
  */
-public interface NCStemmer {
-    String stem(String word);
+public interface NCSemanticTextStemmer {
+    /**
+     *
+     * @param text
+     * @return
+     */
+    String stem(String text);
 }
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValueLoader.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValueLoader.java
deleted file mode 100644
index 83e83fd..0000000
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValueLoader.java
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *      https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nlpcraft.nlp.entity.parser.semantic;
-
-import java.util.Set;
-
-public interface NCSemanticValueLoader {
-    Set<NCSemanticValue> load();
-}
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
index 1148690..e33c3b1 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
@@ -31,11 +31,11 @@ import scala.collection.mutable
 import scala.jdk.CollectionConverters.*
 
 object NCSemanticEntityParserImpl:
-    def apply(stemmer: NCStemmer, macros: Jmap[String, String], elems: 
JList[NCSemanticElement]): NCSemanticEntityParserImpl =
+    def apply(stemmer: NCSemanticTextStemmer, macros: Jmap[String, String], 
elems: JList[NCSemanticElement]): NCSemanticEntityParserImpl =
         new NCSemanticEntityParserImpl(stemmer, macros.asScala.toMap, 
elems.asScala.toSeq)
-    def apply(stemmer: NCStemmer, elemsFile: File, valsLoaders: Jmap[String, 
NCSemanticValueLoader]): NCSemanticEntityParserImpl =
+    def apply(stemmer: NCSemanticTextStemmer, elemsFile: File): 
NCSemanticEntityParserImpl =
         new NCSemanticEntityParserImpl(stemmer, null, null)
-    def apply(stemmer: NCStemmer, elemsSrc: String, valsLoaders: Jmap[String, 
NCSemanticValueLoader]): NCSemanticEntityParserImpl =
+    def apply(stemmer: NCSemanticTextStemmer, elemsSrc: String): 
NCSemanticEntityParserImpl =
         new NCSemanticEntityParserImpl(stemmer, null, null)
 
     private final val SUSP_SYNS_CHARS = Seq("?", "*", "+")
@@ -48,7 +48,7 @@ object NCSemanticEntityParserImpl:
     private case class Piece(main: Seq[NCToken], extra: Seq[Seq[NCToken]])
 
     private def startsAndEnds(fix: String, s: String): Boolean = 
s.startsWith(fix) && s.endsWith(fix)
-    private def mkChunk(stemmer: NCStemmer, chunk: String): NCSynonymChunk =
+    private def mkChunk(stemmer: NCSemanticTextStemmer, chunk: String): 
NCSynonymChunk =
         def stripSuffix(fix: String, s: String): String = s.slice(fix.length, 
s.length - fix.length)
 
         // Regex synonym.
@@ -126,7 +126,7 @@ object NCSemanticEntityParserImpl:
 import 
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticEntityParserImpl.*
 
 class NCSemanticEntityParserImpl(
-    stemmer: NCStemmer,
+    stemmer: NCSemanticTextStemmer,
     macros: Map[String, String],
     elements: Seq[NCSemanticElement]
 ) extends NCEntityParser with LazyLogging:
@@ -163,6 +163,8 @@ class NCSemanticEntityParserImpl(
                         // TODO:
                         toSeq.map(chunks => NCSynonym(false, false, null, 
chunks))
                 )
+
+                // TODO: values, elementID
         )
 
         sortedSyns =
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCEnStemmer.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/en/NCENSemanticTextStemmer.java
similarity index 76%
rename from 
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCEnStemmer.java
rename to 
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/en/NCENSemanticTextStemmer.java
index f1e77ca..15f0274 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCEnStemmer.java
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/en/NCENSemanticTextStemmer.java
@@ -15,15 +15,16 @@
  * limitations under the License.
  */
 
-package org.apache.nlpcraft.nlp.entity.parser.semantic;
+package org.apache.nlpcraft.nlp.entity.parser.semantic.impl.en;
 
 import opennlp.tools.stemmer.PorterStemmer;
+import org.apache.nlpcraft.nlp.entity.parser.semantic.NCSemanticTextStemmer;
 
-public class NCEnStemmer implements NCStemmer {
+public class NCENSemanticTextStemmer implements NCSemanticTextStemmer {
     private final PorterStemmer s = new PorterStemmer();
     
     @Override
-    public String stem(String word) {
-        return s.stem(word);
+    public String stem(String text) {
+        return s.stem(text);
     }
 }
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/en/NCEnOpenNlpImpl.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/en/NCEnOpenNlpImpl.scala
index 9e818f4..b13d520 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/en/NCEnOpenNlpImpl.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/en/NCEnOpenNlpImpl.scala
@@ -118,15 +118,13 @@ class NCEnOpenNlpImpl(
     override def parse(req: NCRequest, cfg: NCModelConfig): JList[NCToken] =
         // OpenNLP classes are not thread-safe.
         this.synchronized {
-            val sen = req.getText
+            val words = req.getWords.asScala
 
-            val holders = cfg.getTokenizer.tokenize(cfg, sen).asScala
+            val wordsTxts = words.map(_.getText).toArray
+            val posTags = tagger.tag(wordsTxts)
+            var lemmas = lemmatizer.lemmatize(wordsTxts, posTags).toSeq
 
-            val words = holders.map(_.getText).toArray
-            val posTags = tagger.tag(words)
-            var lemmas = lemmatizer.lemmatize(words, posTags).toSeq
-
-            require(holders.length == posTags.length)
+            require(words.length == posTags.length)
 
             // For some reasons lemmatizer (en-lemmatizer.dict) marks some 
words with non-existent POS 'NNN'
             // Valid POS list: 
https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
@@ -137,7 +135,7 @@ class NCEnOpenNlpImpl(
 
             if suspIdxs.nonEmpty then
                 val fixes: Map[Int, String] = lemmatizer.
-                    lemmatize(suspIdxs.map(i => words(i)).toArray, 
suspIdxs.map(_ => "NNN").toArray).
+                    lemmatize(suspIdxs.map(i => wordsTxts(i)).toArray, 
suspIdxs.map(_ => "NNN").toArray).
                     zipWithIndex.
                     flatMap {
                         (lemma, i) => Option.when(lemma != "0")(suspIdxs(i) -> 
lemma)
@@ -146,16 +144,16 @@ class NCEnOpenNlpImpl(
                     (lemma, idx) => fixes.getOrElse(idx, lemma)
                 }
 
-            val res: Seq[NCToken] = 
holders.zip(posTags).zip(lemmas).toIndexedSeq.zipWithIndex.map { case (((h, 
pos), lemma), idx) =>
+            val res: Seq[NCToken] = 
words.zip(posTags).zip(lemmas).toIndexedSeq.zipWithIndex.map { case (((w, pos), 
lemma), idx) =>
                 new NCPropertyMapAdapter with NCToken:
-                    override def getText: String = h.getText
+                    override def getText: String = w.getText
                     override def getLemma: String = lemma
-                    override def getStem: String = 
stemmer.stem(h.getText.toLowerCase)
+                    override def getStem: String = 
stemmer.stem(w.getText.toLowerCase)
                     override def getPos: String = pos
                     override def isStopWord: Boolean = false
-                    override def getStartCharIndex: Int = h.getStartCharIndex
-                    override def getEndCharIndex: Int = h.getEndCharIndex
-                    override def getLength: Int = h.getLength
+                    override def getStartCharIndex: Int = w.getStartCharIndex
+                    override def getEndCharIndex: Int = w.getEndCharIndex
+                    override def getLength: Int = w.getLength
                     override def getIndex: Int = idx
             }
 
diff --git 
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala
 
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala
index 041d900..e7d0381 100644
--- 
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala
+++ 
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala
@@ -20,6 +20,7 @@ package org.apache.nlpcraft.nlp.entity.parser.semantic
 import org.apache.nlpcraft.*
 import org.apache.nlpcraft.internal.util.NCUtils
 import org.apache.nlpcraft.nlp.entity.parser.opennlp.NCOpenNlpEntityParser
+import 
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.en.NCENSemanticTextStemmer
 import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
 import org.apache.nlpcraft.nlp.util.*
 import org.apache.nlpcraft.nlp.util.NCTestConfig.*
@@ -42,7 +43,7 @@ class NCSemanticEntityParserSpec:
         parser =
             NCTestUtils.makeAndStart(
                 new NCSemanticEntityParser(
-                    new NCEnStemmer,
+                    new NCENSemanticTextStemmer,
                     Seq(
                         new NCSemanticElement {
                             override def getId: String = "testId"
diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestRequest.scala b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestRequest.scala
index ab07b91..7af1a71 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestRequest.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestRequest.scala
@@ -17,9 +17,10 @@
 
 package org.apache.nlpcraft.nlp.util
 
-import org.apache.nlpcraft.NCRequest
-
+import org.apache.nlpcraft.*
+import org.apache.nlpcraft.nlp.util.NCTestConfig.*
 import java.util
+import java.util.List
 
 /**
   * Request test implementation.
@@ -45,14 +46,16 @@ case class NCTestRequest(
     override def getReceiveTimestamp: Long = ts
     override def getUserAgent: String = userAgent
     override def getRequestData: util.Map[String, AnyRef] = data
+    override def getWords: util.List[NCWord] = 
EN_TOKENIZER.tokenize(EN_MDL_CFG, txt)
+
 
 /**
   * Java side helper.
   */
 object NCTestRequest:
     /**
-      * 
+      *
       * @param txt
       * @return
       */
-    def apply(txt: String): NCTestRequest = new NCTestRequest(txt)
+    def apply(txt: String): NCTestRequest = new NCTestRequest(txt)
\ No newline at end of file

[incubator-nlpcraft] branch NLPCRAFT-472 updated: WIP.

Reply via email to