[incubator-nlpcraft] branch NLPCRAFT-472 updated: WIP.

sergeykamov Wed, 29 Dec 2021 08:16:51 -0800

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-472
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git



The following commit(s) were added to refs/heads/NLPCRAFT-472 by this push:
     new 956f544  WIP.
956f544 is described below

commit 956f544f11278df76bb8ad028df4b8dabb6471ec
Author: Sergey Kamov <[email protected]>
AuthorDate: Wed Dec 29 19:13:58 2021 +0300

    WIP.
---
 .../opennlp/impl/NCOpenNlpEntityParserImpl.scala   |   3 +-
 ...CSemanticEntityParser.java => NCEnStemmer.java} |  13 +-
 .../entity/parser/semantic/NCSemanticElement.java  | 300 ++++++++++++++++++++-
 .../parser/semantic/NCSemanticEntityParser.java    |  41 ++-
 .../entity/parser/semantic/NCSemanticValue.java    |   5 +-
 .../parser/semantic/NCSemanticValueLoader.java     |   5 +-
 .../{NCSemanticValueLoader.java => NCStemmer.java} |   3 +-
 .../semantic/impl/NCSemanticEntityParserImpl.scala | 184 +++++++++++++
 .../NCSynonym.scala}                               |  20 +-
 .../NCSynonymChunk.scala}                          |  29 +-
 .../NCSynonymChunkKind.scala}                      |  11 +-
 .../semantic/NCSemanticEntityParserSpec.scala      |  67 +++++
 12 files changed, 650 insertions(+), 31 deletions(-)

diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala
index dd3e24e..7c7be26 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala
@@ -72,12 +72,11 @@ class NCOpenNlpEntityParserImpl(is: InputStream, res: 
String) extends NCEntityPa
             lazy val i2 = calcIndex(_.end)
 
             Option.when(i1 != -1 && i2 != -1)(
-                new NCPropertyMapAdapter with NCEntity {
+                new NCPropertyMapAdapter with NCEntity:
                     put(s"opennlp:${h.name}:probability", h.probability)
 
                     override def getTokens: JList[NCToken] = toksSeq.flatMap(t 
=> Option.when(t.getIndex >= i1 && t.getIndex <= i2)(t)).asJava
                     override def getRequestId: String = req.getRequestId
                     override def getId: String = s"opennlp:${h.name}"
-                }
             )
         ).toSeq.asJava
\ No newline at end of file
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCEnStemmer.java
similarity index 78%
copy from 
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
copy to 
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCEnStemmer.java
index 4b9816d..f1e77ca 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCEnStemmer.java
@@ -17,12 +17,13 @@
 
 package org.apache.nlpcraft.nlp.entity.parser.semantic;
 
-import java.io.File;
+import opennlp.tools.stemmer.PorterStemmer;
 
-public class NCSemanticEntityParser {
-    public NCSemanticEntityParser(File f) {
-    }
-
-    public NCSemanticEntityParser() {
+public class NCEnStemmer implements NCStemmer {
+    private final PorterStemmer s = new PorterStemmer();
+    
+    @Override
+    public String stem(String word) {
+        return s.stem(word);
     }
 }
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElement.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElement.java
index de8b60c..cd59b38 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElement.java
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElement.java
@@ -17,5 +17,303 @@
 
 package org.apache.nlpcraft.nlp.entity.parser.semantic;
 
-public class NCSemanticElement {
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+
+public interface NCSemanticElement {
+    /**
+     * Gets unique ID of this element.
+     * <p>
+     * This unique ID should be human-readable for simpler debugging and 
testing of the model.
+     * Although element ID could be any arbitrary string it is highly 
recommended having
+     * element ID as a lower case string starting with some model prefix, 
followed by colon and
+     * then the element's name. For example, some built-in NLPCraft IDs are: 
<code>nlpcraft:date</code>,
+     * <code>nlpcraft:city</code>.
+     * <p>
+     * Few important notes:
+     * <ul>
+     *      <li>Element IDs starting with <code>nlpcraft:</code> are reserved 
for built-in NLPCraft IDs.</li>
+     *      <li>
+     *          Element ID is an implicit synonym for that element.
+     *          Thus element ID can be used in the user input directly to 
clearly
+     *          disambiguate the element in the input sentence instead of 
relying on synonyms or other
+     *          ways of detection.
+     *      </li>
+     * </ul>
+     * <p>
+     * <b>JSON</b>
+     * <br>
+     * If using JSON/YAML model presentation this is set by <code>id</code> 
property:
+     * <pre class="brush: js, highlight: [3]">
+     *     "elements": [
+     *         {
+     *             "id": "phone:act",
+     *             "description": "Phone action.",
+     *             "synonyms": [
+     *                 "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
+     *             ]
+     *         }
+     *     ]
+     * </pre>
+     *
+     * @see NCToken#getId()
+     * @return Unique ID of this element.
+     */
+    String getId();
+
+    /**
+     * Gets the list of groups this element belongs to.
+     * <p>
+     * Model element can belong to one or more groups. By default, the element 
belongs to a single group whose group
+     * ID is equal to its {@link #getId() ID}. The proper grouping of the 
model elements is required for operation
+     * of Short-Term-Memory (STM) in {@link NCConversation conversation} (if 
and when conversation
+     * is used). Specifically, a token (i.e. found model element) that is part 
of the group set will override
+     * other tokens from the same set or its superset. In other words, tokens 
with a smaller group set
+     * (more specific token) will override the tokens from a larger group set 
(more generic tokens).
+     * <p>
+     * Note that built-in tokens (including from 3rd party token providers) 
belong to a single group whose group
+     * ID is equal to their IDs.
+     * <p>
+     * <b>JSON</b>
+     * <br>
+     * If using JSON/YAML model presentation this is set by 
<code>groups</code> property:
+     * <pre class="brush: js, highlight: [5]">
+     *     "elements": [
+     *         {
+     *             "id": "phone:act",
+     *             "description": "Phone action.",
+     *             "groups": ["group1", "group2"],
+     *             "synonyms": [
+     *                 "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
+     *             ]
+     *         }
+     *     ]
+     * </pre>
+     *
+     * @return List of groups this element belongs to. By default, the model 
element belongs to one group
+     *      with ID equal to the element {@link #getId() ID}.
+     * @see NCConversation
+     * @see #getId()
+     */
+    default List<String> getGroups() {
+        return Collections.singletonList(getId());
+    }
+
+    /**
+     * Shortcut method to test if this element is a member of given group. It 
is equivalent to:
+     * <pre class="brush: java">
+     *     return getGroups().contains(grp);
+     * </pre>
+     *
+     * @param grp Token group to test.
+     * @return {@code True} if this element belongs to the given group, {@code 
false} otherwise.
+     */
+    default boolean isMemberOf(String grp) {
+        return getGroups().contains(grp);
+    }
+
+    /**
+     * Gets optional user-defined element's metadata. When a {@link NCToken 
token} for this element
+     * is detected in the input this metadata is merged into {@link 
NCToken#getMetadata()} method returned metadata.
+     * <p>
+     * <b>JSON</b>
+     * <br>
+     * If using JSON/YAML model presentation this is set by 
<code>metadata</code> property:
+     * <pre class="brush: js, highlight: [8,9,10,11,12]">
+     *     "elements": [
+     *         {
+     *             "id": "phone:act",
+     *             "description": "Phone action.",
+     *             "synonyms": [
+     *                 "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
+     *             ],
+     *             "metadata": {
+     *                 "str": "val1",
+     *                 "num": 100,
+     *                 "bool": false
+     *             }
+     *         }
+     *     ]
+     * </pre>
+     *
+     * @return Element's metadata or empty collection if none provided. 
Default implementation returns an empty collection.
+     */
+    default Map<String, Object> getMetadata() {
+        return Collections.emptyMap();
+    }
+
+    /**
+     * Gets optional element description.
+     * <p>
+     * <b>JSON</b>
+     * <br>
+     * If using JSON/YAML model presentation this is set by 
<code>description</code> property:
+     * <pre class="brush: js, highlight: [4]">
+     *     "elements": [
+     *         {
+     *             "id": "phone:act",
+     *             "description": "Phone action.",
+     *             "synonyms": [
+     *                 "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
+     *             ]
+     *         }
+     *     ]
+     * </pre>
+     *
+     * @return Optional element description. Default implementation returns 
{@code null}.
+     */
+    default String getDescription() {
+        return null;
+    }
+
+    /**
+     * Gets optional list of {@link NCSemanticValue values} for this element.
+     * <p>
+     * Each element can generally be recognized either by one of its synonyms 
or values. Elements and their values
+     * are analogous to types and instances of that type in programming 
languages. Each value
+     * has a name and optional set of its own synonyms by which that value, 
and ultimately its element, can be
+     * recognized by. Note that value name itself acts as an implicit synonym 
even when no additional synonyms added
+     * for that value.
+     * <p>
+     * Consider this example. A model element {@code x:car} can have:
+     * <ul>
+     *      <li>
+     *          Set of general synonyms:
+     *          <code>{transportation|transport|_} 
{vehicle|car|sedan|auto|automobile|suv|crossover|coupe|truck}</code>
+     *      </li>
+     *      <li>Set of values:
+     *          <ul>
+     *              <li>{@code mercedes} with synonyms {@code (mercedes, 
mercedes-benz, mb, benz)}</li>
+     *              <li>{@code bmw} with synonyms {@code (bmw, bimmer)}</li>
+     *              <li>{@code chevrolet} with synonyms {@code (chevy, 
chevrolet)}</li>
+     *          </ul>
+     *      </li>
+     * </ul>
+     * With that setup {@code x:car} element will be recognized by any of the 
following input sub-string:
+     * <ul>
+     *      <li>{@code transport car}</li>
+     *      <li>{@code benz}</li>
+     *      <li>{@code automobile}</li>
+     *      <li>{@code transport vehicle}</li>
+     *      <li>{@code sedan}</li>
+     *      <li>{@code chevy}</li>
+     *      <li>{@code bimmer}</li>
+     *      <li>{@code x:car}</li>
+     * </ul>
+     * <p>
+     * <b>JSON</b>
+     * <br>
+     * If using JSON/YAML model presentation this is set by 
<code>values</code> property:
+     * <pre class="brush: js, highlight: [8,9,10,11,12,13]">
+     *     "elements": [
+     *         {
+     *             "id": "phone:act",
+     *             "description": "Phone action.",
+     *             "synonyms": [
+     *                 "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
+     *             ],
+     *             "values": [
+     *                  {
+     *                      "name": "name1",
+     *                      "synonyms": ["syn1", "syn2"]
+     *                  }
+     *             ]
+     *         }
+     *     ]
+     * </pre>
+     *
+     * @return List of values for this element. Default implementation returns 
an empty list.
+     */
+    default List<NCSemanticValue> getValues() {
+        return Collections.emptyList();
+    }
+
+    /**
+     * Gets optional ID of the immediate parent element. Parent ID allows 
model elements to form into hierarchy.
+     * <p>
+     * <b>JSON</b>
+     * <br>
+     * If using JSON/YAML model presentation this is set by 
<code>parentId</code> property:
+     * <pre class="brush: js, highlight: [5]">
+     *     "elements": [
+     *         {
+     *             "id": "phone:act",
+     *             "description": "Phone action.",
+     *             "parentId": "parent",
+     *             "synonyms": [
+     *                 "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
+     *             ]
+     *         }
+     *     ]
+     * </pre>
+     *
+     * @return Optional parent element ID, or {@code null} if not specified. 
Default implementation returns
+     *      {@code null}.
+     */
+    default String getParentId() {
+        return null;
+    }
+
+    /**
+     * Gets the list of synonyms by which this model element will be 
recognized by. Read more about
+     * many forms of synonyms in <a target=_ 
href="https://nlpcraft.apache.org/data-model.html">Data Model</a> section
+     * and review <a target=_ 
href="https://github.com/apache/incubator-nlpcraft/tree/master/nlpcraft-examples">examples</a>.
+     * <p>
+     * <b>JSON</b>
+     * <br>
+     * If using JSON/YAML model presentation this is set by 
<code>synonyms</code> property:
+     * <pre class="brush: js, highlight: [5,6,7]">
+     *     "elements": [
+     *         {
+     *             "id": "phone:act",
+     *             "description": "Phone action.",
+     *             "synonyms": [
+     *                 "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
+     *             ]
+     *         }
+     *     ]
+     * </pre>
+     *
+     * @return List of synonyms for this element. List is generally optional 
since element's ID acts
+     *      as an implicit synonym. Default implementation returns an empty 
list.
+     */
+    default List<String> getSynonyms() {
+        return Collections.emptyList();
+    }
+
+    /**
+     * Gets optional dynamic value loader. This loader will be used 
additionally to any
+     * values defined in {@link #getValues()} method. Default implementation 
returns an empty {@code Optional}.
+     * <p>
+     * <b>JSON</b>
+     * <br>
+     * If using JSON/YAML model presentation this is set by 
<code>valueLoader</code> property with value
+     * of a fully qualified class name implementing {@link NCSemanticValueLoader} 
interface. Note that
+     * only one instance of the value loader will be created per model and 
given class name:
+     * <pre class="brush: js, highlight: [14]">
+     *     "elements": [
+     *         {
+     *             "id": "phone:act",
+     *             "description": "Phone action.",
+     *             "synonyms": [
+     *                 "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
+     *             ],
+     *             "values": [
+     *                  {
+     *                      "name": "name1",
+     *                      "synonyms": ["syn1", "syn2"]
+     *                  }
+     *             ],
+     *             "valueLoader": "my.package.ValueLoader"
+     *         }
+     *     ]
+     * </pre>
+     *
+     * @return Optional instance of dynamic value loader.
+     */
+    default Optional<NCSemanticValueLoader> getValueLoader() {
+        return Optional.empty();
+    }
 }
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
index 4b9816d..09e83fd 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
@@ -17,12 +17,47 @@
 
 package org.apache.nlpcraft.nlp.entity.parser.semantic;
 
+import org.apache.nlpcraft.NCEntity;
+import org.apache.nlpcraft.NCEntityParser;
+import org.apache.nlpcraft.NCModelConfig;
+import org.apache.nlpcraft.NCRequest;
+import org.apache.nlpcraft.NCToken;
+import 
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticEntityParserImpl;
+
 import java.io.File;
+import java.util.*;
+
+public class NCSemanticEntityParser implements NCEntityParser {
+    private final NCSemanticEntityParserImpl impl;
+
+    public NCSemanticEntityParser(NCStemmer stemmer, List<NCSemanticElement> 
elems) {
+        impl = NCSemanticEntityParserImpl.apply(stemmer, 
Collections.emptyMap(), elems);
+    }
+
+    public NCSemanticEntityParser(NCStemmer stemmer, Map<String, String> 
macros, List<NCSemanticElement> elems) {
+        impl = NCSemanticEntityParserImpl.apply(stemmer, macros, elems);
+    }
+
+    public NCSemanticEntityParser(NCStemmer stemmer, File elemsFile, 
Map<String, NCSemanticValueLoader> valsLoaders) {
+        impl = NCSemanticEntityParserImpl.apply(stemmer, elemsFile, 
valsLoaders);
+    }
+
+    public NCSemanticEntityParser(NCStemmer stemmer, String elemsSrc, 
Map<String, NCSemanticValueLoader> valsLoaders) {
+        impl = NCSemanticEntityParserImpl.apply(stemmer, elemsSrc, 
valsLoaders);
+    }
+
+    @Override
+    public List<NCEntity> parse(NCRequest req, NCModelConfig cfg, 
List<NCToken> toks) {
+        return impl.parse(req, cfg, toks);
+    }
 
-public class NCSemanticEntityParser {
-    public NCSemanticEntityParser(File f) {
+    @Override
+    public void start(NCModelConfig cfg) {
+        impl.start(cfg);
     }
 
-    public NCSemanticEntityParser() {
+    @Override
+    public void stop() {
+        impl.stop();
     }
 }
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValue.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValue.java
index fe8178a..5f45a79 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValue.java
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValue.java
@@ -17,5 +17,8 @@
 
 package org.apache.nlpcraft.nlp.entity.parser.semantic;
 
-public class NCSemanticValue {
+import java.util.List;
+
+public interface NCSemanticValue {
+    List<String> getSynonyms();
 }
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValueLoader.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValueLoader.java
index 0e2d12b..83e83fd 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValueLoader.java
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValueLoader.java
@@ -17,5 +17,8 @@
 
 package org.apache.nlpcraft.nlp.entity.parser.semantic;
 
-public class NCSemanticValueLoader {
+import java.util.Set;
+
+public interface NCSemanticValueLoader {
+    Set<NCSemanticValue> load();
 }
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValueLoader.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCStemmer.java
similarity index 93%
copy from 
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValueLoader.java
copy to 
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCStemmer.java
index 0e2d12b..11d5e8c 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValueLoader.java
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCStemmer.java
@@ -17,5 +17,6 @@
 
 package org.apache.nlpcraft.nlp.entity.parser.semantic;
 
-public class NCSemanticValueLoader {
+public interface NCStemmer {
+    String stem(String word);
 }
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
new file mode 100644
index 0000000..841e6f3
--- /dev/null
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nlpcraft.nlp.entity.parser.semantic.impl
+
+
+import java.util.List as JList
+import java.util.Map as Jmap
+import java.io.File
+import org.apache.nlpcraft.nlp.entity.parser.semantic.{NCSemanticElement, *}
+import org.apache.nlpcraft.*
+import org.apache.nlpcraft.internal.makro.NCMacroParser
+import org.apache.nlpcraft.internal.util.NCUtils
+
+import java.util.regex.{Pattern, PatternSyntaxException}
+import scala.collection.mutable
+import scala.jdk.CollectionConverters.*
+import NCSynonymChunkKind.*
+import com.typesafe.scalalogging.LazyLogging
+
+object NCSemanticEntityParserImpl:
+    def apply(stemmer: NCStemmer, macros: Jmap[String, String], elems: 
JList[NCSemanticElement]): NCSemanticEntityParserImpl =
+        new NCSemanticEntityParserImpl(stemmer, macros.asScala.toMap, 
elems.asScala.toSeq)
+    def apply(stemmer: NCStemmer, elemsFile: File, valsLoaders: Jmap[String, 
NCSemanticValueLoader]): NCSemanticEntityParserImpl =
+        new NCSemanticEntityParserImpl(stemmer, null, null)
+    def apply(stemmer: NCStemmer, elemsSrc: String, valsLoaders: Jmap[String, 
NCSemanticValueLoader]): NCSemanticEntityParserImpl =
+        new NCSemanticEntityParserImpl(stemmer, null, null)
+
+    private final val SUSP_SYNS_CHARS = Seq("?", "*", "+")
+    private final val REGEX_FIX = "//"
+
+    private case class Piece(main: Seq[NCToken], extra: Seq[Seq[NCToken]])
+
+    private def combos[T](toks: Seq[T]): Seq[Seq[T]] =
+        (for (n <- toks.size until 0 by -1) yield 
toks.sliding(n)).flatten.map(p => p)
+
+    private def startsAndEnds(fix: String, s: String): Boolean = 
s.startsWith(fix) && s.endsWith(fix)
+    private def mkChunk(stemmer: NCStemmer, chunk: String): NCSynonymChunk = {
+        def stripSuffix(fix: String, s: String): String = s.slice(fix.length, 
s.length - fix.length)
+
+        // Regex synonym.
+        if (startsAndEnds(REGEX_FIX, chunk)) {
+            val ptrn = stripSuffix(REGEX_FIX, chunk)
+            if (ptrn.nonEmpty) {
+                try
+                    NCSynonymChunk(kind = REGEX, origText = chunk, regex = 
Pattern.compile(ptrn))
+                catch {
+                    case e: PatternSyntaxException =>
+                        throw new NCException(s"Invalid regex synonym syntax 
detected [" +
+                            s"chunk=$chunk" +
+                            s"]", e)
+                }
+            }
+            else
+                throw new NCException(s"Empty regex synonym detected [" +
+                    s"chunk=$chunk" +
+                    s"]")
+        }
+        // Plain text synonym: stored with its stem for matching.
+        else
+            NCSynonymChunk(kind = TEXT, origText = chunk, wordStem = 
stemmer.stem(chunk))
+    }
+
+    private def getPieces(toks: Seq[NCToken]): Seq[Piece] =
+        combos(toks).map(combo => {
+            val stops = combo.filter(s => s.isStopWord && s != combo.head && s 
!= combo.last)
+            val slides = 
mutable.ArrayBuffer.empty[mutable.ArrayBuffer[NCToken]]
+
+            for (stop <- stops)
+                if slides.nonEmpty && slides.last.last.getIndex + 1 == 
stop.getIndex then
+                    slides.last += stop
+                else
+                    slides += mutable.ArrayBuffer.empty :+ stop
+
+            // Too many stopwords inside skipped.
+            val bigSlides = slides.filter(_.size > 2)
+
+            var stops4Delete =
+                if bigSlides.nonEmpty then
+                    val allBig = bigSlides.flatten
+                    val stops4AllCombs = stops.filter(p => !allBig.contains(p))
+
+                    if stops4AllCombs.nonEmpty then
+                        for (
+                            seq1 <- Range.inclusive(0, 
stops4AllCombs.size).flatMap(stops4AllCombs.combinations);
+                            seq2 <- Range.inclusive(0, 
bigSlides.size).flatMap(bigSlides.combinations)
+                        )
+                        yield seq1 ++ seq2.flatten
+                    else
+                        for (seq <- Range.inclusive(0, 
bigSlides.size).flatMap(bigSlides.combinations))
+                            yield seq.toSeq.flatten
+                else
+                    Range.inclusive(1, stops.size).flatMap(stops.combinations)
+
+            stops4Delete = stops4Delete.filter(seq => 
!seq.contains(combo.head) && !seq.contains(combo.last))
+
+            Piece(combo,  stops4Delete.map(del => combo.filter(t => 
!del.contains(t))).filter(_.nonEmpty))
+        })
+
+import NCSemanticEntityParserImpl._
+
+class NCSemanticEntityParserImpl(stemmer: NCStemmer, macros: Map[String, 
String], elements: Seq[NCSemanticElement]) extends NCEntityParser with 
LazyLogging:
+    private var sortedSyns: Map[Int, Map[String, Seq[NCSynonym]]] = _
+
+    override def start(cfg: NCModelConfig): Unit =
+        val p = new NCMacroParser
+
+        for ((name, body) <- macros) p.addMacro(name, body)
+
+        case class Holder(elemId: String, synonyms: Seq[NCSynonym])
+
+        val all = mutable.ArrayBuffer.empty[Holder]
+
+        elements.foreach(e => {
+            if e.getSynonyms != null then
+                val syns = e.getSynonyms.asScala
+
+                val susp = syns.filter(syn => !syn.contains("//") && 
SUSP_SYNS_CHARS.exists(susp => syn.contains(susp)))
+
+                if susp.nonEmpty then
+                    logger.warn(
+                        s"Suspicious synonyms detected (use of 
${SUSP_SYNS_CHARS.map(s => s"'$s'").mkString(", ")} chars) [" +
+                            s"elementId=${e.getId}, " +
+                            s"synonyms=[${susp.mkString(", ")}]" +
+                            s"]"
+                    )
+
+                // TODO: parsing
+                all += Holder(
+                    e.getId,
+                    syns.
+                        flatMap(p.expand).
+                        map(_.split(" ").map(p => mkChunk(stemmer, 
p)).toIndexedSeq).toSeq.
+                        map(chunks => NCSynonym(false, false, false, null, 
chunks))
+                )
+        })
+
+        // TODO: sort
+        sortedSyns =
+            all.groupBy(_.synonyms.size).map {
+                case (len, hs) =>
+                    len -> hs.groupBy(_.elemId).map { case (elemId, seq) => 
elemId -> seq.flatMap(_.synonyms).toSeq }
+            }
+
+    override def stop(): Unit = sortedSyns = null
+
+    override def parse(req: NCRequest, cfg: NCModelConfig, toks: 
JList[NCToken]): JList[NCEntity] =
+        val cache = mutable.HashSet.empty[Seq[Int]]
+        val entities = mutable.ArrayBuffer.empty[NCEntity]
+
+        def tryMatch(base: Seq[NCToken], toks: Seq[NCToken]): Unit =
+            val idxs = toks.map(_.getIndex)
+
+            if (cache.add(idxs))
+                for ((elemId, syns) <- sortedSyns.getOrElse(toks.size, 
Seq.empty))
+                    var found = false
+
+                    for (s <- syns if !found)
+                        if (s.isMatch(toks))
+                            found = true
+                            entities +=
+                                new NCPropertyMapAdapter with NCEntity:
+                                    override def getTokens: JList[NCToken] = 
base.asJava
+                                    override def getRequestId: String = 
req.getRequestId
+                                    override def getId: String = elemId
+
+        for (piece <- getPieces(toks.asScala.toSeq); extra <- piece.extra)
+            tryMatch(piece.main, extra)
+
+        entities.toSeq.asJava
+
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonym.scala
similarity index 63%
copy from 
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
copy to 
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonym.scala
index 4b9816d..2beed29 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonym.scala
@@ -14,15 +14,21 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+package org.apache.nlpcraft.nlp.entity.parser.semantic.impl
 
-package org.apache.nlpcraft.nlp.entity.parser.semantic;
+import org.apache.nlpcraft.NCToken
 
-import java.io.File;
+case class NCSynonym(
+    isElementId: Boolean,
+    isValueName: Boolean,
+    isDirect: Boolean,
+    value: String = null,
+    chunks: Seq[NCSynonymChunk]
+) {
+    private lazy val stem = ""
 
-public class NCSemanticEntityParser {
-    public NCSemanticEntityParser(File f) {
-    }
+    // TODO: implement.
+    def isMatch(toks: Seq[NCToken]): Boolean =
+        chunks.size == toks.size  && chunks.zip(toks).forall { case (chunk, 
tok) => chunk.wordStem == tok.getStem}
 
-    public NCSemanticEntityParser() {
-    }
 }
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunk.scala
similarity index 52%
copy from 
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
copy to 
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunk.scala
index 4b9816d..86e2c09 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunk.scala
@@ -15,14 +15,29 @@
  * limitations under the License.
  */
 
-package org.apache.nlpcraft.nlp.entity.parser.semantic;
+package org.apache.nlpcraft.nlp.entity.parser.semantic.impl
 
-import java.io.File;
+import org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSynonymChunkKind.NCSynonymChunkKind
 
-public class NCSemanticEntityParser {
-    public NCSemanticEntityParser(File f) {
-    }
+import java.util.regex.Pattern
 
-    public NCSemanticEntityParser() {
-    }
+/**
+ *
+ * @param kind Kind of synonym chunk.
+ * @param origText Original text.
+ * @param wordStem Optional stem for a single word synonyms.
+ * @param posTag Optional PoS tag to match on.
+ * @param regex Optional regex expression to match on.
+ */
+case class NCSynonymChunk(
+    kind: NCSynonymChunkKind,
+    origText: String,
+    wordStem: String = null, // Only for kind == TEXT.
+    posTag: String = null,
+    regex: Pattern = null
+) {
+    require(origText != null)
+    require(kind != null)
+
+    override def toString = s"($origText|$kind)"
 }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValueLoader.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunkKind.scala
similarity index 73%
copy from nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValueLoader.java
copy to nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunkKind.scala
index 0e2d12b..aeceeee 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValueLoader.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunkKind.scala
@@ -15,7 +15,14 @@
  * limitations under the License.
  */
 
-package org.apache.nlpcraft.nlp.entity.parser.semantic;
+package org.apache.nlpcraft.nlp.entity.parser.semantic.impl
 
-public class NCSemanticValueLoader {
+/**
+  * Synonym element type.
+  */
+object NCSynonymChunkKind extends Enumeration {
+    type NCSynonymChunkKind = Value
+    
+    val TEXT: Value = Value // Simple word.
+    val REGEX: Value = Value // RegEx match expression (//[abd]+//).
 }
diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala
new file mode 100644
index 0000000..4126693
--- /dev/null
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.nlp.entity.parser.semantic
+
+import org.apache.nlpcraft.*
+import org.apache.nlpcraft.internal.util.NCUtils
+import org.apache.nlpcraft.nlp.entity.parser.opennlp.NCOpenNlpEntityParser
+import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
+import org.apache.nlpcraft.nlp.util.*
+import org.junit.jupiter.api.*
+
+import java.util
+import scala.collection.mutable
+import scala.concurrent.ExecutionContext
+import scala.jdk.CollectionConverters.*
+import scala.jdk.OptionConverters.RichOptional
+
+/**
+  *
+  */
+class NCSemanticEntityParserSpec:
+    private var tParser: NCEnOpenNlpTokenParser = _
+    private var sParser: NCSemanticEntityParser = _
+
+
+    @BeforeEach
+    def start(): Unit =
+        tParser = NCTestUtils.makeAndStart(NCTestUtils.mkEnParser)
+        sParser =
+            NCTestUtils.makeAndStart(
+                new NCSemanticEntityParser(
+                    new NCEnStemmer,
+                    Seq(
+                        new NCSemanticElement {
+                            override def getId: String = "testId"
+                            override def getSynonyms: util.List[String] = Seq("test").asJava
+                        }
+                    ).asJava
+                )
+            )
+
+    private def checkSingleEntity(txt: String, expected: String): Unit =
+        val req = NCTestRequest(txt)
+        val res = sParser.parse(req, null, tParser.parse(req, null)).asScala.toSeq
+
+        require(res.size == 1)
+
+        NCTestUtils.printEntities(txt, res)
+
+    @Test
+    def test(): Unit =
+        checkSingleEntity("test", "test")

[incubator-nlpcraft] branch NLPCRAFT-472 updated: WIP.

Reply via email to