This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-472
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-472 by this push:
new 956f544 WIP.
956f544 is described below
commit 956f544f11278df76bb8ad028df4b8dabb6471ec
Author: Sergey Kamov <[email protected]>
AuthorDate: Wed Dec 29 19:13:58 2021 +0300
WIP.
---
.../opennlp/impl/NCOpenNlpEntityParserImpl.scala | 3 +-
...CSemanticEntityParser.java => NCEnStemmer.java} | 13 +-
.../entity/parser/semantic/NCSemanticElement.java | 300 ++++++++++++++++++++-
.../parser/semantic/NCSemanticEntityParser.java | 41 ++-
.../entity/parser/semantic/NCSemanticValue.java | 5 +-
.../parser/semantic/NCSemanticValueLoader.java | 5 +-
.../{NCSemanticValueLoader.java => NCStemmer.java} | 3 +-
.../semantic/impl/NCSemanticEntityParserImpl.scala | 184 +++++++++++++
.../NCSynonym.scala} | 20 +-
.../NCSynonymChunk.scala} | 29 +-
.../NCSynonymChunkKind.scala} | 11 +-
.../semantic/NCSemanticEntityParserSpec.scala | 67 +++++
12 files changed, 650 insertions(+), 31 deletions(-)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala
index dd3e24e..7c7be26 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala
@@ -72,12 +72,11 @@ class NCOpenNlpEntityParserImpl(is: InputStream, res:
String) extends NCEntityPa
lazy val i2 = calcIndex(_.end)
Option.when(i1 != -1 && i2 != -1)(
- new NCPropertyMapAdapter with NCEntity {
+ new NCPropertyMapAdapter with NCEntity:
put(s"opennlp:${h.name}:probability", h.probability)
override def getTokens: JList[NCToken] = toksSeq.flatMap(t
=> Option.when(t.getIndex >= i1 && t.getIndex <= i2)(t)).asJava
override def getRequestId: String = req.getRequestId
override def getId: String = s"opennlp:${h.name}"
- }
)
).toSeq.asJava
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCEnStemmer.java
similarity index 78%
copy from
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
copy to
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCEnStemmer.java
index 4b9816d..f1e77ca 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCEnStemmer.java
@@ -17,12 +17,13 @@
package org.apache.nlpcraft.nlp.entity.parser.semantic;
-import java.io.File;
+import opennlp.tools.stemmer.PorterStemmer;
-public class NCSemanticEntityParser {
- public NCSemanticEntityParser(File f) {
- }
-
- public NCSemanticEntityParser() {
+public class NCEnStemmer implements NCStemmer {
+ private final PorterStemmer s = new PorterStemmer();
+
+ @Override
+ public String stem(String word) {
+ return s.stem(word);
}
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElement.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElement.java
index de8b60c..cd59b38 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElement.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElement.java
@@ -17,5 +17,303 @@
package org.apache.nlpcraft.nlp.entity.parser.semantic;
-public class NCSemanticElement {
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+
+public interface NCSemanticElement {
+ /**
+ * Gets unique ID of this element.
+ * <p>
+ * This unique ID should be human-readable for simpler debugging and
testing of the model.
+ * Although element ID could be any arbitrary string it is highly
recommended having
+ * element ID as a lower case string starting with some model prefix,
followed by colon and
+ * then the element's name. For example, some built-in NLPCraft IDs are:
<code>nlpcraft:date</code>,
+ * <code>nlpcraft:city</code>.
+ * <p>
+ * Few important notes:
+ * <ul>
+ * <li>Element IDs starting with <code>nlpcraft:</code> are reserved
for built-in NLPCraft IDs.</li>
+ * <li>
+ * Element ID is an implicit synonym for that element.
+ * Thus element ID can be used in the user input directly to
clearly
+ * disambiguate the element in the input sentence instead of
relying on synonyms or other
+ * ways of detection.
+ * </li>
+ * </ul>
+ * <p>
+ * <b>JSON</b>
+ * <br>
+ * If using JSON/YAML model presentation this is set by <code>id</code>
property:
+ * <pre class="brush: js, highlight: [3]">
+ * "elements": [
+ * {
+ * "id": "phone:act",
+ * "description": "Phone action.",
+ * "synonyms": [
+ * "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
+ * ]
+ * }
+ * ]
+ * </pre>
+ *
+ * @see NCToken#getId()
+ * @return Unique ID of this element.
+ */
+ String getId();
+
+ /**
+ * Gets the list of groups this element belongs to.
+ * <p>
+ * Model element can belong to one or more groups. By default, the element
belongs to a single group whose group
+ * ID is equal to its {@link #getId() ID}. The proper grouping of the
model elements is required for operation
+ * of Short-Term-Memory (STM) in {@link NCConversation conversation} (if
and when conversation
+ * is used). Specifically, a token (i.e. found model element) that is part
of the group set will override
+ * other tokens from the same set or its superset. In other words, tokens
with a smaller group set
+ * (more specific token) will override the tokens from a larger group set
(more generic tokens).
+ * <p>
+ * Note that built-in tokens (including from 3rd party token providers)
belong to a single group whose group
+ * ID is equal to their IDs.
+ * <p>
+ * <b>JSON</b>
+ * <br>
+ * If using JSON/YAML model presentation this is set by
<code>groups</code> property:
+ * <pre class="brush: js, highlight: [5]">
+ * "elements": [
+ * {
+ * "id": "phone:act",
+ * "description": "Phone action.",
+ * "groups": ["group1", "group2"],
+ * "synonyms": [
+ * "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
+ * ]
+ * }
+ * ]
+ * </pre>
+ *
+ * @return List of groups this element belongs to. By default, the model
element belongs to one group
+ * with ID equal to the element {@link #getId() ID}.
+ * @see NCConversation
+ * @see #getId()
+ */
+ default List<String> getGroups() {
+ return Collections.singletonList(getId());
+ }
+
+ /**
+ * Shortcut method to test if this element is a member of given group. It
is equivalent to:
+ * <pre class="brush: java">
+ * return getGroups().contains(grp);
+ * </pre>
+ *
+ * @param grp Token group to test.
+ * @return {@code True} if this element belongs to the given group, {@code
false} otherwise.
+ */
+ default boolean isMemberOf(String grp) {
+ return getGroups().contains(grp);
+ }
+
+ /**
+ * Gets optional user-defined element's metadata. When a {@link NCToken
token} for this element
+ * is detected in the input this metadata is merged into {@link
NCToken#getMetadata()} method returned metadata.
+ * <p>
+ * <b>JSON</b>
+ * <br>
+ * If using JSON/YAML model presentation this is set by
<code>metadata</code> property:
+ * <pre class="brush: js, highlight: [8,9,10,11,12]">
+ * "elements": [
+ * {
+ * "id": "phone:act",
+ * "description": "Phone action.",
+ * "synonyms": [
+ * "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
+ * ],
+ * "metadata": {
+ * "str": "val1",
+ * "num": 100,
+ * "bool": false
+ * }
+ * }
+ * ]
+ * </pre>
+ *
+ * @return Element's metadata or empty collection if none provided.
Default implementation returns empty collection.
+ */
+ default Map<String, Object> getMetadata() {
+ return Collections.emptyMap();
+ }
+
+ /**
+ * Gets optional element description.
+ * <p>
+ * <b>JSON</b>
+ * <br>
+ * If using JSON/YAML model presentation this is set by
<code>description</code> property:
+ * <pre class="brush: js, highlight: [4]">
+ * "elements": [
+ * {
+ * "id": "phone:act",
+ * "description": "Phone action.",
+ * "synonyms": [
+ * "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
+ * ]
+ * }
+ * ]
+ * </pre>
+ *
+ * @return Optional element description. Default implementation returns
{@code null}.
+ */
+ default String getDescription() {
+ return null;
+ }
+
+ /**
+ * Gets optional list of {@link NCSemanticValue values} for this element.
+ * <p>
+ * Each element can generally be recognized either by one of its synonyms
or values. Elements and their values
+ * are analogous to types and instances of that type in programming
languages. Each value
+ * has a name and optional set of its own synonyms by which that value,
and ultimately its element, can be
+ * recognized. Note that value name itself acts as an implicit synonym
even when no additional synonyms added
+ * for that value.
+ * <p>
+ * Consider this example. A model element {@code x:car} can have:
+ * <ul>
+ * <li>
+ * Set of general synonyms:
+ * <code>{transportation|transport|_}
{vehicle|car|sedan|auto|automobile|suv|crossover|coupe|truck}</code>
+ * </li>
+ * <li>Set of values:
+ * <ul>
+ * <li>{@code mercedes} with synonyms {@code (mercedes,
mercedes-benz, mb, benz)}</li>
+ * <li>{@code bmw} with synonyms {@code (bmw, bimmer)}</li>
+ * <li>{@code chevrolet} with synonyms {@code (chevy,
chevrolet)}</li>
+ * </ul>
+ * </li>
+ * </ul>
+ * With that setup {@code x:car} element will be recognized by any of the
following input sub-string:
+ * <ul>
+ * <li>{@code transport car}</li>
+ * <li>{@code benz}</li>
+ * <li>{@code automobile}</li>
+ * <li>{@code transport vehicle}</li>
+ * <li>{@code sedan}</li>
+ * <li>{@code chevy}</li>
+ * <li>{@code bimmer}</li>
+ * <li>{@code x:car}</li>
+ * </ul>
+ * <p>
+ * <b>JSON</b>
+ * <br>
+ * If using JSON/YAML model presentation this is set by
<code>values</code> property:
+ * <pre class="brush: js, highlight: [8,9,10,11,12,13]">
+ * "elements": [
+ * {
+ * "id": "phone:act",
+ * "description": "Phone action.",
+ * "synonyms": [
+ * "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
+ * ],
+ * "values": [
+ * {
+ * "name": "name1",
+ * "synonyms": ["syn1", "syn2"]
+ * }
+ * ]
+ * }
+ * ]
+ * </pre>
+ *
+ * @return List of values for this element. Default implementation returns an
empty list.
+ */
+ default List<NCSemanticValue> getValues() {
+ return Collections.emptyList();
+ }
+
+ /**
+ * Gets optional ID of the immediate parent element. Parent ID allows
model elements to form into hierarchy.
+ * <p>
+ * <b>JSON</b>
+ * <br>
+ * If using JSON/YAML model presentation this is set by
<code>parentId</code> property:
+ * <pre class="brush: js, highlight: [5]">
+ * "elements": [
+ * {
+ * "id": "phone:act",
+ * "description": "Phone action.",
+ * "parentId": "parent",
+ * "synonyms": [
+ * "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
+ * ]
+ * }
+ * ]
+ * </pre>
+ *
+ * @return Optional parent element ID, or {@code null} if not specified.
Default implementation returns
+ * {@code null}.
+ */
+ default String getParentId() {
+ return null;
+ }
+
+ /**
+ * Gets the list of synonyms by which this model element will be
recognized. Read more about
+ * many forms of synonyms in <a target=_
href="https://nlpcraft.apache.org/data-model.html">Data Model</a> section
+ * and review <a target=_
href="https://github.com/apache/incubator-nlpcraft/tree/master/nlpcraft-examples">examples</a>.
+ * <p>
+ * <b>JSON</b>
+ * <br>
+ * If using JSON/YAML model presentation this is set by
<code>synonyms</code> property:
+ * <pre class="brush: js, highlight: [5,6,7]">
+ * "elements": [
+ * {
+ * "id": "phone:act",
+ * "description": "Phone action.",
+ * "synonyms": [
+ * "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
+ * ]
+ * }
+ * ]
+ * </pre>
+ *
+ * @return List of synonyms for this element. List is generally optional
since element's ID acts
+ * as an implicit synonym. Default implementation returns an empty
list.
+ */
+ default List<String> getSynonyms() {
+ return Collections.emptyList();
+ }
+
+ /**
+ * Gets optional dynamic value loader. This loader will be used
additionally to any
+ * values defined in {@link #getValues()} method. Default implementation
returns an empty {@code Optional}.
+ * <p>
+ * <b>JSON</b>
+ * <br>
+ * If using JSON/YAML model presentation this is set by
<code>valueLoader</code> property with value
+ * of a fully qualified class name implementing {@link NCSemanticValueLoader}
interface. Note that
+ * only one instance of the value loader will be created per model and
given class name:
+ * <pre class="brush: js, highlight: [14]">
+ * "elements": [
+ * {
+ * "id": "phone:act",
+ * "description": "Phone action.",
+ * "synonyms": [
+ * "{give|_} {call|phone|ring|dial|dial up|ping|contact}"
+ * ],
+ * "values": [
+ * {
+ * "name": "name1",
+ * "synonyms": ["syn1", "syn2"]
+ * }
+ * ],
+ * "valueLoader": "my.package.ValueLoader"
+ * }
+ * ]
+ * </pre>
+ *
+ * @return Optional instance of dynamic value loader.
+ */
+ default Optional<NCSemanticValueLoader> getValueLoader() {
+ return Optional.empty();
+ }
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
index 4b9816d..09e83fd 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
@@ -17,12 +17,47 @@
package org.apache.nlpcraft.nlp.entity.parser.semantic;
+import org.apache.nlpcraft.NCEntity;
+import org.apache.nlpcraft.NCEntityParser;
+import org.apache.nlpcraft.NCModelConfig;
+import org.apache.nlpcraft.NCRequest;
+import org.apache.nlpcraft.NCToken;
+import
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticEntityParserImpl;
+
import java.io.File;
+import java.util.*;
+
+public class NCSemanticEntityParser implements NCEntityParser {
+ private final NCSemanticEntityParserImpl impl;
+
+ public NCSemanticEntityParser(NCStemmer stemmer, List<NCSemanticElement>
elems) {
+ impl = NCSemanticEntityParserImpl.apply(stemmer,
Collections.emptyMap(), elems);
+ }
+
+ public NCSemanticEntityParser(NCStemmer stemmer, Map<String, String>
macros, List<NCSemanticElement> elems) {
+ impl = NCSemanticEntityParserImpl.apply(stemmer, macros, elems);
+ }
+
+ public NCSemanticEntityParser(NCStemmer stemmer, File elemsFile,
Map<String, NCSemanticValueLoader> valsLoaders) {
+ impl = NCSemanticEntityParserImpl.apply(stemmer, elemsFile,
valsLoaders);
+ }
+
+ public NCSemanticEntityParser(NCStemmer stemmer, String elemsSrc,
Map<String, NCSemanticValueLoader> valsLoaders) {
+ impl = NCSemanticEntityParserImpl.apply(stemmer, elemsSrc,
valsLoaders);
+ }
+
+ @Override
+ public List<NCEntity> parse(NCRequest req, NCModelConfig cfg,
List<NCToken> toks) {
+ return impl.parse(req, cfg, toks);
+ }
-public class NCSemanticEntityParser {
- public NCSemanticEntityParser(File f) {
+ @Override
+ public void start(NCModelConfig cfg) {
+ impl.start(cfg);
}
- public NCSemanticEntityParser() {
+ @Override
+ public void stop() {
+ impl.stop();
}
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValue.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValue.java
index fe8178a..5f45a79 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValue.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValue.java
@@ -17,5 +17,8 @@
package org.apache.nlpcraft.nlp.entity.parser.semantic;
-public class NCSemanticValue {
+import java.util.List;
+
+public interface NCSemanticValue {
+ List<String> getSynonyms();
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValueLoader.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValueLoader.java
index 0e2d12b..83e83fd 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValueLoader.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValueLoader.java
@@ -17,5 +17,8 @@
package org.apache.nlpcraft.nlp.entity.parser.semantic;
-public class NCSemanticValueLoader {
+import java.util.Set;
+
+public interface NCSemanticValueLoader {
+ Set<NCSemanticValue> load();
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValueLoader.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCStemmer.java
similarity index 93%
copy from
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValueLoader.java
copy to
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCStemmer.java
index 0e2d12b..11d5e8c 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValueLoader.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCStemmer.java
@@ -17,5 +17,6 @@
package org.apache.nlpcraft.nlp.entity.parser.semantic;
-public class NCSemanticValueLoader {
+public interface NCStemmer {
+ String stem(String word);
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
new file mode 100644
index 0000000..841e6f3
--- /dev/null
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nlpcraft.nlp.entity.parser.semantic.impl
+
+
+import java.util.List as JList
+import java.util.Map as Jmap
+import java.io.File
+import org.apache.nlpcraft.nlp.entity.parser.semantic.{NCSemanticElement, *}
+import org.apache.nlpcraft.*
+import org.apache.nlpcraft.internal.makro.NCMacroParser
+import org.apache.nlpcraft.internal.util.NCUtils
+
+import java.util.regex.{Pattern, PatternSyntaxException}
+import scala.collection.mutable
+import scala.jdk.CollectionConverters.*
+import NCSynonymChunkKind.*
+import com.typesafe.scalalogging.LazyLogging
+
+object NCSemanticEntityParserImpl:
+ def apply(stemmer: NCStemmer, macros: Jmap[String, String], elems:
JList[NCSemanticElement]): NCSemanticEntityParserImpl =
+ new NCSemanticEntityParserImpl(stemmer, macros.asScala.toMap,
elems.asScala.toSeq)
+ def apply(stemmer: NCStemmer, elemsFile: File, valsLoaders: Jmap[String,
NCSemanticValueLoader]): NCSemanticEntityParserImpl =
+ new NCSemanticEntityParserImpl(stemmer, null, null)
+ def apply(stemmer: NCStemmer, elemsSrc: String, valsLoaders: Jmap[String,
NCSemanticValueLoader]): NCSemanticEntityParserImpl =
+ new NCSemanticEntityParserImpl(stemmer, null, null)
+
+ private final val SUSP_SYNS_CHARS = Seq("?", "*", "+")
+ private final val REGEX_FIX = "//"
+
+ private case class Piece(main: Seq[NCToken], extra: Seq[Seq[NCToken]])
+
+ private def combos[T](toks: Seq[T]): Seq[Seq[T]] =
+ (for (n <- toks.size until 0 by -1) yield
toks.sliding(n)).flatten.map(p => p)
+
+ private def startsAndEnds(fix: String, s: String): Boolean =
s.startsWith(fix) && s.endsWith(fix)
+ private def mkChunk(stemmer: NCStemmer, chunk: String): NCSynonymChunk = {
+ def stripSuffix(fix: String, s: String): String = s.slice(fix.length,
s.length - fix.length)
+
+ // Regex synonym.
+ if (startsAndEnds(REGEX_FIX, chunk)) {
+ val ptrn = stripSuffix(REGEX_FIX, chunk)
+ if (ptrn.nonEmpty) {
+ try
+ NCSynonymChunk(kind = REGEX, origText = chunk, regex =
Pattern.compile(ptrn))
+ catch {
+ case e: PatternSyntaxException =>
+ throw new NCException(s"Invalid regex synonym syntax
detected [" +
+ s"chunk=$chunk" +
+ s"]", e)
+ }
+ }
+ else
+ throw new NCException(s"Empty regex synonym detected [" +
+ s"chunk=$chunk" +
+ s"]")
+ }
+ // Plain text synonym.
+ else
+ NCSynonymChunk(kind = TEXT, origText = chunk, wordStem =
stemmer.stem(chunk))
+ }
+
+ private def getPieces(toks: Seq[NCToken]): Seq[Piece] =
+ combos(toks).map(combo => {
+ val stops = combo.filter(s => s.isStopWord && s != combo.head && s
!= combo.last)
+ val slides =
mutable.ArrayBuffer.empty[mutable.ArrayBuffer[NCToken]]
+
+ for (stop <- stops)
+ if slides.nonEmpty && slides.last.last.getIndex + 1 ==
stop.getIndex then
+ slides.last += stop
+ else
+ slides += mutable.ArrayBuffer.empty :+ stop
+
+ // Too many stopwords inside skipped.
+ val bigSlides = slides.filter(_.size > 2)
+
+ var stops4Delete =
+ if bigSlides.nonEmpty then
+ val allBig = bigSlides.flatten
+ val stops4AllCombs = stops.filter(p => !allBig.contains(p))
+
+ if stops4AllCombs.nonEmpty then
+ for (
+ seq1 <- Range.inclusive(0,
stops4AllCombs.size).flatMap(stops4AllCombs.combinations);
+ seq2 <- Range.inclusive(0,
bigSlides.size).flatMap(bigSlides.combinations)
+ )
+ yield seq1 ++ seq2.flatten
+ else
+ for (seq <- Range.inclusive(0,
bigSlides.size).flatMap(bigSlides.combinations))
+ yield seq.toSeq.flatten
+ else
+ Range.inclusive(1, stops.size).flatMap(stops.combinations)
+
+ stops4Delete = stops4Delete.filter(seq =>
!seq.contains(combo.head) && !seq.contains(combo.last))
+
+ Piece(combo, stops4Delete.map(del => combo.filter(t =>
!del.contains(t))).filter(_.nonEmpty))
+ })
+
+import NCSemanticEntityParserImpl._
+
+class NCSemanticEntityParserImpl(stemmer: NCStemmer, macros: Map[String,
String], elements: Seq[NCSemanticElement]) extends NCEntityParser with
LazyLogging:
+ private var sortedSyns: Map[Int, Map[String, Seq[NCSynonym]]] = _
+
+ override def start(cfg: NCModelConfig): Unit =
+ val p = new NCMacroParser
+
+ for ((name, body) <- macros) p.addMacro(name, body)
+
+ case class Holder(elemId: String, synonyms: Seq[NCSynonym])
+
+ val all = mutable.ArrayBuffer.empty[Holder]
+
+ elements.foreach(e => {
+ if e.getSynonyms != null then
+ val syns = e.getSynonyms.asScala
+
+ val susp = syns.filter(syn => !syn.contains("//") &&
SUSP_SYNS_CHARS.exists(susp => syn.contains(susp)))
+
+ if susp.nonEmpty then
+ logger.warn(
+ s"Suspicious synonyms detected (use of
${SUSP_SYNS_CHARS.map(s => s"'$s'").mkString(", ")} chars) [" +
+ s"elementId=${e.getId}, " +
+ s"synonyms=[${susp.mkString(", ")}]" +
+ s"]"
+ )
+
+ // TODO: parsing
+ all += Holder(
+ e.getId,
+ syns.
+ flatMap(p.expand).
+ map(_.split(" ").map(p => mkChunk(stemmer,
p)).toIndexedSeq).toSeq.
+ map(chunks => NCSynonym(false, false, false, null,
chunks))
+ )
+ })
+
+ // TODO: sort
+ sortedSyns =
+ all.groupBy(_.synonyms.size).map {
+ case (len, hs) =>
+ len -> hs.groupBy(_.elemId).map { case (elemId, seq) =>
elemId -> seq.flatMap(_.synonyms).toSeq }
+ }
+
+ override def stop(): Unit = sortedSyns = null
+
+ override def parse(req: NCRequest, cfg: NCModelConfig, toks:
JList[NCToken]): JList[NCEntity] =
+ val cache = mutable.HashSet.empty[Seq[Int]]
+ val entities = mutable.ArrayBuffer.empty[NCEntity]
+
+ def tryMatch(base: Seq[NCToken], toks: Seq[NCToken]): Unit =
+ val idxs = toks.map(_.getIndex)
+
+ if (cache.add(idxs))
+ for ((elemId, syns) <- sortedSyns.getOrElse(toks.size,
Seq.empty))
+ var found = false
+
+ for (s <- syns if !found)
+ if (s.isMatch(toks))
+ found = true
+ entities +=
+ new NCPropertyMapAdapter with NCEntity:
+ override def getTokens: JList[NCToken] =
base.asJava
+ override def getRequestId: String =
req.getRequestId
+ override def getId: String = elemId
+
+ for (piece <- getPieces(toks.asScala.toSeq); extra <- piece.extra)
+ tryMatch(piece.main, extra)
+
+ entities.toSeq.asJava
+
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonym.scala
similarity index 63%
copy from
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
copy to
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonym.scala
index 4b9816d..2beed29 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonym.scala
@@ -14,15 +14,21 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+package org.apache.nlpcraft.nlp.entity.parser.semantic.impl
-package org.apache.nlpcraft.nlp.entity.parser.semantic;
+import org.apache.nlpcraft.NCToken
-import java.io.File;
+case class NCSynonym(
+ isElementId: Boolean,
+ isValueName: Boolean,
+ isDirect: Boolean,
+ value: String = null,
+ chunks: Seq[NCSynonymChunk]
+) {
+ private lazy val stem = ""
-public class NCSemanticEntityParser {
- public NCSemanticEntityParser(File f) {
- }
+ // TODO: implement.
+ def isMatch(toks: Seq[NCToken]): Boolean =
+ chunks.size == toks.size && chunks.zip(toks).forall { case (chunk,
tok) => chunk.wordStem == tok.getStem}
- public NCSemanticEntityParser() {
- }
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunk.scala
similarity index 52%
copy from
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
copy to
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunk.scala
index 4b9816d..86e2c09 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunk.scala
@@ -15,14 +15,29 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.nlp.entity.parser.semantic;
+package org.apache.nlpcraft.nlp.entity.parser.semantic.impl
-import java.io.File;
+import
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSynonymChunkKind.NCSynonymChunkKind
-public class NCSemanticEntityParser {
- public NCSemanticEntityParser(File f) {
- }
+import java.util.regex.Pattern
- public NCSemanticEntityParser() {
- }
+/**
+ *
+ * @param kind Kind of synonym chunk.
+ * @param origText Original text.
+ * @param wordStem Optional stem for single-word synonyms.
+ * @param posTag Optional PoS tag to match on.
+ * @param regex Optional regex expression to match on.
+ */
+case class NCSynonymChunk(
+ kind: NCSynonymChunkKind,
+ origText: String,
+ wordStem: String = null, // Only for kind == TEXT.
+ posTag: String = null,
+ regex: Pattern = null
+) {
+ require(origText != null)
+ require(kind != null)
+
+ override def toString = s"($origText|$kind)"
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValueLoader.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunkKind.scala
similarity index 73%
copy from
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValueLoader.java
copy to
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunkKind.scala
index 0e2d12b..aeceeee 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticValueLoader.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunkKind.scala
@@ -15,7 +15,14 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.nlp.entity.parser.semantic;
+package org.apache.nlpcraft.nlp.entity.parser.semantic.impl
-public class NCSemanticValueLoader {
+/**
+ * Synonym element type.
+ */
+object NCSynonymChunkKind extends Enumeration {
+ type NCSynonymChunkKind = Value
+
+ val TEXT: Value = Value // Simple word.
+ val REGEX: Value = Value // RegEx match expression (//[abd]+//).
}
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala
new file mode 100644
index 0000000..4126693
--- /dev/null
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.nlp.entity.parser.semantic
+
+import org.apache.nlpcraft.*
+import org.apache.nlpcraft.internal.util.NCUtils
+import org.apache.nlpcraft.nlp.entity.parser.opennlp.NCOpenNlpEntityParser
+import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
+import org.apache.nlpcraft.nlp.util.*
+import org.junit.jupiter.api.*
+
+import java.util
+import scala.collection.mutable
+import scala.concurrent.ExecutionContext
+import scala.jdk.CollectionConverters.*
+import scala.jdk.OptionConverters.RichOptional
+
+/**
+ *
+ */
+class NCSemanticEntityParserSpec:
+ private var tParser: NCEnOpenNlpTokenParser = _
+ private var sParser: NCSemanticEntityParser = _
+
+
+ @BeforeEach
+ def start(): Unit =
+ tParser = NCTestUtils.makeAndStart(NCTestUtils.mkEnParser)
+ sParser =
+ NCTestUtils.makeAndStart(
+ new NCSemanticEntityParser(
+ new NCEnStemmer,
+ Seq(
+ new NCSemanticElement {
+ override def getId: String = "testId"
+ override def getSynonyms: util.List[String] =
Seq("test").asJava
+ }
+ ).asJava
+ )
+ )
+
+ private def checkSingleEntity(txt: String, expected: String): Unit =
+ val req = NCTestRequest(txt)
+ val res = sParser.parse(req, null, tParser.parse(req,
null)).asScala.toSeq
+
+ require(res.size == 1)
+
+ NCTestUtils.printEntities(txt, res)
+
+ @Test
+ def test(): Unit =
+ checkSingleEntity("test", "test")