[incubator-nlpcraft] branch NLPCRAFT-472 updated: WIP.

sergeykamov Thu, 30 Dec 2021 05:23:24 -0800

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-472
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git



The following commit(s) were added to refs/heads/NLPCRAFT-472 by this push:
     new add3aa5  WIP.
add3aa5 is described below

commit add3aa58b1aa15cc26ddc59cd1e5040244aeb512
Author: Sergey Kamov <[email protected]>
AuthorDate: Thu Dec 30 16:23:10 2021 +0300

    WIP.
---
 nlpcraft/pom.xml                                   |  15 ++
 .../parser/semantic/NCSemanticEntityParser.java    |  12 ++
 .../semantic/impl/NCSemanticDataReader.scala       |  99 ++++++++++++
 .../semantic/impl/NCSemanticEntityParserImpl.scala | 175 ++++++++++-----------
 .../parser/semantic/impl/NCSemanticSynonym.scala   |  61 +++++++
 .../impl/NCSemanticSynonymsProcessor.scala         | 126 +++++++++++++++
 .../entity/parser/semantic/impl/NCSynonym.scala    |  55 -------
 .../parser/semantic/impl/NCSynonymChunk.scala      |  45 ------
 .../semantic/NCSemanticEntityParserJsonSpec.scala  |  56 +++++++
 .../semantic/NCSemanticEntityParserYamlSpec.scala  |  55 +++++++
 .../resources/models/alarm_model.json}             |  25 +--
 .../test/resources/models/lightswitch_model.yaml   |  52 ++++++
 pom.xml                                            |  20 +++
 13 files changed, 595 insertions(+), 201 deletions(-)

diff --git a/nlpcraft/pom.xml b/nlpcraft/pom.xml
index e181e50..4cba752 100644
--- a/nlpcraft/pom.xml
+++ b/nlpcraft/pom.xml
@@ -105,6 +105,21 @@
         </dependency>
 
         <dependency>
+            <groupId>com.fasterxml.jackson.dataformat</groupId>
+            <artifactId>jackson-dataformat-yaml</artifactId>
+        </dependency>
+
+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-databind</artifactId>
+        </dependency>
+
+        <dependency>
+            <groupId>com.fasterxml.jackson.module</groupId>
+            <artifactId>jackson-module-scala_3</artifactId>
+        </dependency>
+
+        <dependency>
             <groupId>org.apache.opennlp</groupId>
             <artifactId>opennlp-tools</artifactId>
         </dependency>
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
index 22bdea3..223d5dd 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
@@ -39,6 +39,9 @@ public class NCSemanticEntityParser implements NCEntityParser 
{
      * @param elems
      */
     public NCSemanticEntityParser(NCSemanticTextStemmer stemmer, 
List<NCSemanticElement> elems) {
+        Objects.requireNonNull(stemmer, "Stemmer cannot be null");
+        Objects.requireNonNull(elems, "Elements cannot be null");
+
         impl = NCSemanticEntityParserImpl.apply(stemmer, 
Collections.emptyMap(), elems);
     }
 
@@ -49,6 +52,9 @@ public class NCSemanticEntityParser implements NCEntityParser 
{
      * @param elems
      */
     public NCSemanticEntityParser(NCSemanticTextStemmer stemmer, Map<String, 
String> macros, List<NCSemanticElement> elems) {
+        Objects.requireNonNull(stemmer, "Stemmer cannot be null");
+        Objects.requireNonNull(elems, "Elements cannot be null");
+
         impl = NCSemanticEntityParserImpl.apply(stemmer, macros, elems);
     }
 
@@ -58,6 +64,9 @@ public class NCSemanticEntityParser implements NCEntityParser 
{
      * @param mdlFile
      */
     public NCSemanticEntityParser(NCSemanticTextStemmer stemmer, File mdlFile) 
{
+        Objects.requireNonNull(stemmer, "Stemmer cannot be null");
+        Objects.requireNonNull(mdlFile, "File cannot be null");
+
         impl = NCSemanticEntityParserImpl.apply(stemmer, mdlFile);
     }
 
@@ -67,6 +76,9 @@ public class NCSemanticEntityParser implements NCEntityParser 
{
      * @param mdlSrc
      */
     public NCSemanticEntityParser(NCSemanticTextStemmer stemmer, String 
mdlSrc) {
+        Objects.requireNonNull(stemmer, "Stemmer cannot be null");
+        Objects.requireNonNull(mdlSrc, "Source cannot be null");
+
         impl = NCSemanticEntityParserImpl.apply(stemmer, mdlSrc);
     }
 
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticDataReader.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticDataReader.scala
new file mode 100644
index 0000000..f7c1df5
--- /dev/null
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticDataReader.scala
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nlpcraft.nlp.entity.parser.semantic.impl
+
+import com.fasterxml.jackson.core.JsonParser
+import com.fasterxml.jackson.databind.*
+import com.fasterxml.jackson.dataformat.yaml.*
+import com.fasterxml.jackson.module.scala.DefaultScalaModule
+import org.apache.nlpcraft.*
+import org.apache.nlpcraft.nlp.entity.parser.semantic.*
+import 
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticSourceType.{JSON, 
NCSemanticSourceType, YAML}
+
+import java.io.InputStream
+import java.util
+import java.util.{List as JList, Map as JMap}
+import scala.jdk.CollectionConverters.*
+
+/**
+  * Supported model source formats, detected from the extension of a source name (case-insensitive).
+  */
+private[impl] object NCSemanticSourceType extends Enumeration:
+    type NCSemanticSourceType = Value
+    val JSON, YAML = Value
+
+    def apply(src: String): NCSemanticSourceType =
+        val lc = src.toLowerCase
+
+        if lc.endsWith(".json") || lc.endsWith(".js") then JSON
+        else if lc.endsWith(".yaml") || lc.endsWith(".yml") then YAML
+        else throw new NCException("Unexpected  data type. Expected `yaml` or 
`json` formats.") // TODO:
+
+/**
+  *
+  */
+private[impl] case class NCSemanticData(macros: Map[String, String], elements: 
Seq[NCSemanticElement])
+
+/**
+  * Reads macros and elements of a semantic model from a JSON or YAML stream.
+  */
+private[impl] object NCSemanticDataReader:
+    case class Value(name: String, synonyms: Seq[String])
+    case class Element(
+        id: String, description: String, groups: Seq[String], synonyms: 
Seq[String], values: Seq[Value]
+    )
+    case class Source(macros: Map[String, String], elements: Seq[Element])
+
+    private def emptyList[T]: JList[T] = util.Collections.emptyList()
+    private def nvl[T](seq: Seq[T]): JList[T] = if seq == null then emptyList 
else seq.asJava
+    private def nvlConvert[T, R](seq: Seq[T], to: T => R): JList[R] = if seq 
== null then emptyList else seq.map(to).asJava
+    private def convertValue(v: Value) =
+        new NCSemanticElementValue:
+            override def getName: String = v.name
+            override def getSynonyms: JList[String] = nvl(v.synonyms)
+    private def convertElement(e: Element) =
+        new NCSemanticElement:
+            override def getId: String = e.id
+            override def getGroups: JList[String] = nvl(e.groups)
+            override def getDescription: String = e.description
+            override def getValues: JList[NCSemanticElementValue] = 
nvlConvert(e.values, convertValue)
+            override def getSynonyms: JList[String] = nvl(e.synonyms)
+
+    /**
+      * Deserializes the stream into `Source` and converts it to `NCSemanticData`; unknown properties fail.
+      * @param is Input stream with the model source.
+      * @param typ Source format; selects the plain JSON mapper or the YAML-backed one.
+      * @return Macros and elements; a section that is null in the source becomes an empty collection.
+      */
+    def read(is: InputStream, typ: NCSemanticSourceType): NCSemanticData =
+        val mapper =
+            typ match
+                case JSON => new ObjectMapper()
+                case YAML => new ObjectMapper(new YAMLFactory())
+                case _ => throw new AssertionError(s"Unexpected type: $typ")
+
+        mapper.
+            registerModule(DefaultScalaModule).
+            enable(JsonParser.Feature.ALLOW_COMMENTS).
+            configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, true)
+
+        val src = mapper.readValue(is, classOf[Source])
+
+        NCSemanticData(
+            if src.macros == null then Map.empty else src.macros,
+            if src.elements == null then Seq.empty else 
src.elements.map(convertElement)
+        )
\ No newline at end of file
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
index cbed8d8..665f0a7 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
@@ -21,53 +21,53 @@ import com.typesafe.scalalogging.LazyLogging
 import org.apache.nlpcraft.*
 import org.apache.nlpcraft.internal.makro.NCMacroParser
 import org.apache.nlpcraft.internal.util.NCUtils
-import org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSynonymChunkKind.*
 import org.apache.nlpcraft.nlp.entity.parser.semantic.*
+import 
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticSourceType.*
+import 
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticChunkKind.*
 
-import java.io.File
-import java.util.{List as JList, Map as Jmap}
+import java.io.*
 import java.util.regex.*
+import java.util.{List as JList, Map as Jmap}
 import scala.collection.mutable
 import scala.jdk.CollectionConverters.*
 
 object NCSemanticEntityParserImpl:
     def apply(stemmer: NCSemanticTextStemmer, macros: Jmap[String, String], 
elems: JList[NCSemanticElement]): NCSemanticEntityParserImpl =
-        new NCSemanticEntityParserImpl(stemmer, macros.asScala.toMap, 
elems.asScala.toSeq)
+        require(stemmer != null)
+        require(elems != null) // `macros` is optional: null is mapped to an empty map below.
+
+        new NCSemanticEntityParserImpl(
+            stemmer,
+            macros = if macros == null then Map.empty else 
macros.asScala.toMap,
+            elements = elems.asScala.toSeq
+        )
+
     def apply(stemmer: NCSemanticTextStemmer, mdlFile: File): 
NCSemanticEntityParserImpl =
-        new NCSemanticEntityParserImpl(stemmer, null, null)
+        require(stemmer != null)
+        require(mdlFile != null)
+
+        new NCSemanticEntityParserImpl(
+            stemmer,
+            is = new BufferedInputStream(new FileInputStream(mdlFile)),
+            typ = NCSemanticSourceType(mdlFile.getName)
+        )
+
     def apply(stemmer: NCSemanticTextStemmer, mdlSrc: String): 
NCSemanticEntityParserImpl =
-        new NCSemanticEntityParserImpl(stemmer, null, null)
+        require(stemmer != null)
+        require(mdlSrc != null)
 
-    private final val SUSP_SYNS_CHARS = Seq("?", "*", "+")
-    private final val REGEX_FIX = "//"
+        new NCSemanticEntityParserImpl(
+            stemmer,
+            is = new BufferedInputStream(NCUtils.getStream(mdlSrc)),
+            typ = NCSemanticSourceType(mdlSrc)
+        )
 
     /**
-      * @param main Tokens.
-      * @param extra Variants without stopwords.
+      * @param baseTokens Tokens.
+      * @param variants Variants without stopwords.
       */
-    private case class Piece(main: Seq[NCToken], extra: Seq[Seq[NCToken]])
-
-    private def startsAndEnds(fix: String, s: String): Boolean = 
s.startsWith(fix) && s.endsWith(fix)
-    private def mkChunk(stemmer: NCSemanticTextStemmer, chunk: String): 
NCSynonymChunk =
-        def stripSuffix(fix: String, s: String): String = s.slice(fix.length, 
s.length - fix.length)
-
-        // Regex synonym.
-        if startsAndEnds(REGEX_FIX, chunk) then
-            val ptrn = stripSuffix(REGEX_FIX, chunk)
-            if ptrn.nonEmpty then
-                try
-                    NCSynonymChunk(kind = REGEX, text = chunk, regex = 
Pattern.compile(ptrn))
-                catch
-                    case e: PatternSyntaxException =>
-                        throw new NCException(s"Invalid regex synonym syntax 
detected [" +
-                            s"chunk=$chunk" +
-                            s"]", e)
-            else
-                throw new NCException(s"Empty regex synonym detected [" +
-                    s"chunk=$chunk" +
-                    s"]")
-        else
-            NCSynonymChunk(kind = TEXT, text = chunk, stem = 
stemmer.stem(chunk))
+    private case class Piece(baseTokens: Seq[NCToken], variants: 
Seq[Seq[NCToken]])
+
     /**
       *
       * 1. Prepares combination of tokens (sliding).
@@ -125,74 +125,71 @@ object NCSemanticEntityParserImpl:
 
 import 
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticEntityParserImpl.*
 
+/**
+  *
+  * @param stemmer
+  * @param macros
+  * @param elements
+  */
 class NCSemanticEntityParserImpl(
     stemmer: NCSemanticTextStemmer,
-    macros: Map[String, String],
-    elements: Seq[NCSemanticElement]
+    macros: Map[String, String] = null,
+    elements: Seq[NCSemanticElement] = null,
+    is: InputStream = null,
+    typ: NCSemanticSourceType = null
 ) extends NCEntityParser with LazyLogging:
-    private var sortedSyns: Map[Int, Map[String, Seq[NCSynonym]]] = _
+    require(stemmer != null)
+    require(macros != null && elements != null || is != null && typ != null)
+
+    @volatile private var h: NCSemanticSynonymsHolder = _
 
     override def start(cfg: NCModelConfig): Unit =
-        val p = new NCMacroParser
-
-        for ((name, body) <- macros) p.addMacro(name, body)
-
-        case class Holder(elemId: String, synonyms: Seq[NCSynonym])
-
-        val buf = mutable.ArrayBuffer.empty[Holder]
-
-        elements.foreach(e =>
-            if e.getSynonyms != null then
-                val syns = e.getSynonyms.asScala
-                val susp = syns.filter(syn => !syn.contains("//") && 
SUSP_SYNS_CHARS.exists(susp => syn.contains(susp)))
-
-                if susp.nonEmpty then
-                    logger.warn(
-                        s"Suspicious synonyms detected (use of 
${SUSP_SYNS_CHARS.map(s => s"'$s'").mkString(", ")} chars) [" +
-                            s"elementId=${e.getId}, " +
-                            s"synonyms=[${susp.mkString(", ")}]" +
-                            s"]"
-                    )
-
-                // TODO: NCSynonym + trim for lines etc
-                buf += Holder(
-                    e.getId,
-                    syns.
-                        flatMap(p.expand).
-                        map(t => cfg.getTokenizer.tokenize(cfg, 
t).asScala.map(w => mkChunk(stemmer, w.getText)).toSeq).
-                        // TODO:
-                        toSeq.map(chunks => NCSynonym(false, false, null, 
chunks))
-                )
-
-                // TODO: values, elementID
-        )
+        val (macros, elements) =
+            if (is != null)
+                val src = NCSemanticDataReader.read(is, typ)
+                (src.macros, src.elements)
+            else
+                (this.macros, this.elements)
 
-        sortedSyns =
-            buf.groupBy(_.synonyms.size).map { (len, hs) =>
-               len -> hs.groupBy(_.elemId).map { case (id, seq) => id -> 
seq.flatMap(_.synonyms).toSeq.sorted }
-            }
+        h = NCSemanticSynonymsProcessor.prepare(cfg, stemmer, macros, elements)
 
-    override def stop(): Unit = sortedSyns = null
+    override def stop(): Unit = h = null
 
     override def parse(req: NCRequest, cfg: NCModelConfig, toksList: 
JList[NCToken]): JList[NCEntity] =
-        val cache = mutable.HashSet.empty[Seq[Int]]
-        val ents = mutable.ArrayBuffer.empty[NCEntity]
         val toks = toksList.asScala.toSeq
+        val cache = mutable.HashSet.empty[Seq[Int]] // Variants (tokens 
without stopwords) can be repeated.
+        val ents = mutable.ArrayBuffer.empty[NCEntity]
+
+        for (piece <- getPieces(toks); variant <- Seq(piece.baseTokens) ++ 
piece.variants)
+            def addEntity(elemId: String): Unit =
+                ents +=
+                    new NCPropertyMapAdapter with NCEntity:
+                        override def getTokens: JList[NCToken] = 
piece.baseTokens.asJava
+                        override def getRequestId: String = req.getRequestId
+                        override def getId: String = elemId
 
-        for (piece <- getPieces(toks); extra <- Seq(piece.main) ++ piece.extra)
-            val idxs = toks.map(_.getIndex)
+            val idxs = variant.map(_.getIndex)
 
             if cache.add(idxs) then
-                for ((id, syns) <- sortedSyns.getOrElse(toks.size, Seq.empty))
-                    var found = false
-
-                    for (s <- syns if !found)
-                        if s.isMatch(toks) then
-                            found = true
-                            ents +=
-                                new NCPropertyMapAdapter with NCEntity:
-                                    override def getTokens: JList[NCToken] = 
piece.main.asJava
-                                    override def getRequestId: String = 
req.getRequestId
-                                    override def getId: String = id
+                h.textSynonyms.get(variant.map(_.getStem).mkString(" ")) match
+                    case Some(elemIds) => elemIds.foreach(addEntity)
+                    case None =>
+                        for ((elemId, syns) <- 
h.mixedSynonyms.getOrElse(variant.size, Seq.empty))
+                            var found = false
+
+                            for (s <- syns if !found)
+                                found =
+                                    s.chunks.zip(variant).
+                                        sortBy { case (chunk, _) => if 
chunk.isText then 0 else 1 }.
+                                        forall { case (chunk, tok) =>
+                                            if chunk.isText then
+                                                chunk.stem == tok.getStem
+                                            else
+                                                def match0(txt: String) = 
chunk.regex.matcher(txt).matches()
+
+                                                match0(tok.getText) || 
match0(tok.getText.toLowerCase)
+                                        }
+                                if (found)
+                                    addEntity(elemId)
 
         ents.toSeq.asJava
\ No newline at end of file
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonym.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonym.scala
new file mode 100644
index 0000000..fbdfe38
--- /dev/null
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonym.scala
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nlpcraft.nlp.entity.parser.semantic.impl
+
+import org.apache.nlpcraft.NCToken
+import 
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticChunkKind.*
+
+import java.util.regex.Pattern
+
+/**
+  * Kind of a synonym chunk: plain text or a regular expression.
+  */
+private[impl] object NCSemanticChunkKind extends Enumeration:
+    type NCSemanticChunkKind = Value
+    val TEXT, REGEX = Value
+
+/**
+  * Single chunk of a synonym: either plain text (compared by stem) or a regex.
+  * @param kind Kind of synonym chunk.
+  * @param text Original text.
+  * @param stem Optional stem for a single word synonyms.
+  * @param regex Optional regex expression to match on.
+  */
+private[impl] case class NCSemanticSynonymChunk(
+    kind: NCSemanticChunkKind, text: String, stem: String = null, regex: 
Pattern = null
+) {
+    require(text != null && kind != null)
+
+    val isText: Boolean = kind == TEXT // `text` is never null here, so it cannot tell the kinds apart.
+
+    override def toString = s"($text|$kind)"
+}
+
+private[impl] case class NCSemanticSynonym(
+    chunks: Seq[NCSemanticSynonymChunk], isElementId: Boolean = false, 
isValueName: Boolean = false, value: String = null,
+) extends Comparable[NCSemanticSynonym]:
+    require(chunks != null)
+    require(chunks.nonEmpty)
+    if isElementId then require(!isValueName && value == null)
+    if isValueName || value != null then require(!isElementId)
+
+    final val size = chunks.size
+    private final val regexCount = size - chunks.count(_.kind == TEXT)
+    final val isText = regexCount == 0
+    final lazy val stem = if isText then chunks.map(_.stem).mkString(" ") else 
null
+
+    override def compareTo(o: NCSemanticSynonym): Int = 
Integer.compare(regexCount, o.regexCount)
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
new file mode 100644
index 0000000..3374d8d
--- /dev/null
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nlpcraft.nlp.entity.parser.semantic.impl
+
+import com.fasterxml.jackson.databind.*
+import com.fasterxml.jackson.dataformat.yaml.*
+import com.fasterxml.jackson.module.scala.DefaultScalaModule
+import org.apache.nlpcraft.*
+import org.apache.nlpcraft.internal.makro.NCMacroParser
+import org.apache.nlpcraft.nlp.entity.parser.semantic.*
+import 
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticChunkKind.*
+
+import java.io.InputStream
+import java.util
+import java.util.List as JList
+import java.util.regex.*
+import scala.collection.mutable
+import scala.jdk.CollectionConverters.*
+
+private[impl] case class NCSemanticSynonymsHolder(
+    textSynonyms: Map[String, Set[String]],
+    mixedSynonyms: Map[Int, Map[String, Seq[NCSemanticSynonym]]]
+)
+
+/**
+  *
+  */
+private[impl] object NCSemanticSynonymsProcessor:
+    private final val SUSP_SYNS_CHARS = Seq("?", "*", "+")
+    private final val REGEX_FIX = "//"
+
+    private def validate(macros: Map[String, String], elements: 
Seq[NCSemanticElement]): Unit =
+        () // TODO:
+
+    private def startsAndEnds(fix: String, s: String): Boolean = 
s.startsWith(fix) && s.endsWith(fix)
+    private def mkChunk(stemmer: NCSemanticTextStemmer, chunk: String): 
NCSemanticSynonymChunk =
+        def stripSuffix(fix: String, s: String): String = s.slice(fix.length, 
s.length - fix.length)
+
+        // Regex synonym.
+        if startsAndEnds(REGEX_FIX, chunk) then
+            val ptrn = stripSuffix(REGEX_FIX, chunk)
+            if ptrn.nonEmpty then
+                try
+                    NCSemanticSynonymChunk(kind = REGEX, text = chunk, regex = 
Pattern.compile(ptrn))
+                catch
+                    case e: PatternSyntaxException =>
+                        throw new NCException(s"Invalid regex synonym syntax 
detected [" +
+                            s"chunk=$chunk" +
+                            s"]", e)
+            else
+                throw new NCException(s"Empty regex synonym detected [" +
+                    s"chunk=$chunk" +
+                    s"]")
+        else
+            NCSemanticSynonymChunk(kind = TEXT, text = chunk, stem = 
stemmer.stem(chunk))
+
+    /**
+      *
+      * @param cfg
+      * @param stemmer
+      * @param macros
+      * @param elements
+      * @throws NCException // TODO
+      */
+    def prepare(
+        cfg: NCModelConfig,
+        stemmer: NCSemanticTextStemmer,
+        macros: Map[String, String],
+        elements: Seq[NCSemanticElement]
+    ): NCSemanticSynonymsHolder =
+        validate(macros, elements)
+
+        val p = new NCMacroParser
+
+        for ((name, body) <- macros) p.addMacro(name, body)
+
+        case class Holder(synonym: NCSemanticSynonym, elementId: String)
+
+        val buf = mutable.ArrayBuffer.empty[Holder]
+
+        for (e <- elements)
+            val elemId = e.getId
+
+            def add(syns: Seq[NCSemanticSynonym]): Unit = buf ++= 
syns.map(Holder(_, elemId))
+            def convert(syns: JList[String]): Seq[Seq[NCSemanticSynonymChunk]] 
=
+                syns.asScala.flatMap(p.expand).
+                    map(t => cfg.getTokenizer.tokenize(cfg, t).asScala.map(w 
=> mkChunk(stemmer, w.getText)).toSeq).toSeq
+            def mkSpecChunk(id: String): NCSemanticSynonymChunk = 
NCSemanticSynonymChunk(TEXT, id, stemmer.stem(id))
+
+            // TODO:
+            add(Seq(NCSemanticSynonym(Seq(mkSpecChunk(elemId)), isElementId = 
true)))
+
+            if e.getSynonyms != null then
+                // TODO: NCSynonym + trim for lines etc
+                add(convert(e.getSynonyms).map(chunks => 
NCSemanticSynonym(chunks)))
+
+            if e.getValues != null then
+                for (v <- e.getValues.asScala)
+                    add(Seq(NCSemanticSynonym(Seq(mkSpecChunk(v.getName)), 
isValueName = true, value = v.getName)))
+
+                    if (v.getSynonyms != null)
+                        add(convert(v.getSynonyms).map(chunks => 
NCSemanticSynonym(chunks, value = v.getName)))
+
+        val txtBuf = buf.filter(_.synonym.isText)
+
+        buf --= txtBuf
+
+        val txtSyns = txtBuf.groupBy(_.synonym.stem).map { (stem, hs) => stem 
-> hs.map(_.elementId).toSet }
+        val mixedSyns = buf.groupBy(_.synonym.size).
+            map { case (size, hs) => size -> hs.groupBy(_.elementId).map { 
(id, hs) => id -> hs.map(_.synonym).toSeq } }
+
+        NCSemanticSynonymsHolder(txtSyns, mixedSyns)
\ No newline at end of file
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonym.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonym.scala
deleted file mode 100644
index 0fe259b..0000000
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonym.scala
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *      https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nlpcraft.nlp.entity.parser.semantic.impl
-
-import org.apache.nlpcraft.NCToken
-import org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSynonymChunkKind._
-
-case class NCSynonym(
-    isElementId: Boolean,
-    isValueName: Boolean,
-    value: String = null,
-    chunks: Seq[NCSynonymChunk]
-) extends Comparable[NCSynonym]:
-    private final val size = chunks.size
-    private final val regexCount = size - chunks.count(_.kind == TEXT)
-    private final val isText = regexCount == 0
-
-    private lazy val stem = if isText then chunks.map(_.stem).mkString(" ") 
else null
-
-    def isMatch(toks: Seq[NCToken]): Boolean =
-        size == toks.size && (
-            if isText then
-                stem == toks.map(_.getStem).mkString(" ")
-            else
-                chunks.zip(toks).
-                    sortBy { case (chunk, _) => if chunk.isText then 0 else 1 
}.
-                    forall { (chunk, tok) =>
-                        if chunk.isText then
-                            chunk.stem == tok.getStem
-                        else
-                            def match0(txt: String) = 
chunk.regex.matcher(txt).matches()
-
-                            match0(tok.getText) || 
match0(tok.getText.toLowerCase)
-                    }
-        )
-
-    override def compareTo(o: NCSynonym): Int = Integer.compare(regexCount, 
o.regexCount)
-
-
-
-
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunk.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunk.scala
deleted file mode 100644
index fc782c9..0000000
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunk.scala
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *      https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nlpcraft.nlp.entity.parser.semantic.impl
-
-import 
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSynonymChunkKind.NCSynonymChunkKind
-
-import java.util.regex.Pattern
-
-/**
- *
- * @param kind Kind of synonym chunk.
- * @param text Original text.
- * @param stem Optional stem for a single word synonyms.
- * @param pos Optional PoS tag to match on.
- * @param regex Optional regex expression to match on.
- */
-case class NCSynonymChunk(
-    kind: NCSynonymChunkKind,
-    text: String,
-    stem: String = null, // Only for kind == TEXT.
-    pos: String = null,
-    regex: Pattern = null
-) {
-    require(text != null)
-    require(kind != null)
-
-    val isText: Boolean = text != null
-
-    override def toString = s"($text|$kind)"
-}
diff --git 
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserJsonSpec.scala
 
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserJsonSpec.scala
new file mode 100644
index 0000000..77a002b
--- /dev/null
+++ 
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserJsonSpec.scala
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.nlp.entity.parser.semantic
+
+import org.apache.nlpcraft.*
+import org.apache.nlpcraft.internal.util.NCUtils
+import org.apache.nlpcraft.nlp.entity.parser.opennlp.NCOpenNlpEntityParser
+import org.apache.nlpcraft.nlp.entity.parser.semantic.impl.en.*
+import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
+import org.apache.nlpcraft.nlp.util.*
+import org.apache.nlpcraft.nlp.util.NCTestConfig.*
+import org.junit.jupiter.api.*
+
+import java.util
+import scala.collection.mutable
+import scala.concurrent.ExecutionContext
+import scala.jdk.CollectionConverters.*
+import scala.jdk.OptionConverters.RichOptional
+
+/**
+  *
+  */
+class NCSemanticEntityParserJsonSpec:
+    private var parser: NCSemanticEntityParser = _
+
+    @BeforeEach
+    def start(): Unit =
+        parser =
+            NCTestUtils.makeAndStart(
+                new NCSemanticEntityParser(new NCEnSemanticTextStemmer, "models/alarm_model.json")
+            )
+
+    private def checkSingleEntity(txt: String, expected: String): Unit =
+        val req = NCTestRequest(txt)
+        val res = parser.parse(req, EN_MDL_CFG, EN_PARSER.parse(req, EN_MDL_CFG)).asScala.toSeq
+
+        NCTestUtils.printEntities(txt, res)
+
+    @Test
+    def test(): Unit =
+        checkSingleEntity("Ping me in 3 minutes tomorrow", "test")
\ No newline at end of file
diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserYamlSpec.scala b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserYamlSpec.scala
new file mode 100644
index 0000000..273f7d1
--- /dev/null
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserYamlSpec.scala
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.nlp.entity.parser.semantic
+
+import org.apache.nlpcraft.*
+import org.apache.nlpcraft.internal.util.NCUtils
+import org.apache.nlpcraft.nlp.entity.parser.opennlp.NCOpenNlpEntityParser
+import org.apache.nlpcraft.nlp.entity.parser.semantic.impl.en.*
+import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
+import org.apache.nlpcraft.nlp.util.*
+import org.apache.nlpcraft.nlp.util.NCTestConfig.*
+import org.junit.jupiter.api.*
+
+import java.util
+import scala.collection.mutable
+import scala.concurrent.ExecutionContext
+import scala.jdk.CollectionConverters.*
+import scala.jdk.OptionConverters.RichOptional
+/**
+  *
+  */
+class NCSemanticEntityParserYamlSpec:
+    private var parser: NCSemanticEntityParser = _
+
+    @BeforeEach
+    def start(): Unit =
+        parser =
+            NCTestUtils.makeAndStart(
+                new NCSemanticEntityParser(new NCEnSemanticTextStemmer, "models/lightswitch_model.yaml")
+            )
+
+    private def checkSingleEntity(txt: String, expected: String): Unit =
+        val req = NCTestRequest(txt)
+        val res = parser.parse(req, EN_MDL_CFG, EN_PARSER.parse(req, EN_MDL_CFG)).asScala.toSeq
+
+        NCTestUtils.printEntities(txt, res)
+
+    @Test
+    def test(): Unit =
+        checkSingleEntity("Turn the lights off in the entire house.", "test")
\ No newline at end of file
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunkKind.scala b/nlpcraft/src/test/resources/models/alarm_model.json
similarity index 66%
rename from nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunkKind.scala
rename to nlpcraft/src/test/resources/models/alarm_model.json
index aeceeee..b12bb30 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunkKind.scala
+++ b/nlpcraft/src/test/resources/models/alarm_model.json
@@ -6,7 +6,7 @@
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
- *      https://www.apache.org/licenses/LICENSE-2.0
+ *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
@@ -15,14 +15,15 @@
  * limitations under the License.
  */
 
-package org.apache.nlpcraft.nlp.entity.parser.semantic.impl
-
-/**
-  * Synonym element type.
-  */
-object NCSynonymChunkKind extends Enumeration {
-    type NCSynonymChunkKind = Value
-    
-    val TEXT: Value = Value // Simple word.
-    val REGEX: Value = Value // RegEx match expression (//[abd]+//).
-}
+{
+    "elements": [
+        {
+            "id": "x:alarm",
+            "description": "Alarm token indicator.",
+            "synonyms": [
+                "{ping|buzz|wake|call|hit} {me|up|me up|_}",
+                "{set|_} {my|_} {wake|wake up|_} {alarm|timer|clock|buzzer|call} {clock|_} {up|_}"
+            ]
+        }
+    ]
+}
\ No newline at end of file
diff --git a/nlpcraft/src/test/resources/models/lightswitch_model.yaml b/nlpcraft/src/test/resources/models/lightswitch_model.yaml
new file mode 100644
index 0000000..e063e25
--- /dev/null
+++ b/nlpcraft/src/test/resources/models/lightswitch_model.yaml
@@ -0,0 +1,52 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+macros:
+  "<ACTION>" : "{turn|switch|dial|let|set|get|put}"
+  "<KILL>" : "{shut|kill|stop|eliminate}"
+  "<ENTIRE_OPT>" : "{entire|full|whole|total|_}"
+  "<FLOOR_OPT>" : "{upstairs|downstairs|{1st|first|2nd|second|3rd|third|4th|fourth|5th|fifth|top|ground} floor|_}"
+  "<TYPE>" : "{room|closet|attic|loft|{store|storage} {room|_}}"
+  "<LIGHT>" : "{all|_} {it|them|light|illumination|lamp|lamplight}"
+
+elements:
+  - id: "ls:loc"
+    description: "Location of lights."
+    synonyms:
+      - "<ENTIRE_OPT> <FLOOR_OPT> {kitchen|library|closet|garage|office|playroom|{dinning|laundry|play} <TYPE>}"
+      - "<ENTIRE_OPT> <FLOOR_OPT> {master|kid|children|child|guest|_} {bedroom|bathroom|washroom|storage} {<TYPE>|_}"
+      - "<ENTIRE_OPT> {house|home|building|{1st|first} floor|{2nd|second} floor}"
+
+  - id: "ls:on"
+    groups:
+      - "act"
+    description: "Light switch ON action."
+    synonyms:
+      - "<ACTION> {on|up|_} <LIGHT> {on|up|_}"
+      - "<LIGHT> {on|up}"
+
+  - id: "ls:off"
+    groups:
+      - "act"
+    description: "Light switch OFF action."
+    synonyms:
+      - "<ACTION> <LIGHT> {off|out|down}"
+      - "{<ACTION>|<KILL>} {off|out|down} <LIGHT>"
+      - "<KILL> <LIGHT>"
+      - "<LIGHT> <KILL>"
+      - "{out|no|off|down} <LIGHT>"
+      - "<LIGHT> {out|off|down}"
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index 37f93be..c156505 100644
--- a/pom.xml
+++ b/pom.xml
@@ -105,6 +105,8 @@
         <junit.ver>5.8.2</junit.ver>
         <scalatest.ver>3.2.9</scalatest.ver>
         <gson.ver>2.8.5</gson.ver>
+        <jackson.ver>2.13.1</jackson.ver>
+        <jackson.yaml.ver>2.13.1</jackson.yaml.ver>
         <apache.opennlp.ver>1.9.4</apache.opennlp.ver>
         <jmh.version>1.33</jmh.version>
 
@@ -170,6 +172,24 @@
             </dependency>
 
             <dependency>
+                <groupId>com.fasterxml.jackson.dataformat</groupId>
+                <artifactId>jackson-dataformat-yaml</artifactId>
+                <version>${jackson.ver}</version>
+            </dependency>
+
+            <dependency>
+                <groupId>com.fasterxml.jackson.core</groupId>
+                <artifactId>jackson-databind</artifactId>
+                <version>${jackson.ver}</version>
+            </dependency>
+
+            <dependency>
+                <groupId>com.fasterxml.jackson.module</groupId>
+                <artifactId>jackson-module-scala_3</artifactId>
+                <version>${jackson.ver}</version>
+            </dependency>
+
+            <dependency>
                 <groupId>org.antlr</groupId>
                 <artifactId>antlr4-runtime</artifactId>
                 <version>${org.antlr4.ver}</version>

[incubator-nlpcraft] branch NLPCRAFT-472 updated: WIP.

Reply via email to