This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-472
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-472 by this push:
new add3aa5 WIP.
add3aa5 is described below
commit add3aa58b1aa15cc26ddc59cd1e5040244aeb512
Author: Sergey Kamov <[email protected]>
AuthorDate: Thu Dec 30 16:23:10 2021 +0300
WIP.
---
nlpcraft/pom.xml | 15 ++
.../parser/semantic/NCSemanticEntityParser.java | 12 ++
.../semantic/impl/NCSemanticDataReader.scala | 99 ++++++++++++
.../semantic/impl/NCSemanticEntityParserImpl.scala | 175 ++++++++++-----------
.../parser/semantic/impl/NCSemanticSynonym.scala | 61 +++++++
.../impl/NCSemanticSynonymsProcessor.scala | 126 +++++++++++++++
.../entity/parser/semantic/impl/NCSynonym.scala | 55 -------
.../parser/semantic/impl/NCSynonymChunk.scala | 45 ------
.../semantic/NCSemanticEntityParserJsonSpec.scala | 56 +++++++
.../semantic/NCSemanticEntityParserYamlSpec.scala | 55 +++++++
.../resources/models/alarm_model.json} | 25 +--
.../test/resources/models/lightswitch_model.yaml | 52 ++++++
pom.xml | 20 +++
13 files changed, 595 insertions(+), 201 deletions(-)
diff --git a/nlpcraft/pom.xml b/nlpcraft/pom.xml
index e181e50..4cba752 100644
--- a/nlpcraft/pom.xml
+++ b/nlpcraft/pom.xml
@@ -105,6 +105,21 @@
</dependency>
<dependency>
+ <groupId>com.fasterxml.jackson.dataformat</groupId>
+ <artifactId>jackson-dataformat-yaml</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-databind</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>com.fasterxml.jackson.module</groupId>
+ <artifactId>jackson-module-scala_3</artifactId>
+ </dependency>
+
+ <dependency>
<groupId>org.apache.opennlp</groupId>
<artifactId>opennlp-tools</artifactId>
</dependency>
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
index 22bdea3..223d5dd 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
@@ -39,6 +39,9 @@ public class NCSemanticEntityParser implements NCEntityParser
{
* @param elems
*/
public NCSemanticEntityParser(NCSemanticTextStemmer stemmer,
List<NCSemanticElement> elems) {
+ // Reject null arguments up front with a named message, before delegating to the implementation.
+ Objects.requireNonNull(stemmer, "Stemmer cannot be null");
+ Objects.requireNonNull(elems, "Elements cannot be null");
+
impl = NCSemanticEntityParserImpl.apply(stemmer,
Collections.emptyMap(), elems);
}
@@ -49,6 +52,9 @@ public class NCSemanticEntityParser implements NCEntityParser
{
* @param elems
*/
public NCSemanticEntityParser(NCSemanticTextStemmer stemmer, Map<String, String> macros, List<NCSemanticElement> elems) {
+ // Reject every null argument up front with a named message. `macros` is checked here as well so that
+ // a null map is reported the same way as the other arguments rather than failing later inside
+ // the implementation factory with an anonymous requirement failure.
+ Objects.requireNonNull(stemmer, "Stemmer cannot be null");
+ Objects.requireNonNull(macros, "Macros cannot be null");
+ Objects.requireNonNull(elems, "Elements cannot be null");
+
impl = NCSemanticEntityParserImpl.apply(stemmer, macros, elems);
}
@@ -58,6 +64,9 @@ public class NCSemanticEntityParser implements NCEntityParser
{
* @param mdlFile
*/
public NCSemanticEntityParser(NCSemanticTextStemmer stemmer, File mdlFile)
{
+ // Reject null arguments up front with a named message, before delegating to the implementation.
+ Objects.requireNonNull(stemmer, "Stemmer cannot be null");
+ Objects.requireNonNull(mdlFile, "File cannot be null");
+
impl = NCSemanticEntityParserImpl.apply(stemmer, mdlFile);
}
@@ -67,6 +76,9 @@ public class NCSemanticEntityParser implements NCEntityParser
{
* @param mdlSrc
*/
public NCSemanticEntityParser(NCSemanticTextStemmer stemmer, String
mdlSrc) {
+ // Reject null arguments up front with a named message, before delegating to the implementation.
+ Objects.requireNonNull(stemmer, "Stemmer cannot be null");
+ Objects.requireNonNull(mdlSrc, "Source cannot be null");
+
impl = NCSemanticEntityParserImpl.apply(stemmer, mdlSrc);
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticDataReader.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticDataReader.scala
new file mode 100644
index 0000000..f7c1df5
--- /dev/null
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticDataReader.scala
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nlpcraft.nlp.entity.parser.semantic.impl
+
+import com.fasterxml.jackson.core.JsonParser
+import com.fasterxml.jackson.databind.*
+import com.fasterxml.jackson.dataformat.yaml.*
+import com.fasterxml.jackson.module.scala.DefaultScalaModule
+import org.apache.nlpcraft.*
+import org.apache.nlpcraft.nlp.entity.parser.semantic.*
+import
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticSourceType.{JSON,
NCSemanticSourceType, YAML}
+
+import java.io.InputStream
+import java.util
+import java.util.{List as JList, Map as JMap}
+import scala.jdk.CollectionConverters.*
+
+/**
+  * Formats in which a semantic model source can be supplied, plus detection of the format
+  * from the source name.
+  */
+private[impl] object NCSemanticSourceType extends Enumeration:
+    type NCSemanticSourceType = Value
+    val JSON, YAML = Value
+
+    /**
+      * Detects the source format from the extension of the given name (case-insensitive).
+      *
+      * @param src Source name (file name or resource path).
+      * @return `JSON` for `.json`/`.js`, `YAML` for `.yaml`/`.yml`.
+      * @throws NCException If the name ends with none of the recognized extensions.
+      */
+    def apply(src: String): NCSemanticSourceType =
+        val name = src.toLowerCase
+
+        // True when the lower-cased name ends with any of the given extensions.
+        def hasExt(exts: String*): Boolean = exts.exists(ext => name.endsWith(ext))
+
+        if hasExt(".json", ".js") then JSON
+        else if hasExt(".yaml", ".yml") then YAML
+        else throw new NCException("Unexpected data type. Expected `yaml` or `json` formats.") // TODO:
+
+/**
+ * Semantic model content as produced by `NCSemanticDataReader.read`.
+ *
+ * @param macros Macros as a name-to-body map.
+ * @param elements Semantic elements.
+ */
+private[impl] case class NCSemanticData(macros: Map[String, String], elements:
Seq[NCSemanticElement])
+
+/**
+ * Reads a semantic model (macros and elements) from a JSON or YAML stream using Jackson
+ * and converts the deserialized data into `NCSemanticElement` instances.
+ */
+private[impl] object NCSemanticDataReader:
+ // Deserialization targets: `Source` is the class handed to Jackson in `read`,
+ // `Element` and `Value` are reached through its fields.
+ case class Value(name: String, synonyms: Seq[String])
+ case class Element(
+ id: String, description: String, groups: Seq[String], synonyms:
Seq[String], values: Seq[Value]
+ )
+ case class Source(macros: Map[String, String], elements: Seq[Element])
+
+ // Null-tolerant conversions: a `null` Scala sequence becomes an empty Java list.
+ private def emptyList[T]: JList[T] = util.Collections.emptyList()
+ private def nvl[T](seq: Seq[T]): JList[T] = if seq == null then emptyList
else seq.asJava
+ private def nvlConvert[T, R](seq: Seq[T], to: T => R): JList[R] = if seq
== null then emptyList else seq.map(to).asJava
+ // Adapts a deserialized value to the `NCSemanticElementValue` interface.
+ private def convertValue(v: Value) =
+ new NCSemanticElementValue:
+ override def getName: String = v.name
+ override def getSynonyms: JList[String] = nvl(v.synonyms)
+ // Adapts a deserialized element to the `NCSemanticElement` interface.
+ private def convertElement(e: Element) =
+ new NCSemanticElement:
+ override def getId: String = e.id
+ override def getGroups: JList[String] = nvl(e.groups)
+ override def getDescription: String = e.description
+ override def getValues: JList[NCSemanticElementValue] =
nvlConvert(e.values, convertValue)
+ override def getSynonyms: JList[String] = nvl(e.synonyms)
+
+ /**
+ * Reads the model from the given stream. The mapper accepts comments in the input
+ * and is configured to fail on unknown properties.
+ *
+ * NOTE(review): the stream is not closed explicitly here; presumably this relies on
+ * Jackson closing the source it reads from - confirm.
+ *
+ * @param is Stream with the model content.
+ * @param typ Format of the content, selects the JSON or YAML mapper.
+ * @return Macros and converted elements; a missing `macros` or `elements` section yields an empty collection.
+ */
+ def read(is: InputStream, typ: NCSemanticSourceType): NCSemanticData =
+ val mapper =
+ typ match
+ case JSON => new ObjectMapper()
+ case YAML => new ObjectMapper(new YAMLFactory())
+ case _ => throw new AssertionError(s"Unexpected type: $typ")
+
+ mapper.
+ registerModule(DefaultScalaModule).
+ enable(JsonParser.Feature.ALLOW_COMMENTS).
+ configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, true)
+
+ val src = mapper.readValue(is, classOf[Source])
+
+ NCSemanticData(
+ if src.macros == null then Map.empty else src.macros,
+ if src.elements == null then Seq.empty else
src.elements.map(convertElement)
+ )
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
index cbed8d8..665f0a7 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
@@ -21,53 +21,53 @@ import com.typesafe.scalalogging.LazyLogging
import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.makro.NCMacroParser
import org.apache.nlpcraft.internal.util.NCUtils
-import org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSynonymChunkKind.*
import org.apache.nlpcraft.nlp.entity.parser.semantic.*
+import
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticSourceType.*
+import
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticChunkKind.*
-import java.io.File
-import java.util.{List as JList, Map as Jmap}
+import java.io.*
import java.util.regex.*
+import java.util.{List as JList, Map as Jmap}
import scala.collection.mutable
import scala.jdk.CollectionConverters.*
object NCSemanticEntityParserImpl:
def apply(stemmer: NCSemanticTextStemmer, macros: Jmap[String, String],
elems: JList[NCSemanticElement]): NCSemanticEntityParserImpl =
- new NCSemanticEntityParserImpl(stemmer, macros.asScala.toMap,
elems.asScala.toSeq)
+ require(stemmer != null)
+ require(macros != null)
+
+ new NCSemanticEntityParserImpl(
+ stemmer,
+ macros = if macros == null then Map.empty else
macros.asScala.toMap,
+ elements = elems.asScala.toSeq
+ )
+
def apply(stemmer: NCSemanticTextStemmer, mdlFile: File):
NCSemanticEntityParserImpl =
- new NCSemanticEntityParserImpl(stemmer, null, null)
+ require(stemmer != null)
+ require(mdlFile != null)
+
+ new NCSemanticEntityParserImpl(
+ stemmer,
+ is = new BufferedInputStream(new FileInputStream(mdlFile)),
+ typ = NCSemanticSourceType(mdlFile.getName)
+ )
+
def apply(stemmer: NCSemanticTextStemmer, mdlSrc: String):
NCSemanticEntityParserImpl =
- new NCSemanticEntityParserImpl(stemmer, null, null)
+ require(stemmer != null)
+ require(mdlSrc != null)
- private final val SUSP_SYNS_CHARS = Seq("?", "*", "+")
- private final val REGEX_FIX = "//"
+ new NCSemanticEntityParserImpl(
+ stemmer,
+ is = new BufferedInputStream(NCUtils.getStream(mdlSrc)),
+ typ = NCSemanticSourceType(mdlSrc)
+ )
/**
- * @param main Tokens.
- * @param extra Variants without stopwords.
+ * @param baseTokens Tokens.
+ * @param variants Variants without stopwords.
*/
- private case class Piece(main: Seq[NCToken], extra: Seq[Seq[NCToken]])
-
- private def startsAndEnds(fix: String, s: String): Boolean =
s.startsWith(fix) && s.endsWith(fix)
- private def mkChunk(stemmer: NCSemanticTextStemmer, chunk: String):
NCSynonymChunk =
- def stripSuffix(fix: String, s: String): String = s.slice(fix.length,
s.length - fix.length)
-
- // Regex synonym.
- if startsAndEnds(REGEX_FIX, chunk) then
- val ptrn = stripSuffix(REGEX_FIX, chunk)
- if ptrn.nonEmpty then
- try
- NCSynonymChunk(kind = REGEX, text = chunk, regex =
Pattern.compile(ptrn))
- catch
- case e: PatternSyntaxException =>
- throw new NCException(s"Invalid regex synonym syntax
detected [" +
- s"chunk=$chunk" +
- s"]", e)
- else
- throw new NCException(s"Empty regex synonym detected [" +
- s"chunk=$chunk" +
- s"]")
- else
- NCSynonymChunk(kind = TEXT, text = chunk, stem =
stemmer.stem(chunk))
+ private case class Piece(baseTokens: Seq[NCToken], variants:
Seq[Seq[NCToken]])
+
/**
*
* 1. Prepares combination of tokens (sliding).
@@ -125,74 +125,71 @@ object NCSemanticEntityParserImpl:
import
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticEntityParserImpl.*
+/**
+ *
+ * @param stemmer
+ * @param macros
+ * @param elements
+ */
class NCSemanticEntityParserImpl(
stemmer: NCSemanticTextStemmer,
- macros: Map[String, String],
- elements: Seq[NCSemanticElement]
+ macros: Map[String, String] = null,
+ elements: Seq[NCSemanticElement] = null,
+ is: InputStream = null,
+ typ: NCSemanticSourceType = null
) extends NCEntityParser with LazyLogging:
- private var sortedSyns: Map[Int, Map[String, Seq[NCSynonym]]] = _
+ require(stemmer != null)
+ require(macros != null && elements != null || is != null && typ != null)
+
+ @volatile private var h: NCSemanticSynonymsHolder = _
override def start(cfg: NCModelConfig): Unit =
- val p = new NCMacroParser
-
- for ((name, body) <- macros) p.addMacro(name, body)
-
- case class Holder(elemId: String, synonyms: Seq[NCSynonym])
-
- val buf = mutable.ArrayBuffer.empty[Holder]
-
- elements.foreach(e =>
- if e.getSynonyms != null then
- val syns = e.getSynonyms.asScala
- val susp = syns.filter(syn => !syn.contains("//") &&
SUSP_SYNS_CHARS.exists(susp => syn.contains(susp)))
-
- if susp.nonEmpty then
- logger.warn(
- s"Suspicious synonyms detected (use of
${SUSP_SYNS_CHARS.map(s => s"'$s'").mkString(", ")} chars) [" +
- s"elementId=${e.getId}, " +
- s"synonyms=[${susp.mkString(", ")}]" +
- s"]"
- )
-
- // TODO: NCSynonym + trim for lines etc
- buf += Holder(
- e.getId,
- syns.
- flatMap(p.expand).
- map(t => cfg.getTokenizer.tokenize(cfg,
t).asScala.map(w => mkChunk(stemmer, w.getText)).toSeq).
- // TODO:
- toSeq.map(chunks => NCSynonym(false, false, null,
chunks))
- )
-
- // TODO: values, elementID
- )
+ val (macros, elements) =
+ if (is != null)
+ val src = NCSemanticDataReader.read(is, typ)
+ (src.macros, src.elements)
+ else
+ (this.macros, this.elements)
- sortedSyns =
- buf.groupBy(_.synonyms.size).map { (len, hs) =>
- len -> hs.groupBy(_.elemId).map { case (id, seq) => id ->
seq.flatMap(_.synonyms).toSeq.sorted }
- }
+ h = NCSemanticSynonymsProcessor.prepare(cfg, stemmer, macros, elements)
- override def stop(): Unit = sortedSyns = null
+ override def stop(): Unit = h = null
override def parse(req: NCRequest, cfg: NCModelConfig, toksList:
JList[NCToken]): JList[NCEntity] =
- val cache = mutable.HashSet.empty[Seq[Int]]
- val ents = mutable.ArrayBuffer.empty[NCEntity]
val toks = toksList.asScala.toSeq
+ val cache = mutable.HashSet.empty[Seq[Int]] // Variants (tokens
without stopwords) can be repeated.
+ val ents = mutable.ArrayBuffer.empty[NCEntity]
+
+ for (piece <- getPieces(toks); variant <- Seq(piece.baseTokens) ++
piece.variants)
+ def addEntity(elemId: String): Unit =
+ ents +=
+ new NCPropertyMapAdapter with NCEntity:
+ override def getTokens: JList[NCToken] =
piece.baseTokens.asJava
+ override def getRequestId: String = req.getRequestId
+ override def getId: String = elemId
- for (piece <- getPieces(toks); extra <- Seq(piece.main) ++ piece.extra)
- val idxs = toks.map(_.getIndex)
+ val idxs = variant.map(_.getIndex)
if cache.add(idxs) then
- for ((id, syns) <- sortedSyns.getOrElse(toks.size, Seq.empty))
- var found = false
-
- for (s <- syns if !found)
- if s.isMatch(toks) then
- found = true
- ents +=
- new NCPropertyMapAdapter with NCEntity:
- override def getTokens: JList[NCToken] =
piece.main.asJava
- override def getRequestId: String =
req.getRequestId
- override def getId: String = id
+ h.textSynonyms.get(variant.map(_.getStem).mkString(" ")) match
+ case Some(elemIds) => elemIds.foreach(addEntity)
+ case None =>
+ for ((elemId, syns) <-
h.mixedSynonyms.getOrElse(variant.size, Seq.empty))
+ var found = false
+
+ for (s <- syns if !found)
+ found =
+ s.chunks.zip(variant).
+ sortBy { case (chunk, _) => if
chunk.isText then 0 else 1 }.
+ forall { case (chunk, tok) =>
+ if chunk.isText then
+ chunk.stem == tok.getStem
+ else
+ def match0(txt: String) =
chunk.regex.matcher(txt).matches()
+
+ match0(tok.getText) ||
match0(tok.getText.toLowerCase)
+ }
+ if (found)
+ addEntity(elemId)
ents.toSeq.asJava
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonym.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonym.scala
new file mode 100644
index 0000000..fbdfe38
--- /dev/null
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonym.scala
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nlpcraft.nlp.entity.parser.semantic.impl
+
+import org.apache.nlpcraft.NCToken
+import
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticChunkKind.*
+
+import java.util.regex.Pattern
+
+/**
+ * Kinds of synonym chunk: `TEXT` or `REGEX`.
+ */
+private[impl] object NCSemanticChunkKind extends Enumeration:
+ type NCSemanticChunkKind = Value
+ val TEXT, REGEX = Value
+
+/**
+  * Single chunk of a synonym: either a plain word or a regular expression.
+  *
+  * @param kind Kind of synonym chunk.
+  * @param text Original text.
+  * @param stem Optional stem for a single word synonyms.
+  * @param regex Optional regex expression to match on.
+  */
+private[impl] case class NCSemanticSynonymChunk(
+    kind: NCSemanticChunkKind, text: String, stem: String = null, regex: Pattern = null
+) {
+    require(text != null && kind != null)
+
+    // Derived from the chunk kind. Testing `text != null` here would always be true because of the
+    // requirement above, which would make regex chunks indistinguishable from text chunks.
+    val isText: Boolean = kind == TEXT
+
+    override def toString = s"($text|$kind)"
+}
+
+/**
+  * Synonym as a non-empty sequence of chunks. Synonyms are ordered by the number of
+  * their non-text chunks.
+  *
+  * @param chunks Synonym chunks, must be non-null and non-empty.
+  * @param isElementId Element ID flag; cannot be combined with `isValueName` or `value`.
+  * @param isValueName Value name flag.
+  * @param value Optional value name.
+  */
+private[impl] case class NCSemanticSynonym(
+    chunks: Seq[NCSemanticSynonymChunk], isElementId: Boolean = false, isValueName: Boolean = false, value: String = null
+) extends Comparable[NCSemanticSynonym]:
+    require(chunks != null)
+    require(chunks.nonEmpty)
+    // An element ID synonym can carry neither the value name flag nor a value.
+    require(!(isElementId && (isValueName || value != null)))
+
+    final val size = chunks.length
+    private final val regexCount = chunks.count(_.kind != TEXT)
+    final val isText = regexCount == 0
+    // Space-joined chunk stems; defined only when every chunk is a text chunk.
+    final lazy val stem = if !isText then null else chunks.map(_.stem).mkString(" ")
+
+    override def compareTo(o: NCSemanticSynonym): Int = regexCount.compare(o.regexCount)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
new file mode 100644
index 0000000..3374d8d
--- /dev/null
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nlpcraft.nlp.entity.parser.semantic.impl
+
+import com.fasterxml.jackson.databind.*
+import com.fasterxml.jackson.dataformat.yaml.*
+import com.fasterxml.jackson.module.scala.DefaultScalaModule
+import org.apache.nlpcraft.*
+import org.apache.nlpcraft.internal.makro.NCMacroParser
+import org.apache.nlpcraft.nlp.entity.parser.semantic.*
+import
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticChunkKind.*
+
+import java.io.InputStream
+import java.util
+import java.util.List as JList
+import java.util.regex.*
+import scala.collection.mutable
+import scala.jdk.CollectionConverters.*
+
+/**
+ * Prepared synonyms split by matching strategy.
+ *
+ * @param textSynonyms Text-only synonyms: space-joined stems mapped to the IDs of the elements declaring them.
+ * @param mixedSynonyms Remaining synonyms: chunk count mapped to element ID mapped to that element's synonyms.
+ */
+private[impl] case class NCSemanticSynonymsHolder(
+ textSynonyms: Map[String, Set[String]],
+ mixedSynonyms: Map[Int, Map[String, Seq[NCSemanticSynonym]]]
+)
+
+/**
+  * Converts macros and semantic elements into the synonym lookup structures used for matching.
+  */
+private[impl] object NCSemanticSynonymsProcessor:
+    // Not referenced within this object at the moment.
+    private final val SUSP_SYNS_CHARS = Seq("?", "*", "+")
+    // Marker that wraps a regex synonym chunk on both sides.
+    private final val REGEX_FIX = "//"
+
+    private def validate(macros: Map[String, String], elements: Seq[NCSemanticElement]): Unit =
+        () // TODO:
+
+    private def startsAndEnds(fix: String, s: String): Boolean = s.startsWith(fix) && s.endsWith(fix)
+
+    /**
+      * Makes a chunk from a single word: a regex chunk when the word is wrapped in `//`,
+      * a stemmed text chunk otherwise.
+      *
+      * @param stemmer Stemmer applied to text chunks.
+      * @param chunk Word text.
+      * @throws NCException If the regex body is empty or is not a valid pattern.
+      */
+    private def mkChunk(stemmer: NCSemanticTextStemmer, chunk: String): NCSemanticSynonymChunk =
+        // Drops the marker from both ends of the string.
+        def stripSuffix(fix: String, s: String): String = s.slice(fix.length, s.length - fix.length)
+
+        // Regex synonym.
+        if startsAndEnds(REGEX_FIX, chunk) then
+            val ptrn = stripSuffix(REGEX_FIX, chunk)
+            if ptrn.nonEmpty then
+                try
+                    NCSemanticSynonymChunk(kind = REGEX, text = chunk, regex = Pattern.compile(ptrn))
+                catch
+                    case e: PatternSyntaxException =>
+                        throw new NCException(s"Invalid regex synonym syntax detected [" +
+                            s"chunk=$chunk" +
+                            s"]", e)
+            else
+                throw new NCException(s"Empty regex synonym detected [" +
+                    s"chunk=$chunk" +
+                    s"]")
+        else
+            NCSemanticSynonymChunk(kind = TEXT, text = chunk, stem = stemmer.stem(chunk))
+
+    /**
+      * Expands macros in all synonyms, tokenizes them into chunks and groups the result:
+      * text-only synonyms by their stem, all others by chunk count and element ID.
+      * Element IDs and value names are registered as synonyms too.
+      *
+      * @param cfg Model configuration, provides the tokenizer.
+      * @param stemmer Stemmer applied to text chunks.
+      * @param macros Macros as a name-to-body map.
+      * @param elements Semantic elements.
+      * @return Holder with both synonym groups.
+      * @throws NCException If a regex synonym chunk is empty or has invalid syntax.
+      */
+    def prepare(
+        cfg: NCModelConfig,
+        stemmer: NCSemanticTextStemmer,
+        macros: Map[String, String],
+        elements: Seq[NCSemanticElement]
+    ): NCSemanticSynonymsHolder =
+        validate(macros, elements)
+
+        val p = new NCMacroParser
+
+        for ((name, body) <- macros) p.addMacro(name, body)
+
+        case class Holder(synonym: NCSemanticSynonym, elementId: String)
+
+        val buf = mutable.ArrayBuffer.empty[Holder]
+
+        for (e <- elements)
+            val elemId = e.getId
+
+            def add(syns: Seq[NCSemanticSynonym]): Unit = buf ++= syns.map(Holder(_, elemId))
+            def convert(syns: JList[String]): Seq[Seq[NCSemanticSynonymChunk]] =
+                syns.asScala.flatMap(p.expand).
+                    map(t => cfg.getTokenizer.tokenize(cfg, t).asScala.map(w => mkChunk(stemmer, w.getText)).toSeq).toSeq
+            def mkSpecChunk(id: String): NCSemanticSynonymChunk = NCSemanticSynonymChunk(TEXT, id, stemmer.stem(id))
+
+            // TODO:
+            add(Seq(NCSemanticSynonym(Seq(mkSpecChunk(elemId)), isElementId = true)))
+
+            if e.getSynonyms != null then
+                // TODO: NCSynonym + trim for lines etc
+                add(convert(e.getSynonyms).map(chunks => NCSemanticSynonym(chunks)))
+
+            if e.getValues != null then
+                for (v <- e.getValues.asScala)
+                    add(Seq(NCSemanticSynonym(Seq(mkSpecChunk(v.getName)), isValueName = true, value = v.getName)))
+
+                    if (v.getSynonyms != null)
+                        add(convert(v.getSynonyms).map(chunks => NCSemanticSynonym(chunks, value = v.getName)))
+
+        // Single pass split. Filtering and then removing the filtered items from the buffer one by one
+        // would rescan the buffer for every removed item.
+        val (txtBuf, mixedBuf) = buf.partition(_.synonym.isText)
+
+        val txtSyns = txtBuf.groupBy(_.synonym.stem).map { (stem, hs) => stem -> hs.map(_.elementId).toSet }
+        val mixedSyns = mixedBuf.groupBy(_.synonym.size).
+            map { case (size, hs) => size -> hs.groupBy(_.elementId).map { (id, hs) => id -> hs.map(_.synonym).toSeq } }
+
+        NCSemanticSynonymsHolder(txtSyns, mixedSyns)
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonym.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonym.scala
deleted file mode 100644
index 0fe259b..0000000
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonym.scala
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nlpcraft.nlp.entity.parser.semantic.impl
-
-import org.apache.nlpcraft.NCToken
-import org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSynonymChunkKind._
-
-case class NCSynonym(
- isElementId: Boolean,
- isValueName: Boolean,
- value: String = null,
- chunks: Seq[NCSynonymChunk]
-) extends Comparable[NCSynonym]:
- private final val size = chunks.size
- private final val regexCount = size - chunks.count(_.kind == TEXT)
- private final val isText = regexCount == 0
-
- private lazy val stem = if isText then chunks.map(_.stem).mkString(" ")
else null
-
- def isMatch(toks: Seq[NCToken]): Boolean =
- size == toks.size && (
- if isText then
- stem == toks.map(_.getStem).mkString(" ")
- else
- chunks.zip(toks).
- sortBy { case (chunk, _) => if chunk.isText then 0 else 1
}.
- forall { (chunk, tok) =>
- if chunk.isText then
- chunk.stem == tok.getStem
- else
- def match0(txt: String) =
chunk.regex.matcher(txt).matches()
-
- match0(tok.getText) ||
match0(tok.getText.toLowerCase)
- }
- )
-
- override def compareTo(o: NCSynonym): Int = Integer.compare(regexCount,
o.regexCount)
-
-
-
-
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunk.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunk.scala
deleted file mode 100644
index fc782c9..0000000
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunk.scala
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nlpcraft.nlp.entity.parser.semantic.impl
-
-import
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSynonymChunkKind.NCSynonymChunkKind
-
-import java.util.regex.Pattern
-
-/**
- *
- * @param kind Kind of synonym chunk.
- * @param text Original text.
- * @param stem Optional stem for a single word synonyms.
- * @param pos Optional PoS tag to match on.
- * @param regex Optional regex expression to match on.
- */
-case class NCSynonymChunk(
- kind: NCSynonymChunkKind,
- text: String,
- stem: String = null, // Only for kind == TEXT.
- pos: String = null,
- regex: Pattern = null
-) {
- require(text != null)
- require(kind != null)
-
- val isText: Boolean = text != null
-
- override def toString = s"($text|$kind)"
-}
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserJsonSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserJsonSpec.scala
new file mode 100644
index 0000000..77a002b
--- /dev/null
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserJsonSpec.scala
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.nlp.entity.parser.semantic
+
+import org.apache.nlpcraft.*
+import org.apache.nlpcraft.internal.util.NCUtils
+import org.apache.nlpcraft.nlp.entity.parser.opennlp.NCOpenNlpEntityParser
+import org.apache.nlpcraft.nlp.entity.parser.semantic.impl.en.*
+import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
+import org.apache.nlpcraft.nlp.util.*
+import org.apache.nlpcraft.nlp.util.NCTestConfig.*
+import org.junit.jupiter.api.*
+
+import java.util
+import scala.collection.mutable
+import scala.concurrent.ExecutionContext
+import scala.jdk.CollectionConverters.*
+import scala.jdk.OptionConverters.RichOptional
+
+/**
+ * Runs the semantic entity parser configured from the `models/alarm_model.json` model
+ * and prints the entities it detects.
+ */
+class NCSemanticEntityParserJsonSpec:
+ private var parser: NCSemanticEntityParser = _
+
+ // Creates and starts a fresh parser before each test.
+ @BeforeEach
+ def start(): Unit =
+ parser =
+ NCTestUtils.makeAndStart(
+ new NCSemanticEntityParser(new NCEnSemanticTextStemmer,
"models/alarm_model.json")
+ )
+
+ // Parses the text into tokens, runs the entity parser on them and prints the result.
+ // NOTE(review): `expected` is not used and nothing is asserted, so this only checks that
+ // parsing completes without an exception - add an assertion once the expected IDs are known.
+ private def checkSingleEntity(txt: String, expected: String): Unit =
+ val req = NCTestRequest(txt)
+ val res = parser.parse(req, EN_MDL_CFG, EN_PARSER.parse(req,
EN_MDL_CFG)).asScala.toSeq
+
+ NCTestUtils.printEntities(txt, res)
+
+ @Test
+ def test(): Unit =
+ checkSingleEntity("Ping me in 3 minutes tomorrow", "test")
\ No newline at end of file
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserYamlSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserYamlSpec.scala
new file mode 100644
index 0000000..273f7d1
--- /dev/null
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserYamlSpec.scala
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.nlp.entity.parser.semantic
+
+import org.apache.nlpcraft.*
+import org.apache.nlpcraft.internal.util.NCUtils
+import org.apache.nlpcraft.nlp.entity.parser.opennlp.NCOpenNlpEntityParser
+import org.apache.nlpcraft.nlp.entity.parser.semantic.impl.en.*
+import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
+import org.apache.nlpcraft.nlp.util.*
+import org.apache.nlpcraft.nlp.util.NCTestConfig.*
+import org.junit.jupiter.api.*
+
+import java.util
+import scala.collection.mutable
+import scala.concurrent.ExecutionContext
+import scala.jdk.CollectionConverters.*
+import scala.jdk.OptionConverters.RichOptional
+/**
+ * Smoke test for `NCSemanticEntityParser` configured from the YAML model resource
+ * `models/lightswitch_model.yaml`.
+ *
+ * `start()` is annotated with `@BeforeEach`; it creates the parser with an
+ * `NCEnSemanticTextStemmer` and the model path, and hands it to `NCTestUtils.makeAndStart`.
+ *
+ * `checkSingleEntity` wraps `txt` in an `NCTestRequest`, feeds the result of
+ * `EN_PARSER.parse(req, EN_MDL_CFG)` (presumably the request tokens) to the semantic
+ * parser and passes the returned entities to `NCTestUtils.printEntities`.
+ *
+ * NOTE(review): the `expected` argument of `checkSingleEntity` is never read, so the
+ * test currently asserts nothing and only fails if parsing throws. Confirm whether
+ * an assertion on the detected entity is still to be added.
+ */
+class NCSemanticEntityParserYamlSpec:
+ private var parser: NCSemanticEntityParser = _
+
+ @BeforeEach
+ def start(): Unit =
+ parser =
+ NCTestUtils.makeAndStart(
+ new NCSemanticEntityParser(new NCEnSemanticTextStemmer,
"models/lightswitch_model.yaml")
+ )
+
+ private def checkSingleEntity(txt: String, expected: String): Unit =
+ val req = NCTestRequest(txt)
+ val res = parser.parse(req, EN_MDL_CFG, EN_PARSER.parse(req,
EN_MDL_CFG)).asScala.toSeq
+
+ NCTestUtils.printEntities(txt, res)
+
+ @Test
+ def test(): Unit =
+ checkSingleEntity("Turn the lights off in the entire house.", "test")
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunkKind.scala
b/nlpcraft/src/test/resources/models/alarm_model.json
similarity index 66%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunkKind.scala
rename to nlpcraft/src/test/resources/models/alarm_model.json
index aeceeee..b12bb30 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunkKind.scala
+++ b/nlpcraft/src/test/resources/models/alarm_model.json
@@ -6,7 +6,7 @@
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
- * https://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -15,14 +15,15 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.nlp.entity.parser.semantic.impl
-
-/**
- * Synonym element type.
- */
-object NCSynonymChunkKind extends Enumeration {
- type NCSynonymChunkKind = Value
-
- val TEXT: Value = Value // Simple word.
- val REGEX: Value = Value // RegEx match expression (//[abd]+//).
-}
+{
+ "elements": [
+ {
+ "id": "x:alarm",
+ "description": "Alarm token indicator.",
+ "synonyms": [
+ "{ping|buzz|wake|call|hit} {me|up|me up|_}",
+ "{set|_} {my|_} {wake|wake up|_} {alarm|timer|clock|buzzer|call} {clock|_} {up|_}"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/nlpcraft/src/test/resources/models/lightswitch_model.yaml
b/nlpcraft/src/test/resources/models/lightswitch_model.yaml
new file mode 100644
index 0000000..e063e25
--- /dev/null
+++ b/nlpcraft/src/test/resources/models/lightswitch_model.yaml
@@ -0,0 +1,52 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+macros:
+ "<ACTION>" : "{turn|switch|dial|let|set|get|put}"
+ "<KILL>" : "{shut|kill|stop|eliminate}"
+ "<ENTIRE_OPT>" : "{entire|full|whole|total|_}"
+ "<FLOOR_OPT>" :
"{upstairs|downstairs|{1st|first|2nd|second|3rd|third|4th|fourth|5th|fifth|top|ground}
floor|_}"
+ "<TYPE>" : "{room|closet|attic|loft|{store|storage} {room|_}}"
+ "<LIGHT>" : "{all|_} {it|them|light|illumination|lamp|lamplight}"
+
+elements:
+ - id: "ls:loc"
+ description: "Location of lights."
+ synonyms:
+ - "<ENTIRE_OPT> <FLOOR_OPT>
{kitchen|library|closet|garage|office|playroom|{dining|laundry|play} <TYPE>}"
+ - "<ENTIRE_OPT> <FLOOR_OPT> {master|kid|children|child|guest|_}
{bedroom|bathroom|washroom|storage} {<TYPE>|_}"
+ - "<ENTIRE_OPT> {house|home|building|{1st|first} floor|{2nd|second}
floor}"
+
+ - id: "ls:on"
+ groups:
+ - "act"
+ description: "Light switch ON action."
+ synonyms:
+ - "<ACTION> {on|up|_} <LIGHT> {on|up|_}"
+ - "<LIGHT> {on|up}"
+
+ - id: "ls:off"
+ groups:
+ - "act"
+ description: "Light switch OFF action."
+ synonyms:
+ - "<ACTION> <LIGHT> {off|out|down}"
+ - "{<ACTION>|<KILL>} {off|out|down} <LIGHT>"
+ - "<KILL> <LIGHT>"
+ - "<LIGHT> <KILL>"
+ - "{out|no|off|down} <LIGHT>"
+ - "<LIGHT> {out|off|down}"
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index 37f93be..c156505 100644
--- a/pom.xml
+++ b/pom.xml
@@ -105,6 +105,8 @@
<junit.ver>5.8.2</junit.ver>
<scalatest.ver>3.2.9</scalatest.ver>
<gson.ver>2.8.5</gson.ver>
+ <jackson.ver>2.13.1</jackson.ver>
+ <jackson.yaml.ver>2.13.1</jackson.yaml.ver>
<apache.opennlp.ver>1.9.4</apache.opennlp.ver>
<jmh.version>1.33</jmh.version>
@@ -170,6 +172,24 @@
</dependency>
<dependency>
+ <groupId>com.fasterxml.jackson.dataformat</groupId>
+ <artifactId>jackson-dataformat-yaml</artifactId>
+ <version>${jackson.ver}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-databind</artifactId>
+ <version>${jackson.ver}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>com.fasterxml.jackson.module</groupId>
+ <artifactId>jackson-module-scala_3</artifactId>
+ <version>${jackson.ver}</version>
+ </dependency>
+
+ <dependency>
<groupId>org.antlr</groupId>
<artifactId>antlr4-runtime</artifactId>
<version>${org.antlr4.ver}</version>