This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-472
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git


The following commit(s) were added to refs/heads/NLPCRAFT-472 by this push:
     new 84d9d73  WIP.
84d9d73 is described below

commit 84d9d739d3d06e0712cb437fae74e3e0c0deab57
Author: Sergey Kamov <[email protected]>
AuthorDate: Thu Dec 30 19:23:50 2021 +0300

    WIP.
---
 .../semantic/impl/NCSemanticDataReader.scala       |  16 ++-
 .../semantic/impl/NCSemanticEntityParserImpl.scala |   4 +-
 .../impl/NCSemanticSynonymsProcessor.scala         | 124 +++++++++++++++++----
 3 files changed, 108 insertions(+), 36 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticDataReader.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticDataReader.scala
index f7c1df5..5125392 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticDataReader.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticDataReader.scala
@@ -58,15 +58,16 @@ private[impl] object NCSemanticDataReader:
     )
     case class Source(macros: Map[String, String], elements: Seq[Element])
 
-    private def emptyList[T]: JList[T] = util.Collections.emptyList()
-    private def nvl[T](seq: Seq[T]): JList[T] = if seq == null then emptyList 
else seq.asJava
-    private def nvlConvert[T, R](seq: Seq[T], to: T => R): JList[R] = if seq 
== null then emptyList else seq.map(to).asJava
+    private def nvl[T](seq: Seq[T]): JList[T] = if seq == null then null else 
seq.asJava
+    private def nvlConvert[T, R](seq: Seq[T], to: T => R): JList[R] = if seq 
== null then null else seq.map(to).asJava
     private def convertValue(v: Value) =
-        new NCSemanticElementValue:
+        if v == null then null
+        else new NCSemanticElementValue:
             override def getName: String = v.name
             override def getSynonyms: JList[String] = nvl(v.synonyms)
     private def convertElement(e: Element) =
-        new NCSemanticElement:
+        if e == null then null
+        else new NCSemanticElement:
             override def getId: String = e.id
             override def getGroups: JList[String] = nvl(e.groups)
             override def getDescription: String = e.description
@@ -93,7 +94,4 @@ private[impl] object NCSemanticDataReader:
 
         val src = mapper.readValue(is, classOf[Source])
 
-        NCSemanticData(
-            if src.macros == null then Map.empty else src.macros,
-            if src.elements == null then Seq.empty else 
src.elements.map(convertElement)
-        )
\ No newline at end of file
+        NCSemanticData(src.macros, if src.elements == null then null else 
src.elements.map(convertElement))
\ No newline at end of file
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
index 5f355e7..2540ec1 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
@@ -37,9 +37,7 @@ object NCSemanticEntityParserImpl:
         require(elems != null)
 
         new NCSemanticEntityParserImpl(
-            stemmer,
-            macros = if macros == null then Map.empty else 
macros.asScala.toMap,
-            elements = elems.asScala.toSeq
+            stemmer, macros = if macros == null then null else 
macros.asScala.toMap, elements = elems.asScala.toSeq
         )
 
     def apply(stemmer: NCSemanticTextStemmer, mdlFile: File): 
NCSemanticEntityParserImpl =
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
index e296472..b92a9f8 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
@@ -23,8 +23,8 @@ import org.apache.nlpcraft.*
 import org.apache.nlpcraft.internal.makro.NCMacroParser
 import org.apache.nlpcraft.nlp.entity.parser.semantic.*
 import 
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticChunkKind.*
-
 import com.typesafe.scalalogging.LazyLogging
+
 import java.io.InputStream
 import java.util
 import java.util.List as JList
@@ -43,23 +43,99 @@ private[impl] case class NCSemanticSynonymsHolder(
 private[impl] object NCSemanticSynonymsProcessor extends LazyLogging:
     private final val SUSP_SYNS_CHARS = Seq("?", "*", "+")
     private final val REGEX_FIX = "//"
+    private final val ID_REGEX = "^[_a-zA-Z]+[a-zA-Z0-9:\\-_]*$"
+
+    /**
+      *
+      * @param macros
+      * @param elements
+      */
+    private def checkMacros(macros: Map[String, String], elements: 
Seq[NCSemanticElement]): Unit =
+        if macros != null then
+            if macros.contains(null) then throw new NCException("Some macro 
are null")
+
+            val set = elements.filter(_.getSynonyms != 
null).flatMap(_.getSynonyms.asScala) ++ macros.values
+
+            for (makro <- macros.keys if !set.exists(_.contains(makro)))
+                logger.warn(s"Unused macro detected [macro=$makro]")
+
+            def isSuspicious(s: String): Boolean = 
SUSP_SYNS_CHARS.exists(s.contains)
+
+            for ((mkName, mkVal) <- macros)
+                // Ignore suspicious chars if regex is used in macro...
+                if isSuspicious(mkName) || (isSuspicious(mkVal) && 
!mkVal.contains("//")) then
+                    logger.warn(s"Suspicious macro definition (use of 
${SUSP_SYNS_CHARS.map(s => s"'$s'").mkString(", ")} chars) [" +
+                        s"macro=$mkName" +
+                        s"]")
+
+    /**
+      *
+      * @param s
+      * @return
+      */
+    private def hasWhitespace(s: String): Boolean = s.exists(_.isWhitespace)
+
+    /**
+      *
+      * @param syns
+      * @param elemId
+      * @param valueName
+      */
+    private def checkSynonyms(syns: JList[String], elemId: String, valueName: 
Option[String] = None): Unit =
+        if syns != null then
+            if syns.contains(null) then throw new NCException(
+                "Some synonyms are null[" +
+                s"id=$elemId, " +
+                (if valueName.isDefined then s"value=${valueName.get}, " else 
"") +
+                "]"
+            )
+
+            val susp = syns.asScala.filter(syn => !syn.contains("//") && 
SUSP_SYNS_CHARS.exists(susp => syn.contains(susp)))
+
+            if susp.nonEmpty then
+                logger.warn(
+                    s"Suspicious synonyms detected (use of 
${SUSP_SYNS_CHARS.map(s => s"'$s'").mkString(", ")} chars) [" +
+                    s"id=$elemId, " +
+                    (if valueName.isDefined then s"value=${valueName.get}, " 
else "") +
+                    s"synonyms=[${susp.mkString(", ")}]" +
+                    s"]"
+                )
+
+    /**
+      *
+      * @param elements
+      */
+    private def checkElements(elements: Seq[NCSemanticElement]): Unit =
+        if elements == null || elements.isEmpty then throw new 
NCException("Elements cannot be null or empty")
+        if elements.contains(null) then throw new NCException("Some elements 
are null")
+
+        // Duplicates.
+        val ids = mutable.HashSet.empty[String]
 
-    // TODO: extend.
-    private def validate(macros: Map[String, String], elements: 
Seq[NCSemanticElement]): Unit =
-        if (elements == null || elements.isEmpty)
-            throw new NCException("Elements cannot be empty") // TODO:
+        for (id <- elements.map(_.getId))
+            if ids.contains(id) then throw new NCException(s"Duplicate element 
ID [element=$id]")
+            else ids += id
 
         for (e <- elements)
-            if (e.getSynonyms != null)
-                val susp = e.getSynonyms.asScala.filter(syn => 
!syn.contains("//") && SUSP_SYNS_CHARS.exists(susp => syn.contains(susp)))
+            val elemId = e.getId
+
+            if elemId == null || elemId.isEmpty then
+                throw new NCException(s"Some element IDs are not provided or 
empty")
+            else if !elemId.matches(ID_REGEX) then
+                throw new NCException(s"Element ID does not match regex 
[element=$elemId, regex=$ID_REGEX]")
+            else if hasWhitespace(elemId) then
+                throw new NCException(s"Element ID cannot have whitespaces 
[element=$elemId]")
+
+            checkSynonyms(e.getSynonyms, elemId)
+
+            if e.getValues != null then
+                for (v <- e.getValues.asScala)
+                    val name = v.getName
 
-                if susp.nonEmpty then
-                    logger.warn(
-                        s"Suspicious synonyms detected (use of 
${SUSP_SYNS_CHARS.map(s => s"'$s'").mkString(", ")} chars) [" +
-                            s"elementId=${e.getId}, " +
-                            s"synonyms=[${susp.mkString(", ")}]" +
-                            s"]"
-                    )
+                    if name == null || name.isEmpty then
+                        throw new NCException(s"Some value names are not 
provided or empty [element=$elemId]")
+
+                    checkSynonyms(v.getSynonyms, elemId, Some(name))
 
     private def startsAndEnds(fix: String, s: String): Boolean = 
s.startsWith(fix) && s.endsWith(fix)
     private def mkChunk(stemmer: NCSemanticTextStemmer, chunk: String): 
NCSemanticSynonymChunk =
@@ -73,13 +149,9 @@ private[impl] object NCSemanticSynonymsProcessor extends LazyLogging:
                     NCSemanticSynonymChunk(kind = REGEX, text = chunk, regex = 
Pattern.compile(ptrn))
                 catch
                     case e: PatternSyntaxException =>
-                        throw new NCException(s"Invalid regex synonym syntax 
detected [" +
-                            s"chunk=$chunk" +
-                            s"]", e)
+                        throw new NCException(s"Invalid regex synonym syntax 
detected [chunk=$chunk]", e)
             else
-                throw new NCException(s"Empty regex synonym detected [" +
-                    s"chunk=$chunk" +
-                    s"]")
+                throw new NCException(s"Empty regex synonym detected 
[chunk=$chunk]")
         else
             NCSemanticSynonymChunk(kind = TEXT, text = chunk, stem = 
stemmer.stem(chunk))
 
@@ -89,7 +161,7 @@ private[impl] object NCSemanticSynonymsProcessor extends LazyLogging:
       * @param stemmer
       * @param macros
       * @param elements
-      * @throws NCException // TODO
+      * @throws NCException
       */
     def prepare(
         cfg: NCModelConfig,
@@ -97,11 +169,15 @@ private[impl] object NCSemanticSynonymsProcessor extends LazyLogging:
         macros: Map[String, String],
         elements: Seq[NCSemanticElement]
     ): NCSemanticSynonymsHolder =
-        validate(macros, elements)
+        require(cfg != null && stemmer != null)
+
+        checkElements(elements)
+        checkMacros(macros, elements)
 
         val p = new NCMacroParser
 
-        for ((name, body) <- macros) p.addMacro(name, body)
+        if macros != null then
+            for ((name, body) <- macros) p.addMacro(name, body)
 
         case class Holder(synonym: NCSemanticSynonym, elementId: String)
 
@@ -127,7 +203,7 @@ private[impl] object NCSemanticSynonymsProcessor extends LazyLogging:
                 for (v <- e.getValues.asScala)
                     add(Seq(NCSemanticSynonym(Seq(mkSpecChunk(v.getName)), 
isValueName = true, value = v.getName)))
 
-                    if (v.getSynonyms != null)
+                    if v.getSynonyms != null then
                         add(convert(v.getSynonyms).map(chunks => 
NCSemanticSynonym(chunks, value = v.getName)))
 
         val txtBuf = buf.filter(_.synonym.isText)

Reply via email to