[incubator-nlpcraft] branch NLPCRAFT-472 updated: CR WIP

aradzinski Sun, 16 Jan 2022 13:43:50 -0800

This is an automated email from the ASF dual-hosted git repository.

aradzinski pushed a commit to branch NLPCRAFT-472
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git



The following commit(s) were added to refs/heads/NLPCRAFT-472 by this push:
     new 19e832b  CR WIP
19e832b is described below

commit 19e832bacf3dbe688e729b58e80a8458ffe586f1
Author: Aaron Radzinski <[email protected]>
AuthorDate: Sun Jan 16 13:43:39 2022 -0800

    CR WIP
---
 .../main/scala/org/apache/nlpcraft/NCEntity.java   |  7 +--
 .../entity/parser/semantic/NCSemanticElement.java  | 20 +--------
 .../parser/semantic/NCSemanticEntityParser.java    | 38 +++++++---------
 .../entity/parser/semantic/NCSemanticStemmer.java  |  2 +-
 .../semantic/impl/NCSemanticEntityParserImpl.scala | 21 +++++----
 .../semantic/impl/NCSemanticSourceReader.scala     |  5 +--
 .../impl/NCSemanticSynonymsProcessor.scala         | 52 ++++++++--------------
 .../parser/semantic/impl/en/NCEnPorterStemmer.java |  2 +-
 .../semantic/NCSemanticEntityParserSpec.scala      |  6 +--
 9 files changed, 59 insertions(+), 94 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntity.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntity.java
index 22fc53d..91dfa73 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntity.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntity.java
@@ -17,7 +17,8 @@
 
 package org.apache.nlpcraft;
 
-import java.util.List;
+import java.util.Collections;
+import java.util.*;
 
 /**
  *
@@ -37,10 +38,10 @@ public interface NCEntity extends NCPropertyMap {
     String getRequestId();
 
     /**
-     * 
+     *
      * @return
      */
-    default String getGroup() { return getId(); }
+    default Set<String> getGroups() { return Collections.singleton(getId()); }
 
     /**
      *
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElement.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElement.java
index 81b3f52..882734d 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElement.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElement.java
@@ -36,24 +36,8 @@ public interface NCSemanticElement {
      *
      * @return
      */
-    default List<String> getGroups() {
-        return Collections.singletonList(getId());
-    }
-
-    /**
-     * TODO: why do we need it?
-     * @return
-     */
-    default boolean isMemberOf(String grp) {
-        return getGroups().contains(grp);
-    }
-
-    /**
-     * TODO: why do we need it?
-     * @return
-     */
-    default String getDescription() {
-        return null;
+    default Set<String> getGroups() {
+        return Collections.singleton(getId());
     }
 
     /**
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
index cb80fb0..880ec0e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
@@ -17,12 +17,7 @@
 
 package org.apache.nlpcraft.nlp.entity.parser.semantic;
 
-import org.apache.nlpcraft.NCEntity;
-import org.apache.nlpcraft.NCEntityParser;
-import org.apache.nlpcraft.NCModelConfig;
-import org.apache.nlpcraft.NCRequest;
-import org.apache.nlpcraft.NCToken;
-import org.apache.nlpcraft.NCTokenParser;
+import org.apache.nlpcraft.*;
 import 
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticEntityParserImpl;
 
 import java.util.Collections;
@@ -40,15 +35,15 @@ public class NCSemanticEntityParser implements 
NCEntityParser {
      *
      * @param stemmer
      * @param parser
-     * @param elems
+     * @param elms
      */
-    public NCSemanticEntityParser(NCSemanticStemmer stemmer, NCTokenParser 
parser, List<NCSemanticElement> elems) {
-        // TODO: error texts.
+    public NCSemanticEntityParser(NCSemanticStemmer stemmer, NCTokenParser 
parser, List<NCSemanticElement> elms) {
         Objects.requireNonNull(stemmer, "Stemmer cannot be null.");
         Objects.requireNonNull(parser, "Parser cannot be null.");
-        Objects.requireNonNull(elems, "Elements cannot be null.");
+        Objects.requireNonNull(elms, "Elements cannot be null.");
+        if (elms.size() == 0) throw new NCException("Element list cannot be 
empty.");
 
-        impl = NCSemanticEntityParserImpl.apply(stemmer, parser, 
Collections.emptyMap(), elems);
+        impl = NCSemanticEntityParserImpl.apply(stemmer, parser, 
Collections.emptyMap(), elms);
     }
 
     /**
@@ -56,31 +51,28 @@ public class NCSemanticEntityParser implements 
NCEntityParser {
      * @param stemmer
      * @param parser
      * @param macros
-     * @param elems
+     * @param elms
      */
-    public NCSemanticEntityParser(
-        NCSemanticStemmer stemmer, NCTokenParser parser, Map<String, String> 
macros, List<NCSemanticElement> elems
-    ) {
-        // TODO: error texts.
+    public NCSemanticEntityParser(NCSemanticStemmer stemmer, NCTokenParser 
parser, Map<String, String> macros, List<NCSemanticElement> elms) {
         Objects.requireNonNull(stemmer, "Stemmer cannot be null.");
         Objects.requireNonNull(parser, "Parser cannot be null.");
-        Objects.requireNonNull(elems, "Elements cannot be null.");
+        Objects.requireNonNull(elms, "Elements cannot be null.");
+        if (elms.size() == 0) throw new NCException("Element list cannot be 
empty.");
 
-        impl = NCSemanticEntityParserImpl.apply(stemmer, parser, macros, 
elems);
+        impl = NCSemanticEntityParserImpl.apply(stemmer, parser, macros, elms);
     }
 
     /**
      *
      * @param stemmer
-     * @param mdlSrc
+     * @param src
      */
-    public NCSemanticEntityParser(NCSemanticStemmer stemmer, NCTokenParser 
parser, String mdlSrc) {
-        // TODO: error texts.
+    public NCSemanticEntityParser(NCSemanticStemmer stemmer, NCTokenParser 
parser, String src) {
         Objects.requireNonNull(stemmer, "Stemmer cannot be null.");
         Objects.requireNonNull(parser, "Parser cannot be null.");
-        Objects.requireNonNull(mdlSrc, "Source cannot be null.");
+        Objects.requireNonNull(src, "Source cannot be null.");
 
-        impl = NCSemanticEntityParserImpl.apply(stemmer, parser, mdlSrc);
+        impl = NCSemanticEntityParserImpl.apply(stemmer, parser, src);
     }
 
     @Override
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticStemmer.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticStemmer.java
index dcf2fec..279e4f4 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticStemmer.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticStemmer.java
@@ -18,7 +18,7 @@
 package org.apache.nlpcraft.nlp.entity.parser.semantic;
 
 /**
- * TODO:
+ * 
  */
 public interface NCSemanticStemmer {
     /**
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
index e373199..13402c0 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
@@ -40,35 +40,35 @@ object NCSemanticEntityParserImpl:
       * @param stemmer
       * @param parser
       * @param macros
-      * @param elems
+      * @param elms
       * @return
       */
     def apply(
         stemmer: NCSemanticStemmer,
         parser: NCTokenParser,
         macros: Jmap[String, String],
-        elems: JList[NCSemanticElement]
+        elms: JList[NCSemanticElement]
     ): NCSemanticEntityParserImpl =
-        require(elems != null)
+        require(elms != null)
 
         new NCSemanticEntityParserImpl(
             stemmer,
             parser,
             macros = if macros == null then null else macros.asScala.toMap,
-            elements = elems.asScala.toSeq
+            elements = elms.asScala.toSeq
         )
 
     /**
       *
       * @param stemmer
       * @param parser
-      * @param mdlSrc
+      * @param src
       * @return
       */
-    def apply(stemmer: NCSemanticStemmer, parser: NCTokenParser, mdlSrc: 
String): NCSemanticEntityParserImpl =
-        require(mdlSrc != null)
+    def apply(stemmer: NCSemanticStemmer, parser: NCTokenParser, src: String): 
NCSemanticEntityParserImpl =
+        require(src != null)
 
-        new NCSemanticEntityParserImpl(stemmer, parser, mdlSrc = mdlSrc, 
scrType = NCSemanticSourceType.detect(mdlSrc))
+        new NCSemanticEntityParserImpl(stemmer, parser, mdlSrc = src, scrType 
= NCSemanticSourceType.detect(src))
 
     /**
       * @param baseTokens Tokens.
@@ -161,6 +161,9 @@ class NCSemanticEntityParserImpl(
 
     init()
 
+    /**
+      *
+      */
     private def init(): Unit =
         val (macros, elements, elemsMap) =
             def toMap(elems: Seq[NCSemanticElement]): Map[String, 
NCSemanticElement] = elems.map(p => p.getId -> p).toMap
@@ -182,7 +185,7 @@ class NCSemanticEntityParserImpl(
         val stems = toks.map(p => p -> stemmer.stem(p.getText)).toMap
 
         if toks.exists(_.getOpt[Boolean]("stopword").isEmpty) then
-            logger.warn("Stopwords tokens enricher isn't configured.") // 
TODO: warning text.
+            logger.warn("'stopword' property not found. Is stopword token 
enricher configured?")
 
         val cache = mutable.HashSet.empty[Seq[Int]] // Variants (tokens 
without stopwords) can be repeated.
 
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSourceReader.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSourceReader.scala
index 8a0a003..fd53f67 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSourceReader.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSourceReader.scala
@@ -38,7 +38,7 @@ private[impl] object NCSemanticSourceType:
 
         if lc.endsWith(".json") || lc.endsWith(".js") then JSON
         else if lc.endsWith(".yaml") || lc.endsWith(".yml") then YAML
-        else E("Unexpected data type. Expected `yaml` or `json` formats.") // 
TODO: error text.
+        else E(s"Expected `yaml` or `json` formats, but got: $src")
 
 /**
   *
@@ -78,8 +78,7 @@ private[impl] object NCSemanticSourceReader:
         else
             new NCPropertyMapAdapter with NCSemanticElement:
                 override val getId: String = e.id
-                override val getGroups: JList[String] = nvl(e.groups)
-                override val getDescription: String = e.description
+                override val getGroups: JSet[String] = nvl(e.groups.toSet)
                 override val getValues: JMap[String, JSet[String]] = 
nvlValues(e.values)
                 override val getSynonyms: JSet[String] = nvl(e.synonyms)
                 override val getProperties: JMap[String, AnyRef] = 
nvlProperties(e.properties)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
index 2b57f71..23e7989 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
@@ -75,8 +75,8 @@ private[impl] object NCSemanticSynonymsProcessor extends 
LazyLogging:
         require(elements != null)
 
         if macros != null then
-            if hasNullOrEmpty(macros.keySet) then E("Some macro names are null 
or empty.") // TODO: error text.
-            if hasNullOrEmpty(macros.values) then E("Some macro bodies are 
null or empty.") // TODO: error text.
+            if hasNullOrEmpty(macros.keySet) then E("Some macro names are null 
or empty.")
+            if hasNullOrEmpty(macros.values) then E("Some macro bodies are 
null or empty.")
 
             val set = elements.filter(_.getSynonyms != 
null).flatMap(_.getSynonyms.asScala) ++ macros.values
 
@@ -87,10 +87,9 @@ private[impl] object NCSemanticSynonymsProcessor extends 
LazyLogging:
 
             // Ignore suspicious chars if regex is used in macro...
             for ((name, value) <- macros if isSuspicious(name) || 
(isSuspicious(value) && !value.contains("//")))
-                // TODO: error text.
                 logger.warn(
                     s"Suspicious macro definition (use of 
${SUSP_SYNS_CHARS.map(s => s"'$s'").mkString(", ")} chars) [" +
-                    s"macro=$name" +
+                        s"macro=$name" +
                     s"]"
                 )
 
@@ -103,16 +102,12 @@ private[impl] object NCSemanticSynonymsProcessor extends 
LazyLogging:
     private def checkSynonyms(syns: JSet[String], elemId: String, valueName: 
Option[String] = None): Unit =
         def mkDesc: String =
             val valuePart = if valueName.isDefined then s", 
value=${valueName.get}" else ""
-
             s"[id=$elemId$valuePart]"
 
         if syns != null then
-            if hasNullOrEmpty(syns.asScala) then E(s"Some synonyms are null or 
empty $mkDesc") // TODO: error text.
-
+            if hasNullOrEmpty(syns.asScala) then E(s"Some synonyms are null or 
empty $mkDesc")
             val susp = syns.asScala.filter(syn => !syn.contains("//") && 
SUSP_SYNS_CHARS.exists(susp => syn.contains(susp)))
-
             if susp.nonEmpty then
-                // TODO: error text.
                 logger.warn(
                     s"Suspicious synonyms detected (use of 
${SUSP_SYNS_CHARS.map(s => s"'$s'").mkString(", ")} chars) $mkDesc"
                 )
@@ -121,30 +116,28 @@ private[impl] object NCSemanticSynonymsProcessor extends 
LazyLogging:
       * @param elems
       */
     private def checkElements(elems: Seq[NCSemanticElement]): Unit =
-        if elems == null || elems.isEmpty then E("Elements cannot be null or 
empty.") // TODO: error text.
-        if elems.contains(null) then E("Some elements are null.") // TODO: 
error text.
+        if elems == null || elems.isEmpty then E("Elements cannot be null or 
empty.")
+        if elems.contains(null) then E("Some elements are null.")
 
         // Duplicates.
         val ids = mutable.HashSet.empty[String]
 
         for (id <- elems.map(_.getId))
-            if ids.contains(id) then E(s"Duplicate element ID [element=$id]") 
// TODO: error text.
+            if ids.contains(id) then E(s"Duplicate element ID [element=$id]")
             else ids += id
 
         for (e <- elems)
             val elemId = e.getId
 
-            if elemId == null || elemId.isEmpty then E(s"Some element IDs are 
not provided or empty.") // TODO: error text.
-            else if !elemId.matches(ID_REGEX) then E(s"Element ID does not 
match regex [element=$elemId, regex=$ID_REGEX]") // TODO: error text.
-            else if elemId.exists(_.isWhitespace) then E(s"Element ID cannot 
have whitespaces [element=$elemId]") // TODO: error text.
+            if elemId == null || elemId.isEmpty then E(s"Some element IDs are 
not provided or empty.")
+            else if !elemId.matches(ID_REGEX) then E(s"Element ID does not 
match regex [element=$elemId, regex=$ID_REGEX]")
+            else if elemId.exists(_.isWhitespace) then E(s"Element ID cannot 
have whitespaces [element=$elemId]")
 
             checkSynonyms(e.getSynonyms, elemId)
 
             val vals = e.getValues
-
             if vals != null then
-                if hasNullOrEmpty(vals.keySet().asScala) then E(s"Some values 
names are null or empty [element=$elemId]") // TODO: error text.
-
+                if hasNullOrEmpty(vals.keySet().asScala) then E(s"Some values 
names are null or empty [element=$elemId]")
                 for ((name, syns) <- vals.asScala)
                     checkSynonyms(syns, elemId, Some(name))
 
@@ -173,15 +166,13 @@ private[impl] object NCSemanticSynonymsProcessor extends 
LazyLogging:
                 if ptrn.nonEmpty then
                     try NCSemanticSynonymChunk(REGEX, text, regex = 
Pattern.compile(ptrn))
                     catch case e: PatternSyntaxException => E(s"Invalid regex 
synonym syntax detected [element=$elemId, chunk=$text]", e)
-                else E(s"Empty regex synonym detected [element=$elemId]") // 
TODO: error text.
+                else E(s"Empty regex synonym detected [element=$elemId]")
 
         val regexes = mutable.HashMap.empty[Int, RegexHolder]
 
         def findRegex(t: NCToken): Option[RegexHolder] =
-            if regexes.nonEmpty then
-                (t.getStartCharIndex to 
t.getEndCharIndex).flatMap(regexes.get).to(LazyList).headOption
-            else
-                None
+            if regexes.nonEmpty then (t.getStartCharIndex to 
t.getEndCharIndex).flatMap(regexes.get).to(LazyList).headOption
+            else None
 
         syns.asScala.flatMap(macroParser.expand).
             map(syn => {
@@ -199,15 +190,13 @@ private[impl] object NCSemanticSynonymsProcessor extends 
LazyLogging:
 
                     if ch.startsWith(REGEX_FIX) && ch.endsWith(REGEX_FIX) then
                         val r = RegexHolder(ch)
-
                         (start to end).foreach(regexes += _ -> r)
 
                 // Tokenizes synonym without regex chunks. Regex chunks are 
used as is, without tokenization.
                 tokParser.tokenize(normSyn.mkString(" ")).asScala.flatMap(tok 
=>
                     findRegex(tok) match
                         case Some(regex) =>
-                            if regex.used then
-                                None
+                            if regex.used then None
                             else
                                 regex.used = true
                                 Some(regex.mkChunk())
@@ -237,8 +226,7 @@ private[impl] object NCSemanticSynonymsProcessor extends 
LazyLogging:
 
         val macroParser = new NCMacroParser
 
-        if macros != null then
-            for ((name, body) <- macros) macroParser.addMacro(name, body)
+        if macros != null then for ((name, body) <- macros) 
macroParser.addMacro(name, body)
 
         case class Holder(synonym: NCSemanticSynonym, elementId: String) {
             lazy val root: String = synonym.chunks.map(p => if p.isText then 
p.stem else p.text).mkString(" ")
@@ -273,16 +261,14 @@ private[impl] object NCSemanticSynonymsProcessor extends 
LazyLogging:
 
             if elemIds.size > 1 then
                 for (s <- hs.map(_.synonym).distinct)
-                    // TODO: error text.
                     logger.warn(
-                        s"Synonym is related to various elements " +
-                        s"[synonym='${s.chunks.mkString(" ")}'" +
-                        s", elements=${elemIds.mkString("{", ",", "}")}" +
+                        s"Synonym appears in multiple elements [" +
+                            s"synonym='${s.chunks.mkString(" ")}', " +
+                            s"elements=${elemIds.mkString("{", ",", "}")}" +
                         s"]")
         })
 
         val txtBuf = buf.filter(_.synonym.isText)
-
         val txtSyns =
             txtBuf.groupBy(_.synonym.stem).
             map { (stem, hs) =>
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/en/NCEnPorterStemmer.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/en/NCEnPorterStemmer.java
index 38aa159..8ec7f2a 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/en/NCEnPorterStemmer.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/en/NCEnPorterStemmer.java
@@ -21,7 +21,7 @@ import opennlp.tools.stemmer.PorterStemmer;
 import org.apache.nlpcraft.nlp.entity.parser.semantic.NCSemanticStemmer;
 
 /**
- * TODO:
+ * 
  */
 public class NCEnPorterStemmer implements NCSemanticStemmer {
     /** */
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala
index 4b8288a..fb8bbaf 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala
@@ -48,13 +48,13 @@ case class NCSemanticTestElement(
     props: Map[String, AnyRef] = Map.empty
 ) extends NCSemanticElement:
     override def getId: String = id
-    override def getGroups: JList[String] = groups.asJava
+    override def getGroups: JSet[String] = groups.toSet.asJava
     override def getValues: JMap[String, JSet[String]] = values.map { (k, v) 
=> k -> v.asJava}.asJava
     override def getSynonyms: JSet[String] = synonyms.asJava
     override def getProperties: JMap[String, Object] = props.asJava
 
 /**
-  * 
+  *
   */
 object NCSemanticTestElement:
     def apply(id: String, synonyms: String*) = new NCSemanticTestElement(id, 
synonyms = synonyms.toSet)
@@ -138,7 +138,7 @@ class NCSemanticEntityParserSpec:
         ents.map(_.getId).sorted.zip(ids.sorted).foreach { case (eId, id) => 
require(eId == id) }
 
     /**
-      * 
+      *
       */
     @Test
     def test(): Unit =

[incubator-nlpcraft] branch NLPCRAFT-472 updated: CR WIP

Reply via email to