This is an automated email from the ASF dual-hosted git repository.
aradzinski pushed a commit to branch NLPCRAFT-472
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-472 by this push:
new 19e832b CR WIP
19e832b is described below
commit 19e832bacf3dbe688e729b58e80a8458ffe586f1
Author: Aaron Radzinski <[email protected]>
AuthorDate: Sun Jan 16 13:43:39 2022 -0800
CR WIP
---
.../main/scala/org/apache/nlpcraft/NCEntity.java | 7 +--
.../entity/parser/semantic/NCSemanticElement.java | 20 +--------
.../parser/semantic/NCSemanticEntityParser.java | 38 +++++++---------
.../entity/parser/semantic/NCSemanticStemmer.java | 2 +-
.../semantic/impl/NCSemanticEntityParserImpl.scala | 21 +++++----
.../semantic/impl/NCSemanticSourceReader.scala | 5 +--
.../impl/NCSemanticSynonymsProcessor.scala | 52 ++++++++--------------
.../parser/semantic/impl/en/NCEnPorterStemmer.java | 2 +-
.../semantic/NCSemanticEntityParserSpec.scala | 6 +--
9 files changed, 59 insertions(+), 94 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntity.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntity.java
index 22fc53d..91dfa73 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntity.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntity.java
@@ -17,7 +17,8 @@
package org.apache.nlpcraft;
-import java.util.List;
+import java.util.Collections;
+import java.util.*;
/**
*
@@ -37,10 +38,10 @@ public interface NCEntity extends NCPropertyMap {
String getRequestId();
/**
- *
+ *
* @return
*/
- default String getGroup() { return getId(); }
+ default Set<String> getGroups() { return Collections.singleton(getId()); }
/**
*
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElement.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElement.java
index 81b3f52..882734d 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElement.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticElement.java
@@ -36,24 +36,8 @@ public interface NCSemanticElement {
*
* @return
*/
- default List<String> getGroups() {
- return Collections.singletonList(getId());
- }
-
- /**
- * TODO: why do we need it?
- * @return
- */
- default boolean isMemberOf(String grp) {
- return getGroups().contains(grp);
- }
-
- /**
- * TODO: why do we need it?
- * @return
- */
- default String getDescription() {
- return null;
+ default Set<String> getGroups() {
+ return Collections.singleton(getId());
}
/**
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
index cb80fb0..880ec0e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
@@ -17,12 +17,7 @@
package org.apache.nlpcraft.nlp.entity.parser.semantic;
-import org.apache.nlpcraft.NCEntity;
-import org.apache.nlpcraft.NCEntityParser;
-import org.apache.nlpcraft.NCModelConfig;
-import org.apache.nlpcraft.NCRequest;
-import org.apache.nlpcraft.NCToken;
-import org.apache.nlpcraft.NCTokenParser;
+import org.apache.nlpcraft.*;
import
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticEntityParserImpl;
import java.util.Collections;
@@ -40,15 +35,15 @@ public class NCSemanticEntityParser implements
NCEntityParser {
*
* @param stemmer
* @param parser
- * @param elems
+ * @param elms
*/
- public NCSemanticEntityParser(NCSemanticStemmer stemmer, NCTokenParser
parser, List<NCSemanticElement> elems) {
- // TODO: error texts.
+ public NCSemanticEntityParser(NCSemanticStemmer stemmer, NCTokenParser
parser, List<NCSemanticElement> elms) {
Objects.requireNonNull(stemmer, "Stemmer cannot be null.");
Objects.requireNonNull(parser, "Parser cannot be null.");
- Objects.requireNonNull(elems, "Elements cannot be null.");
+ Objects.requireNonNull(elms, "Elements cannot be null.");
+ if (elms.size() == 0) throw new NCException("Element list cannot be empty.");
- impl = NCSemanticEntityParserImpl.apply(stemmer, parser,
Collections.emptyMap(), elems);
+ impl = NCSemanticEntityParserImpl.apply(stemmer, parser,
Collections.emptyMap(), elms);
}
/**
@@ -56,31 +51,28 @@ public class NCSemanticEntityParser implements
NCEntityParser {
* @param stemmer
* @param parser
* @param macros
- * @param elems
+ * @param elms
*/
- public NCSemanticEntityParser(
- NCSemanticStemmer stemmer, NCTokenParser parser, Map<String, String>
macros, List<NCSemanticElement> elems
- ) {
- // TODO: error texts.
+ public NCSemanticEntityParser(NCSemanticStemmer stemmer, NCTokenParser
parser, Map<String, String> macros, List<NCSemanticElement> elms) {
Objects.requireNonNull(stemmer, "Stemmer cannot be null.");
Objects.requireNonNull(parser, "Parser cannot be null.");
- Objects.requireNonNull(elems, "Elements cannot be null.");
+ Objects.requireNonNull(elms, "Elements cannot be null.");
+ if (elms.size() == 0) throw new NCException("Element list cannot be empty.");
- impl = NCSemanticEntityParserImpl.apply(stemmer, parser, macros,
elems);
+ impl = NCSemanticEntityParserImpl.apply(stemmer, parser, macros, elms);
}
/**
*
* @param stemmer
- * @param mdlSrc
+ * @param src
*/
- public NCSemanticEntityParser(NCSemanticStemmer stemmer, NCTokenParser
parser, String mdlSrc) {
- // TODO: error texts.
+ public NCSemanticEntityParser(NCSemanticStemmer stemmer, NCTokenParser
parser, String src) {
Objects.requireNonNull(stemmer, "Stemmer cannot be null.");
Objects.requireNonNull(parser, "Parser cannot be null.");
- Objects.requireNonNull(mdlSrc, "Source cannot be null.");
+ Objects.requireNonNull(src, "Source cannot be null.");
- impl = NCSemanticEntityParserImpl.apply(stemmer, parser, mdlSrc);
+ impl = NCSemanticEntityParserImpl.apply(stemmer, parser, src);
}
@Override
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticStemmer.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticStemmer.java
index dcf2fec..279e4f4 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticStemmer.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticStemmer.java
@@ -18,7 +18,7 @@
package org.apache.nlpcraft.nlp.entity.parser.semantic;
/**
- * TODO:
+ *
*/
public interface NCSemanticStemmer {
/**
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
index e373199..13402c0 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
@@ -40,35 +40,35 @@ object NCSemanticEntityParserImpl:
* @param stemmer
* @param parser
* @param macros
- * @param elems
+ * @param elms
* @return
*/
def apply(
stemmer: NCSemanticStemmer,
parser: NCTokenParser,
macros: Jmap[String, String],
- elems: JList[NCSemanticElement]
+ elms: JList[NCSemanticElement]
): NCSemanticEntityParserImpl =
- require(elems != null)
+ require(elms != null)
new NCSemanticEntityParserImpl(
stemmer,
parser,
macros = if macros == null then null else macros.asScala.toMap,
- elements = elems.asScala.toSeq
+ elements = elms.asScala.toSeq
)
/**
*
* @param stemmer
* @param parser
- * @param mdlSrc
+ * @param src
* @return
*/
- def apply(stemmer: NCSemanticStemmer, parser: NCTokenParser, mdlSrc:
String): NCSemanticEntityParserImpl =
- require(mdlSrc != null)
+ def apply(stemmer: NCSemanticStemmer, parser: NCTokenParser, src: String):
NCSemanticEntityParserImpl =
+ require(src != null)
- new NCSemanticEntityParserImpl(stemmer, parser, mdlSrc = mdlSrc,
scrType = NCSemanticSourceType.detect(mdlSrc))
+ new NCSemanticEntityParserImpl(stemmer, parser, mdlSrc = src, scrType
= NCSemanticSourceType.detect(src))
/**
* @param baseTokens Tokens.
@@ -161,6 +161,9 @@ class NCSemanticEntityParserImpl(
init()
+ /**
+ *
+ */
private def init(): Unit =
val (macros, elements, elemsMap) =
def toMap(elems: Seq[NCSemanticElement]): Map[String,
NCSemanticElement] = elems.map(p => p.getId -> p).toMap
@@ -182,7 +185,7 @@ class NCSemanticEntityParserImpl(
val stems = toks.map(p => p -> stemmer.stem(p.getText)).toMap
if toks.exists(_.getOpt[Boolean]("stopword").isEmpty) then
- logger.warn("Stopwords tokens enricher isn't configured.") //
TODO: warning text.
+ logger.warn("'stopword' property not found. Is stopword token
enricher configured?")
val cache = mutable.HashSet.empty[Seq[Int]] // Variants (tokens
without stopwords) can be repeated.
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSourceReader.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSourceReader.scala
index 8a0a003..fd53f67 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSourceReader.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSourceReader.scala
@@ -38,7 +38,7 @@ private[impl] object NCSemanticSourceType:
if lc.endsWith(".json") || lc.endsWith(".js") then JSON
else if lc.endsWith(".yaml") || lc.endsWith(".yml") then YAML
- else E("Unexpected data type. Expected `yaml` or `json` formats.") //
TODO: error text.
+ else E(s"Expected `yaml` or `json` formats, but got: $src")
/**
*
@@ -78,8 +78,7 @@ private[impl] object NCSemanticSourceReader:
else
new NCPropertyMapAdapter with NCSemanticElement:
override val getId: String = e.id
- override val getGroups: JList[String] = nvl(e.groups)
- override val getDescription: String = e.description
+ override val getGroups: JSet[String] = nvl(e.groups.toSet)
override val getValues: JMap[String, JSet[String]] =
nvlValues(e.values)
override val getSynonyms: JSet[String] = nvl(e.synonyms)
override val getProperties: JMap[String, AnyRef] =
nvlProperties(e.properties)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
index 2b57f71..23e7989 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
@@ -75,8 +75,8 @@ private[impl] object NCSemanticSynonymsProcessor extends
LazyLogging:
require(elements != null)
if macros != null then
- if hasNullOrEmpty(macros.keySet) then E("Some macro names are null
or empty.") // TODO: error text.
- if hasNullOrEmpty(macros.values) then E("Some macro bodies are
null or empty.") // TODO: error text.
+ if hasNullOrEmpty(macros.keySet) then E("Some macro names are null
or empty.")
+ if hasNullOrEmpty(macros.values) then E("Some macro bodies are
null or empty.")
val set = elements.filter(_.getSynonyms !=
null).flatMap(_.getSynonyms.asScala) ++ macros.values
@@ -87,10 +87,9 @@ private[impl] object NCSemanticSynonymsProcessor extends
LazyLogging:
// Ignore suspicious chars if regex is used in macro...
for ((name, value) <- macros if isSuspicious(name) ||
(isSuspicious(value) && !value.contains("//")))
- // TODO: error text.
logger.warn(
s"Suspicious macro definition (use of
${SUSP_SYNS_CHARS.map(s => s"'$s'").mkString(", ")} chars) [" +
- s"macro=$name" +
+ s"macro=$name" +
s"]"
)
@@ -103,16 +102,12 @@ private[impl] object NCSemanticSynonymsProcessor extends
LazyLogging:
private def checkSynonyms(syns: JSet[String], elemId: String, valueName:
Option[String] = None): Unit =
def mkDesc: String =
val valuePart = if valueName.isDefined then s",
value=${valueName.get}" else ""
-
s"[id=$elemId$valuePart]"
if syns != null then
- if hasNullOrEmpty(syns.asScala) then E(s"Some synonyms are null or
empty $mkDesc") // TODO: error text.
-
+ if hasNullOrEmpty(syns.asScala) then E(s"Some synonyms are null or
empty $mkDesc")
val susp = syns.asScala.filter(syn => !syn.contains("//") &&
SUSP_SYNS_CHARS.exists(susp => syn.contains(susp)))
-
if susp.nonEmpty then
- // TODO: error text.
logger.warn(
s"Suspicious synonyms detected (use of
${SUSP_SYNS_CHARS.map(s => s"'$s'").mkString(", ")} chars) $mkDesc"
)
@@ -121,30 +116,28 @@ private[impl] object NCSemanticSynonymsProcessor extends
LazyLogging:
* @param elems
*/
private def checkElements(elems: Seq[NCSemanticElement]): Unit =
- if elems == null || elems.isEmpty then E("Elements cannot be null or
empty.") // TODO: error text.
- if elems.contains(null) then E("Some elements are null.") // TODO:
error text.
+ if elems == null || elems.isEmpty then E("Elements cannot be null or
empty.")
+ if elems.contains(null) then E("Some elements are null.")
// Duplicates.
val ids = mutable.HashSet.empty[String]
for (id <- elems.map(_.getId))
- if ids.contains(id) then E(s"Duplicate element ID [element=$id]")
// TODO: error text.
+ if ids.contains(id) then E(s"Duplicate element ID [element=$id]")
else ids += id
for (e <- elems)
val elemId = e.getId
- if elemId == null || elemId.isEmpty then E(s"Some element IDs are
not provided or empty.") // TODO: error text.
- else if !elemId.matches(ID_REGEX) then E(s"Element ID does not
match regex [element=$elemId, regex=$ID_REGEX]") // TODO: error text.
- else if elemId.exists(_.isWhitespace) then E(s"Element ID cannot
have whitespaces [element=$elemId]") // TODO: error text.
+ if elemId == null || elemId.isEmpty then E(s"Some element IDs are
not provided or empty.")
+ else if !elemId.matches(ID_REGEX) then E(s"Element ID does not
match regex [element=$elemId, regex=$ID_REGEX]")
+ else if elemId.exists(_.isWhitespace) then E(s"Element ID cannot
have whitespaces [element=$elemId]")
checkSynonyms(e.getSynonyms, elemId)
val vals = e.getValues
-
if vals != null then
- if hasNullOrEmpty(vals.keySet().asScala) then E(s"Some values
names are null or empty [element=$elemId]") // TODO: error text.
-
+ if hasNullOrEmpty(vals.keySet().asScala) then E(s"Some values
names are null or empty [element=$elemId]")
for ((name, syns) <- vals.asScala)
checkSynonyms(syns, elemId, Some(name))
@@ -173,15 +166,13 @@ private[impl] object NCSemanticSynonymsProcessor extends
LazyLogging:
if ptrn.nonEmpty then
try NCSemanticSynonymChunk(REGEX, text, regex =
Pattern.compile(ptrn))
catch case e: PatternSyntaxException => E(s"Invalid regex
synonym syntax detected [element=$elemId, chunk=$text]", e)
- else E(s"Empty regex synonym detected [element=$elemId]") //
TODO: error text.
+ else E(s"Empty regex synonym detected [element=$elemId]")
val regexes = mutable.HashMap.empty[Int, RegexHolder]
def findRegex(t: NCToken): Option[RegexHolder] =
- if regexes.nonEmpty then
- (t.getStartCharIndex to
t.getEndCharIndex).flatMap(regexes.get).to(LazyList).headOption
- else
- None
+ if regexes.nonEmpty then (t.getStartCharIndex to
t.getEndCharIndex).flatMap(regexes.get).to(LazyList).headOption
+ else None
syns.asScala.flatMap(macroParser.expand).
map(syn => {
@@ -199,15 +190,13 @@ private[impl] object NCSemanticSynonymsProcessor extends
LazyLogging:
if ch.startsWith(REGEX_FIX) && ch.endsWith(REGEX_FIX) then
val r = RegexHolder(ch)
-
(start to end).foreach(regexes += _ -> r)
// Tokenizes synonym without regex chunks. Regex chunks are
used as is, without tokenization.
tokParser.tokenize(normSyn.mkString(" ")).asScala.flatMap(tok
=>
findRegex(tok) match
case Some(regex) =>
- if regex.used then
- None
+ if regex.used then None
else
regex.used = true
Some(regex.mkChunk())
@@ -237,8 +226,7 @@ private[impl] object NCSemanticSynonymsProcessor extends
LazyLogging:
val macroParser = new NCMacroParser
- if macros != null then
- for ((name, body) <- macros) macroParser.addMacro(name, body)
+ if macros != null then for ((name, body) <- macros)
macroParser.addMacro(name, body)
case class Holder(synonym: NCSemanticSynonym, elementId: String) {
lazy val root: String = synonym.chunks.map(p => if p.isText then
p.stem else p.text).mkString(" ")
@@ -273,16 +261,14 @@ private[impl] object NCSemanticSynonymsProcessor extends
LazyLogging:
if elemIds.size > 1 then
for (s <- hs.map(_.synonym).distinct)
- // TODO: error text.
logger.warn(
- s"Synonym is related to various elements " +
- s"[synonym='${s.chunks.mkString(" ")}'" +
- s", elements=${elemIds.mkString("{", ",", "}")}" +
+ s"Synonym appears in multiple elements [" +
+ s"synonym='${s.chunks.mkString(" ")}', " +
+ s"elements=${elemIds.mkString("{", ",", "}")}" +
s"]")
})
val txtBuf = buf.filter(_.synonym.isText)
-
val txtSyns =
txtBuf.groupBy(_.synonym.stem).
map { (stem, hs) =>
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/en/NCEnPorterStemmer.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/en/NCEnPorterStemmer.java
index 38aa159..8ec7f2a 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/en/NCEnPorterStemmer.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/en/NCEnPorterStemmer.java
@@ -21,7 +21,7 @@ import opennlp.tools.stemmer.PorterStemmer;
import org.apache.nlpcraft.nlp.entity.parser.semantic.NCSemanticStemmer;
/**
- * TODO:
+ *
*/
public class NCEnPorterStemmer implements NCSemanticStemmer {
/** */
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala
index 4b8288a..fb8bbaf 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala
@@ -48,13 +48,13 @@ case class NCSemanticTestElement(
props: Map[String, AnyRef] = Map.empty
) extends NCSemanticElement:
override def getId: String = id
- override def getGroups: JList[String] = groups.asJava
+ override def getGroups: JSet[String] = groups.toSet.asJava
override def getValues: JMap[String, JSet[String]] = values.map { (k, v)
=> k -> v.asJava}.asJava
override def getSynonyms: JSet[String] = synonyms.asJava
override def getProperties: JMap[String, Object] = props.asJava
/**
- *
+ *
*/
object NCSemanticTestElement:
def apply(id: String, synonyms: String*) = new NCSemanticTestElement(id,
synonyms = synonyms.toSet)
@@ -138,7 +138,7 @@ class NCSemanticEntityParserSpec:
ents.map(_.getId).sorted.zip(ids.sorted).foreach { case (eId, id) =>
require(eId == id) }
/**
- *
+ *
*/
@Test
def test(): Unit =