This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-472
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-472 by this push:
new 84d9d73 WIP.
84d9d73 is described below
commit 84d9d739d3d06e0712cb437fae74e3e0c0deab57
Author: Sergey Kamov <[email protected]>
AuthorDate: Thu Dec 30 19:23:50 2021 +0300
WIP.
---
.../semantic/impl/NCSemanticDataReader.scala | 16 ++-
.../semantic/impl/NCSemanticEntityParserImpl.scala | 4 +-
.../impl/NCSemanticSynonymsProcessor.scala | 124 +++++++++++++++++----
3 files changed, 108 insertions(+), 36 deletions(-)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticDataReader.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticDataReader.scala
index f7c1df5..5125392 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticDataReader.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticDataReader.scala
@@ -58,15 +58,16 @@ private[impl] object NCSemanticDataReader:
)
case class Source(macros: Map[String, String], elements: Seq[Element])
- private def emptyList[T]: JList[T] = util.Collections.emptyList()
- private def nvl[T](seq: Seq[T]): JList[T] = if seq == null then emptyList
else seq.asJava
- private def nvlConvert[T, R](seq: Seq[T], to: T => R): JList[R] = if seq
== null then emptyList else seq.map(to).asJava
+ private def nvl[T](seq: Seq[T]): JList[T] = if seq == null then null else
seq.asJava
+ private def nvlConvert[T, R](seq: Seq[T], to: T => R): JList[R] = if seq
== null then null else seq.map(to).asJava
private def convertValue(v: Value) =
- new NCSemanticElementValue:
+ if v == null then null
+ else new NCSemanticElementValue:
override def getName: String = v.name
override def getSynonyms: JList[String] = nvl(v.synonyms)
private def convertElement(e: Element) =
- new NCSemanticElement:
+ if e == null then null
+ else new NCSemanticElement:
override def getId: String = e.id
override def getGroups: JList[String] = nvl(e.groups)
override def getDescription: String = e.description
@@ -93,7 +94,4 @@ private[impl] object NCSemanticDataReader:
val src = mapper.readValue(is, classOf[Source])
- NCSemanticData(
- if src.macros == null then Map.empty else src.macros,
- if src.elements == null then Seq.empty else
src.elements.map(convertElement)
- )
\ No newline at end of file
+ NCSemanticData(src.macros, if src.elements == null then null else
src.elements.map(convertElement))
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
index 5f355e7..2540ec1 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
@@ -37,9 +37,7 @@ object NCSemanticEntityParserImpl:
require(elems != null)
new NCSemanticEntityParserImpl(
- stemmer,
- macros = if macros == null then Map.empty else
macros.asScala.toMap,
- elements = elems.asScala.toSeq
+ stemmer, macros = if macros == null then null else
macros.asScala.toMap, elements = elems.asScala.toSeq
)
def apply(stemmer: NCSemanticTextStemmer, mdlFile: File):
NCSemanticEntityParserImpl =
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
index e296472..b92a9f8 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
@@ -23,8 +23,8 @@ import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.makro.NCMacroParser
import org.apache.nlpcraft.nlp.entity.parser.semantic.*
import
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticChunkKind.*
-
import com.typesafe.scalalogging.LazyLogging
+
import java.io.InputStream
import java.util
import java.util.List as JList
@@ -43,23 +43,99 @@ private[impl] case class NCSemanticSynonymsHolder(
private[impl] object NCSemanticSynonymsProcessor extends LazyLogging:
private final val SUSP_SYNS_CHARS = Seq("?", "*", "+")
private final val REGEX_FIX = "//"
+ private final val ID_REGEX = "^[_a-zA-Z]+[a-zA-Z0-9:\\-_]*$"
+
+ /**
+ *
+ * @param macros
+ * @param elements
+ */
+ private def checkMacros(macros: Map[String, String], elements:
Seq[NCSemanticElement]): Unit =
+ if macros != null then
+ if macros.contains(null) then throw new NCException("Some macro
are null")
+
+ val set = elements.filter(_.getSynonyms !=
null).flatMap(_.getSynonyms.asScala) ++ macros.values
+
+ for (makro <- macros.keys if !set.exists(_.contains(makro)))
+ logger.warn(s"Unused macro detected [macro=$makro]")
+
+ def isSuspicious(s: String): Boolean =
SUSP_SYNS_CHARS.exists(s.contains)
+
+ for ((mkName, mkVal) <- macros)
+ // Ignore suspicious chars if regex is used in macro...
+ if isSuspicious(mkName) || (isSuspicious(mkVal) &&
!mkVal.contains("//")) then
+ logger.warn(s"Suspicious macro definition (use of
${SUSP_SYNS_CHARS.map(s => s"'$s'").mkString(", ")} chars) [" +
+ s"macro=$mkName" +
+ s"]")
+
+ /**
+ *
+ * @param s
+ * @return
+ */
+ private def hasWhitespace(s: String): Boolean = s.exists(_.isWhitespace)
+
+ /**
+ *
+ * @param syns
+ * @param elemId
+ * @param valueName
+ */
+ private def checkSynonyms(syns: JList[String], elemId: String, valueName:
Option[String] = None): Unit =
+ if syns != null then
+ if syns.contains(null) then throw new NCException(
+ "Some synonyms are null[" +
+ s"id=$elemId, " +
+ (if valueName.isDefined then s"value=${valueName.get}, " else
"") +
+ "]"
+ )
+
+ val susp = syns.asScala.filter(syn => !syn.contains("//") &&
SUSP_SYNS_CHARS.exists(susp => syn.contains(susp)))
+
+ if susp.nonEmpty then
+ logger.warn(
+ s"Suspicious synonyms detected (use of
${SUSP_SYNS_CHARS.map(s => s"'$s'").mkString(", ")} chars) [" +
+ s"id=$elemId, " +
+ (if valueName.isDefined then s"value=${valueName.get}, "
else "") +
+ s"synonyms=[${susp.mkString(", ")}]" +
+ s"]"
+ )
+
+ /**
+ *
+ * @param elements
+ */
+ private def checkElements(elements: Seq[NCSemanticElement]): Unit =
+ if elements == null || elements.isEmpty then throw new
NCException("Elements cannot be null or empty")
+ if elements.contains(null) then throw new NCException("Some elements
are null")
+
+ // Duplicates.
+ val ids = mutable.HashSet.empty[String]
- // TODO: extend.
- private def validate(macros: Map[String, String], elements:
Seq[NCSemanticElement]): Unit =
- if (elements == null || elements.isEmpty)
- throw new NCException("Elements cannot be empty") // TODO:
+ for (id <- elements.map(_.getId))
+ if ids.contains(id) then throw new NCException(s"Duplicate element
ID [element=$id]")
+ else ids += id
for (e <- elements)
- if (e.getSynonyms != null)
- val susp = e.getSynonyms.asScala.filter(syn =>
!syn.contains("//") && SUSP_SYNS_CHARS.exists(susp => syn.contains(susp)))
+ val elemId = e.getId
+
+ if elemId == null || elemId.isEmpty then
+ throw new NCException(s"Some element IDs are not provided or
empty")
+ else if !elemId.matches(ID_REGEX) then
+ throw new NCException(s"Element ID does not match regex
[element=$elemId, regex=$ID_REGEX]")
+ else if hasWhitespace(elemId) then
+ throw new NCException(s"Element ID cannot have whitespaces
[element=$elemId]")
+
+ checkSynonyms(e.getSynonyms, elemId)
+
+ if e.getValues != null then
+ for (v <- e.getValues.asScala)
+ val name = v.getName
- if susp.nonEmpty then
- logger.warn(
- s"Suspicious synonyms detected (use of
${SUSP_SYNS_CHARS.map(s => s"'$s'").mkString(", ")} chars) [" +
- s"elementId=${e.getId}, " +
- s"synonyms=[${susp.mkString(", ")}]" +
- s"]"
- )
+ if name == null || name.isEmpty then
+ throw new NCException(s"Some value names are not
provided or empty [element=$elemId]")
+
+ checkSynonyms(v.getSynonyms, elemId, Some(name))
private def startsAndEnds(fix: String, s: String): Boolean =
s.startsWith(fix) && s.endsWith(fix)
private def mkChunk(stemmer: NCSemanticTextStemmer, chunk: String):
NCSemanticSynonymChunk =
@@ -73,13 +149,9 @@ private[impl] object NCSemanticSynonymsProcessor extends
LazyLogging:
NCSemanticSynonymChunk(kind = REGEX, text = chunk, regex =
Pattern.compile(ptrn))
catch
case e: PatternSyntaxException =>
- throw new NCException(s"Invalid regex synonym syntax
detected [" +
- s"chunk=$chunk" +
- s"]", e)
+ throw new NCException(s"Invalid regex synonym syntax
detected [chunk=$chunk]", e)
else
- throw new NCException(s"Empty regex synonym detected [" +
- s"chunk=$chunk" +
- s"]")
+ throw new NCException(s"Empty regex synonym detected
[chunk=$chunk]")
else
NCSemanticSynonymChunk(kind = TEXT, text = chunk, stem =
stemmer.stem(chunk))
@@ -89,7 +161,7 @@ private[impl] object NCSemanticSynonymsProcessor extends
LazyLogging:
* @param stemmer
* @param macros
* @param elements
- * @throws NCException // TODO
+ * @throws NCException
*/
def prepare(
cfg: NCModelConfig,
@@ -97,11 +169,15 @@ private[impl] object NCSemanticSynonymsProcessor extends
LazyLogging:
macros: Map[String, String],
elements: Seq[NCSemanticElement]
): NCSemanticSynonymsHolder =
- validate(macros, elements)
+ require(cfg != null && stemmer != null)
+
+ checkElements(elements)
+ checkMacros(macros, elements)
val p = new NCMacroParser
- for ((name, body) <- macros) p.addMacro(name, body)
+ if macros != null then
+ for ((name, body) <- macros) p.addMacro(name, body)
case class Holder(synonym: NCSemanticSynonym, elementId: String)
@@ -127,7 +203,7 @@ private[impl] object NCSemanticSynonymsProcessor extends
LazyLogging:
for (v <- e.getValues.asScala)
add(Seq(NCSemanticSynonym(Seq(mkSpecChunk(v.getName)),
isValueName = true, value = v.getName)))
- if (v.getSynonyms != null)
+ if v.getSynonyms != null then
add(convert(v.getSynonyms).map(chunks =>
NCSemanticSynonym(chunks, value = v.getName)))
val txtBuf = buf.filter(_.synonym.isText)