This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-472
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-472 by this push:
new 81d4b06 WIP.
81d4b06 is described below
commit 81d4b06a0e32d61aa87300bfeded7588c30858d6
Author: Sergey Kamov <[email protected]>
AuthorDate: Thu Dec 30 22:57:22 2021 +0300
WIP.
---
.../semantic/impl/NCSemanticEntityParserImpl.scala | 47 ++++++++++++++--------
.../impl/NCSemanticSynonymsProcessor.scala | 2 +-
2 files changed, 32 insertions(+), 17 deletions(-)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
index 44adac1..1a3dfbe 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
@@ -44,7 +44,7 @@ object NCSemanticEntityParserImpl:
require(stemmer != null)
require(mdlSrc != null)
- new NCSemanticEntityParserImpl(stemmer, res = mdlSrc, typ =
NCSemanticSourceType(mdlSrc))
+ new NCSemanticEntityParserImpl(stemmer, mdlSrc = mdlSrc, typ =
NCSemanticSourceType(mdlSrc))
/**
* @param baseTokens Tokens.
@@ -119,18 +119,18 @@ class NCSemanticEntityParserImpl(
stemmer: NCSemanticTextStemmer,
macros: Map[String, String] = null,
elements: Seq[NCSemanticElement] = null,
- res: String = null,
+ mdlSrc: String = null,
typ: NCSemanticSourceType = null
) extends NCEntityParser with LazyLogging:
require(stemmer != null)
- require(macros != null && elements != null || res != null && typ != null)
+ require(macros != null && elements != null || mdlSrc != null && typ !=
null)
@volatile private var h: NCSemanticSynonymsHolder = _
override def start(cfg: NCModelConfig): Unit =
val (macros, elements) =
- if res != null then
- val src = NCSemanticDataReader.read(new
BufferedInputStream(NCUtils.getStream(res)), typ)
+ if mdlSrc != null then
+ val src = NCSemanticDataReader.read(new
BufferedInputStream(NCUtils.getStream(mdlSrc)), typ)
(src.macros, src.elements)
else
(this.macros, this.elements)
@@ -142,21 +142,19 @@ class NCSemanticEntityParserImpl(
override def parse(req: NCRequest, cfg: NCModelConfig, toksList:
JList[NCToken]): JList[NCEntity] =
val toks = toksList.asScala.toSeq
val cache = mutable.HashSet.empty[Seq[Int]] // Variants (tokens
without stopwords) can be repeated.
- val ents = mutable.ArrayBuffer.empty[NCEntity]
+
+ case class Holder(elemId: String, tokens: Seq[NCToken])
+
+ val hs = mutable.ArrayBuffer.empty[Holder]
for (piece <- getPieces(toks); variant <- Seq(piece.baseTokens) ++
piece.variants)
- def addEntity(elemId: String): Unit =
- ents +=
- new NCPropertyMapAdapter with NCEntity:
- override def getTokens: JList[NCToken] =
piece.baseTokens.asJava
- override def getRequestId: String = req.getRequestId
- override def getId: String = elemId
+ def add(elemId: String): Unit = hs += Holder(elemId,
piece.baseTokens)
val idxs = variant.map(_.getIndex)
if cache.add(idxs) then
h.textSynonyms.get(variant.map(_.getStem).mkString(" ")) match
- case Some(elemIds) => elemIds.foreach(addEntity)
+ case Some(elemIds) => elemIds.foreach(add)
case None =>
for ((elemId, syns) <-
h.mixedSynonyms.getOrElse(variant.size, Seq.empty))
var found = false
@@ -171,6 +169,23 @@ class NCSemanticEntityParserImpl(
if chunk.isText then chunk.stem ==
tok.getStem
else match0(tok.getText) ||
match0(tok.getText.toLowerCase)
}
- if found then addEntity(elemId)
-
- ents.toSeq.asJava
\ No newline at end of file
+ if found then add(elemId)
+
+ val hsIdxs = hs.zipWithIndex
+
+    // Drops redundant holders: any holder whose token set is strictly contained
+    // in a larger holder's token set is removed, keeping only maximal matches.
+ hs --=
+ hs.zipWithIndex.filter { (h1, idx1) =>
+ hsIdxs.exists { (h2, idx2) =>
+ idx2 != idx1 &&
+ h2.tokens.size > h1.tokens.size &&
+ h1.tokens.forall(h2.tokens.contains)
+ }
+ }.map { (h, _) => h }
+
+ hs.toSeq.map(h =>
+ new NCPropertyMapAdapter with NCEntity:
+ override def getTokens: JList[NCToken] = h.tokens.asJava
+ override def getRequestId: String = req.getRequestId
+ override def getId: String = h.elemId
+ ).asJava
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
index 9577451..4a49dae 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
@@ -54,7 +54,7 @@ private[impl] object NCSemanticSynonymsProcessor extends
LazyLogging:
if macros != null then
// TODO: check empty.
if macros.contains(null) then throw new NCException("Some macro
names are null")
- if macros.values.contains(null) then throw new NCException("Some
macro bodies are null")
+ // if macros.values.contains(null) then throw new
NCException("Some macro bodies are null")
val set = elements.filter(_.getSynonyms !=
null).flatMap(_.getSynonyms.asScala) ++ macros.values