This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-472
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-472 by this push:
new 81d4b06 WIP.
81d4b06 is described below
commit 81d4b06a0e32d61aa87300bfeded7588c30858d6
Author: Sergey Kamov <[email protected]>
AuthorDate: Thu Dec 30 22:57:22 2021 +0300
WIP.
---
.../semantic/impl/NCSemanticEntityParserImpl.scala | 47 ++++++++++++++--------
.../impl/NCSemanticSynonymsProcessor.scala | 2 +-
2 files changed, 32 insertions(+), 17 deletions(-)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
index 44adac1..1a3dfbe 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
@@ -44,7 +44,7 @@ object NCSemanticEntityParserImpl:
require(stemmer != null)
require(mdlSrc != null)
- new NCSemanticEntityParserImpl(stemmer, res = mdlSrc, typ =
NCSemanticSourceType(mdlSrc))
+ new NCSemanticEntityParserImpl(stemmer, mdlSrc = mdlSrc, typ =
NCSemanticSourceType(mdlSrc))
/**
* @param baseTokens Tokens.
@@ -119,18 +119,18 @@ class NCSemanticEntityParserImpl(
stemmer: NCSemanticTextStemmer,
macros: Map[String, String] = null,
elements: Seq[NCSemanticElement] = null,
- res: String = null,
+ mdlSrc: String = null,
typ: NCSemanticSourceType = null
) extends NCEntityParser with LazyLogging:
require(stemmer != null)
- require(macros != null && elements != null || res != null && typ != null)
+ require(macros != null && elements != null || mdlSrc != null && typ !=
null)
@volatile private var h: NCSemanticSynonymsHolder = _
override def start(cfg: NCModelConfig): Unit =
val (macros, elements) =
- if res != null then
- val src = NCSemanticDataReader.read(new
BufferedInputStream(NCUtils.getStream(res)), typ)
+ if mdlSrc != null then
+ val src = NCSemanticDataReader.read(new
BufferedInputStream(NCUtils.getStream(mdlSrc)), typ)
(src.macros, src.elements)
else
(this.macros, this.elements)
@@ -142,21 +142,19 @@ class NCSemanticEntityParserImpl(
override def parse(req: NCRequest, cfg: NCModelConfig, toksList:
JList[NCToken]): JList[NCEntity] =
val toks = toksList.asScala.toSeq
val cache = mutable.HashSet.empty[Seq[Int]] // Variants (tokens
without stopwords) can be repeated.
- val ents = mutable.ArrayBuffer.empty[NCEntity]
+
+ case class Holder(elemId: String, tokens: Seq[NCToken])
+
+ val hs = mutable.ArrayBuffer.empty[Holder]
for (piece <- getPieces(toks); variant <- Seq(piece.baseTokens) ++
piece.variants)
- def addEntity(elemId: String): Unit =
- ents +=
- new NCPropertyMapAdapter with NCEntity:
- override def getTokens: JList[NCToken] =
piece.baseTokens.asJava
- override def getRequestId: String = req.getRequestId
- override def getId: String = elemId
+ def add(elemId: String): Unit = hs += Holder(elemId,
piece.baseTokens)
val idxs = variant.map(_.getIndex)
if cache.add(idxs) then
h.textSynonyms.get(variant.map(_.getStem).mkString(" ")) match
- case Some(elemIds) => elemIds.foreach(addEntity)
+ case Some(elemIds) => elemIds.foreach(add)
case None =>
for ((elemId, syns) <-
h.mixedSynonyms.getOrElse(variant.size, Seq.empty))
var found = false
@@ -171,6 +169,23 @@ class NCSemanticEntityParserImpl(
if chunk.isText then chunk.stem ==
tok.getStem
else match0(tok.getText) ||
match0(tok.getText.toLowerCase)
}
- if found then addEntity(elemId)
-
- ents.toSeq.asJava
\ No newline at end of file
+ if found then add(elemId)
+
+ val hsIdxs = hs.zipWithIndex
+
+    // Drops redundant holders: any holder whose token set is strictly contained
+    // in a larger holder's token set is removed, keeping only maximal matches.
+ hs --=
+ hs.zipWithIndex.filter { (h1, idx1) =>
+ hsIdxs.exists { (h2, idx2) =>
+ idx2 != idx1 &&
+ h2.tokens.size > h1.tokens.size &&
+ h1.tokens.forall(h2.tokens.contains)
+ }
+ }.map { (h, _) => h }
+
+ hs.toSeq.map(h =>
+ new NCPropertyMapAdapter with NCEntity:
+ override def getTokens: JList[NCToken] = h.tokens.asJava
+ override def getRequestId: String = req.getRequestId
+ override def getId: String = h.elemId
+ ).asJava
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
index 9577451..4a49dae 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
@@ -54,7 +54,7 @@ private[impl] object NCSemanticSynonymsProcessor extends
LazyLogging:
if macros != null then
// TODO: check empty.
if macros.contains(null) then throw new NCException("Some macro
names are null")
- if macros.values.contains(null) then throw new NCException("Some
macro bodies are null")
+ // if macros.values.contains(null) then throw new
NCException("Some macro bodies are null")
val set = elements.filter(_.getSynonyms !=
null).flatMap(_.getSynonyms.asScala) ++ macros.values