This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-472
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-472 by this push:
new 55bc0da WIP.
55bc0da is described below
commit 55bc0da823a43ed1e0a63fd2759daeb60f188f74
Author: Sergey Kamov <[email protected]>
AuthorDate: Wed Dec 29 23:26:37 2021 +0300
WIP.
---
.../semantic/impl/NCSemanticEntityParserImpl.scala | 42 ++++++++++++++++------
1 file changed, 32 insertions(+), 10 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
index 5ebd99e..1148690 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
@@ -41,11 +41,12 @@ object NCSemanticEntityParserImpl:
private final val SUSP_SYNS_CHARS = Seq("?", "*", "+")
private final val REGEX_FIX = "//"
+ /**
+ * @param main Tokens.
+ * @param extra Variants of the main tokens with stopwords removed.
+ */
private case class Piece(main: Seq[NCToken], extra: Seq[Seq[NCToken]])
- private def combos[T](toks: Seq[T]): Seq[Seq[T]] =
- (for (n <- toks.size until 0 by -1) yield toks.sliding(n)).flatten.map(p => p)
-
private def startsAndEnds(fix: String, s: String): Boolean =
s.startsWith(fix) && s.endsWith(fix)
private def mkChunk(stemmer: NCStemmer, chunk: String): NCSynonymChunk =
def stripSuffix(fix: String, s: String): String = s.slice(fix.length,
s.length - fix.length)
@@ -67,9 +68,22 @@ object NCSemanticEntityParserImpl:
s"]")
else
NCSynonymChunk(kind = TEXT, text = chunk, stem = stemmer.stem(chunk))
-
+ /**
+ *
+ * 1. Prepares combinations of tokens (sliding windows).
+ * Example: 'A B C' -> {'A B C', 'A B', 'B C', 'A', 'B', 'C'}
+ * One three-token sentence is converted into 6 pieces.
+ *
+ * 2. Additionally, each piece is expanded into a set of variants covering
+ * all possible removals of its stopwords.
+ * Example: the piece 'x1, x2(stopword), x3(stopword), x4' will be expanded into
+ * {'x1, x2, x3, x4', 'x1, x2, x4', 'x1, x3, x4', 'x1, x4'}
+ *
+ * 3. All variants are collected and duplicate sets are removed.
+ *
+ * @param toks Tokens.
+ */
private def getPieces(toks: Seq[NCToken]): Seq[Piece] =
- combos(toks).map(combo => {
+ (for (n <- toks.size until 0 by -1) yield toks.sliding(n)).flatten.map(p => p).map(combo => {
val stops = combo.filter(s => s.isStopWord && s != combo.head && s != combo.last)
val slides = mutable.ArrayBuffer.empty[mutable.ArrayBuffer[NCToken]]
@@ -101,12 +115,21 @@ object NCSemanticEntityParserImpl:
stops4Delete = stops4Delete.filter(seq =>
!seq.contains(combo.head) && !seq.contains(combo.last))
- Piece(combo, stops4Delete.map(del => combo.filter(t => !del.contains(t))).filter(_.nonEmpty))
+ Piece(
+ combo,
+ stops4Delete.
+ map(_.toSet).
+ map(del => combo.filter(t => !del.contains(t))).filter(_.nonEmpty).sortBy(-_.size)
+ )
})
import org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticEntityParserImpl.*
-class NCSemanticEntityParserImpl(stemmer: NCStemmer, macros: Map[String, String], elements: Seq[NCSemanticElement]) extends NCEntityParser with LazyLogging:
+class NCSemanticEntityParserImpl(
+ stemmer: NCStemmer,
+ macros: Map[String, String],
+ elements: Seq[NCSemanticElement]
+) extends NCEntityParser with LazyLogging:
private var sortedSyns: Map[Int, Map[String, Seq[NCSynonym]]] = _
override def start(cfg: NCModelConfig): Unit =
@@ -143,9 +166,8 @@ class NCSemanticEntityParserImpl(stemmer: NCStemmer, macros: Map[String, String]
)
sortedSyns =
- buf.groupBy(_.synonyms.size).map {
- (len, hs) =>
- len -> hs.groupBy(_.elemId).map { case (id, seq) => id -> seq.flatMap(_.synonyms).toSeq.sorted }
+ buf.groupBy(_.synonyms.size).map { (len, hs) =>
+ len -> hs.groupBy(_.elemId).map { case (id, seq) => id -> seq.flatMap(_.synonyms).toSeq.sorted }
}
override def stop(): Unit = sortedSyns = null
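
For illustration, here is a minimal, self-contained Scala sketch of the piece-generation idea described in the getPieces() doc comment above. It is not part of the commit: Token, Piece, combos and pieces below are simplified, hypothetical stand-ins for NCToken and the private helpers in NCSemanticEntityParserImpl, stopword detection is reduced to a boolean flag, and the duplicate elimination mentioned in step 3 is omitted.

// Simplified stand-ins (for illustration only, not the nlpcraft API).
case class Token(text: String, isStopWord: Boolean = false)
case class Piece(main: Seq[Token], extra: Seq[Seq[Token]])

// Step 1: sliding combinations, longest first.
// 'A B C' -> 'A B C', 'A B', 'B C', 'A', 'B', 'C'.
def combos[T](toks: Seq[T]): Seq[Seq[T]] =
    (for (n <- toks.size until 0 by -1) yield toks.sliding(n)).flatten.toSeq

// Step 2: for every combination, build variants with each non-empty subset of its
// inner stopwords removed (the first and last tokens of a combination are kept).
// Note: this demo relies on structural equality of Token; the real parser works
// with distinct NCToken instances.
def pieces(toks: Seq[Token]): Seq[Piece] =
    combos(toks).map(combo => {
        val stops = combo.filter(t => t.isStopWord && t != combo.head && t != combo.last)
        val variants =
            (1 to stops.size).
                flatMap(n => stops.combinations(n)).
                map(del => combo.filterNot(del.toSet)).
                filter(_.nonEmpty).
                sortBy(-_.size)
        Piece(combo, variants)
    })

@main def demo(): Unit =
    val toks = Seq(Token("x1"), Token("x2", true), Token("x3", true), Token("x4"))
    // For the longest combination this prints 'x1 x3 x4', 'x1 x2 x4' and 'x1 x4'.
    pieces(toks).head.extra.foreach(v => println(v.map(_.text).mkString(" ")))

The sortBy(-_.size) mirrors the ordering added in this commit; presumably it lets the larger (more specific) stopword-reduced variants be tried before the smaller ones.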