This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-472
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-472 by this push:
new 55bc0da WIP.
55bc0da is described below
commit 55bc0da823a43ed1e0a63fd2759daeb60f188f74
Author: Sergey Kamov <[email protected]>
AuthorDate: Wed Dec 29 23:26:37 2021 +0300
WIP.
---
.../semantic/impl/NCSemanticEntityParserImpl.scala | 42 ++++++++++++++++------
1 file changed, 32 insertions(+), 10 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
index 5ebd99e..1148690 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
@@ -41,11 +41,12 @@ object NCSemanticEntityParserImpl:
private final val SUSP_SYNS_CHARS = Seq("?", "*", "+")
private final val REGEX_FIX = "//"
+ /**
+ * @param main Tokens.
+ * @param extra Variants of the main tokens with stopwords removed.
+ */
private case class Piece(main: Seq[NCToken], extra: Seq[Seq[NCToken]])
- private def combos[T](toks: Seq[T]): Seq[Seq[T]] =
- (for (n <- toks.size until 0 by -1) yield toks.sliding(n)).flatten.map(p => p)
-
private def startsAndEnds(fix: String, s: String): Boolean =
s.startsWith(fix) && s.endsWith(fix)
private def mkChunk(stemmer: NCStemmer, chunk: String): NCSynonymChunk =
def stripSuffix(fix: String, s: String): String = s.slice(fix.length,
s.length - fix.length)
@@ -67,9 +68,22 @@ object NCSemanticEntityParserImpl:
s"]")
else
NCSynonymChunk(kind = TEXT, text = chunk, stem = stemmer.stem(chunk))
-
+ /**
+ *
+ * 1. Prepares combinations of tokens (sliding windows).
+ * Example: 'A B C' -> {'A B C', 'A B', 'B C', 'A', 'B', 'C'}
+ * One three-token sentence is converted into 6 pieces.
+ *
+ * 2. Additionally, each piece is expanded into a set of variants covering
+ * all possible removals of its stopwords.
+ * Example: the piece 'x1, x2(stopword), x3(stopword), x4' will be expanded into
+ * {'x1, x2, x3, x4', 'x1, x2, x4', 'x1, x3, x4', 'x1, x4'}
+ *
+ * 3. All variants are collected and duplicate sets are removed.
+ *
+ * @param toks Tokens.
+ */
private def getPieces(toks: Seq[NCToken]): Seq[Piece] =
- combos(toks).map(combo => {
+ (for (n <- toks.size until 0 by -1) yield toks.sliding(n)).flatten.map(p => p).map(combo => {
val stops = combo.filter(s => s.isStopWord && s != combo.head && s != combo.last)
val slides = mutable.ArrayBuffer.empty[mutable.ArrayBuffer[NCToken]]
@@ -101,12 +115,21 @@ object NCSemanticEntityParserImpl:
stops4Delete = stops4Delete.filter(seq =>
!seq.contains(combo.head) && !seq.contains(combo.last))
- Piece(combo, stops4Delete.map(del => combo.filter(t => !del.contains(t))).filter(_.nonEmpty))
+ Piece(
+ combo,
+ stops4Delete.
+ map(_.toSet).
+ map(del => combo.filter(t => !del.contains(t))).filter(_.nonEmpty).sortBy(-_.size)
+ )
})
import org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticEntityParserImpl.*
-class NCSemanticEntityParserImpl(stemmer: NCStemmer, macros: Map[String, String], elements: Seq[NCSemanticElement]) extends NCEntityParser with LazyLogging:
+class NCSemanticEntityParserImpl(
+ stemmer: NCStemmer,
+ macros: Map[String, String],
+ elements: Seq[NCSemanticElement]
+) extends NCEntityParser with LazyLogging:
private var sortedSyns: Map[Int, Map[String, Seq[NCSynonym]]] = _
override def start(cfg: NCModelConfig): Unit =
@@ -143,9 +166,8 @@ class NCSemanticEntityParserImpl(stemmer: NCStemmer, macros: Map[String, String]
)
sortedSyns =
- buf.groupBy(_.synonyms.size).map {
- (len, hs) =>
- len -> hs.groupBy(_.elemId).map { case (id, seq) => id -> seq.flatMap(_.synonyms).toSeq.sorted }
+ buf.groupBy(_.synonyms.size).map { (len, hs) =>
+ len -> hs.groupBy(_.elemId).map { case (id, seq) => id -> seq.flatMap(_.synonyms).toSeq.sorted }
}
override def stop(): Unit = sortedSyns = null
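
For illustration, here is a minimal, self-contained Scala sketch of the piece-generation idea described in the getPieces() doc comment above. It is not part of the commit: Token, Piece, combos and pieces below are simplified, hypothetical stand-ins for NCToken and the private helpers in NCSemanticEntityParserImpl, stopword detection is reduced to a boolean flag, and the duplicate elimination mentioned in step 3 is omitted.

// Simplified stand-ins (for illustration only, not the nlpcraft API).
case class Token(text: String, isStopWord: Boolean = false)
case class Piece(main: Seq[Token], extra: Seq[Seq[Token]])

// Step 1: sliding combinations, longest first.
// 'A B C' -> 'A B C', 'A B', 'B C', 'A', 'B', 'C'.
def combos[T](toks: Seq[T]): Seq[Seq[T]] =
    (for (n <- toks.size until 0 by -1) yield toks.sliding(n)).flatten.toSeq

// Step 2: for every combination, build variants with each non-empty subset of its
// inner stopwords removed (the first and last tokens of a combination are kept).
// Note: this demo relies on structural equality of Token; the real parser works
// with distinct NCToken instances.
def pieces(toks: Seq[Token]): Seq[Piece] =
    combos(toks).map(combo => {
        val stops = combo.filter(t => t.isStopWord && t != combo.head && t != combo.last)
        val variants =
            (1 to stops.size).
                flatMap(n => stops.combinations(n)).
                map(del => combo.filterNot(del.toSet)).
                filter(_.nonEmpty).
                sortBy(-_.size)
        Piece(combo, variants)
    })

@main def demo(): Unit =
    val toks = Seq(Token("x1"), Token("x2", true), Token("x3", true), Token("x4"))
    // For the longest combination this prints 'x1 x3 x4', 'x1 x2 x4' and 'x1 x4'.
    pieces(toks).head.extra.foreach(v => println(v.map(_.text).mkString(" ")))

The sortBy(-_.size) mirrors the ordering added in this commit; presumably it lets the larger (more specific) stopword-reduced variants be tried before the smaller ones.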