This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-469
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-469 by this push:
new c426006 WIP.
c426006 is described below
commit c42600663d50bc37d5ce5ba07e31797ec893ca05
Author: Sergey Kamov <[email protected]>
AuthorDate: Tue Dec 21 16:08:06 2021 +0300
WIP.
---
.../opennlp/impl/NCEnStopWordsFinderImpl.scala | 24 ++++++++++++++++++++++
.../parser/opennlp/NCOpenNlpTokenParserSpec.scala | 10 +++++++--
2 files changed, 32 insertions(+), 2 deletions(-)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinderImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinderImpl.scala
index ff1cfde..e41798d 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinderImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinderImpl.scala
@@ -84,6 +84,8 @@ object NCEnStopWordsFinderImpl:
// NOTE(review): these look like POS tag strings (typed as Word); their use is outside this hunk - confirm.
private final val STOP_BEFORE_STOP: Seq[Word] = Seq("DT", "PRP", "PRP$",
"WDT", "WP", "WP$", "WRB")
+ private val Q_POS: Set[String] = Set("``", "''") // POS values that `isQuote` treats as quotation marks.
+
/**
* Stop words holder, used for hash search.
*
@@ -173,6 +175,8 @@ object NCEnStopWordsFinderImpl:
wildcardsLemmas.matches(toLemmaKey(toks), posOpt) ||
wildcardsOrigins.matches(toOriginalKey(toks), posOpt)
+ private def isQuote(t: NCToken): Boolean = Q_POS(t.getPos) // True when the token's POS value is a member of Q_POS.
+
// Builds a key string: the stems of the given tokens joined by single spaces.
private def toStemKey(toks: Seq[NCToken]): String =
toks.map(_.getStem).mkString(" ")
// Builds a key string: the lemmas of the given tokens joined by single spaces.
private def toLemmaKey(toks: Seq[NCToken]): String =
toks.map(_.getLemma).mkString(" ")
// Builds a key string: the lower-cased original texts of the given tokens joined by single spaces.
private def toValueKey(toks: Seq[NCToken]): String =
toks.map(_.getOriginalText.toLowerCase).mkString(" ")
@@ -573,6 +577,26 @@ class NCEnStopWordsFinderImpl(addStopWords: JSet[String],
exclStopWords: JSet[St
// +-------------------------------------------------+
processCommonStops(ns, stops)
+        val quotes = ns.filter(isQuote)
+
+        // Text without quotes is the common case: nothing to adjust and nothing to report.
+        if (quotes.nonEmpty)
+            if (quotes.size % 2 == 0)
+                // Token -> position in `ns`, used to compare stop words against quote positions.
+                val m = ns.zipWithIndex.toMap
+                // Consecutive quotes are paired as (opening, closing) positions.
+                val pairs = quotes.map(m).grouped(2).collect { case Seq(opening, closing) => opening -> closing }.toSeq
+
+                // Tokens strictly between a pair of quotes must not be treated as stop words.
+                stops --=
+                    stops.filter(t =>
+                        val idx = m(t)
+                        pairs.exists { case (opening, closing) => opening < idx && idx < closing }
+                    )
+            else
+                // TODO:
+                logger.debug(s"Unexpected quotes count, stop words processing
updating skipped for text: ${ns.map(_.getOriginalText).mkString(" ")}")
+
stops.toSeq.sortBy(_.getStartCharIndex).asJava
override def start(): Unit =
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserSpec.scala
index b420805..714c9b2 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserSpec.scala
@@ -76,7 +76,6 @@ class NCOpenNlpTokenParserSpec:
println(s"Request: $txt")
res.foreach(t =>
- pprint(t)
println(
s"Text: ${t.getOriginalText}" +
s", normalized: ${t.getNormalizedText}" +
@@ -105,4 +104,11 @@ class NCOpenNlpTokenParserSpec:
require(toks.length == 3)
require(!toks.head.isStopWord)
- require(toks.last.isStopWord)
\ No newline at end of file
+ require(toks.last.isStopWord)
+
+ toks = request("A ` A A ` A")
+
+ require(toks.length == 6)
+ require(toks.head.isStopWord)
+ require(toks.last.isStopWord)
+ require(toks.drop(1).reverse.drop(1).forall(!_.isStopWord))
\ No newline at end of file