This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-469
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-469 by this push:
new 03ca3bc WIP.
03ca3bc is described below
commit 03ca3bce0d8e499745554df074cf1d0489626891
Author: Sergey Kamov <[email protected]>
AuthorDate: Wed Dec 22 23:41:18 2021 +0300
WIP.
---
.../nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
index e9ccbac..a90cce7 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
@@ -223,7 +223,6 @@ private[impl] class NCEnStopWordsFinder(addStopWordsStems: Set[String], exclStop
)
(m(false), m(true))
-
private def read(path: String): Set[String] =
NCUtils.readTextGzipResource(path, "UTF-8", logger).toSet
/**
@@ -599,9 +598,13 @@ private[impl] class NCEnStopWordsFinder(addStopWordsStems: Set[String], exclStop
// | Pass #9. |
// | Deletes stop words if they are marked as quoted.|
// +-------------------------------------------------+
- val quotes = toks.filter(isQuote)
+ var quotes = toks.filter(isQuote)
+
+ if (quotes.size % 2 != 0)
+ // Just ignore last odd quote.
+ quotes = quotes.reverse.drop(1).reverse
- if (quotes.nonEmpty && quotes.size % 2 == 0)
+ if (quotes.nonEmpty)
val m = toks.zipWithIndex.toMap
val pairs =
@@ -615,8 +618,5 @@ private[impl] class NCEnStopWordsFinder(addStopWordsStems: Set[String], exclStop
from > idx && to < idx
})
- else
- // TODO: do we need such logs?
- logger.debug(s"Unexpected quotes count, stop words processing updating skipped for text: ${toks.map(_.getOriginalText).mkString(" ")}")
stops.toSeq.sortBy(_.getStartCharIndex)
\ No newline at end of file