This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-469
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-469 by this push:
new c129873 WIP.
c129873 is described below
commit c1298731e32df94fec1e882fe9dc65d282035450
Author: Sergey Kamov <[email protected]>
AuthorDate: Fri Dec 24 23:29:22 2021 +0300
WIP.
---
.../nlp/token/enricher/impl/NCEnBracketsImpl.scala | 24 +++++----
.../nlp/token/enricher/impl/NCEnQuotesImpl.scala | 27 +++++-----
.../token/enricher/impl/NCEnSwearWordsImpl.scala | 11 +++--
.../parser/opennlp/impl/NCEnOpenNlpImpl.scala | 4 +-
.../parser/opennlp/impl/NCEnStopWordsFinder.scala | 40 ++++++++++++---
.../apache/nlpcraft/internal/util/NCUtils.scala | 57 +++++++++-------------
.../opennlp/NCEnOpenNlpTokenParserSpec.scala | 22 ++++++++-
7 files changed, 113 insertions(+), 72 deletions(-)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnBracketsImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnBracketsImpl.scala
index 51609b7..a4b94a8 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnBracketsImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnBracketsImpl.scala
@@ -17,6 +17,7 @@
package org.apache.nlpcraft.internal.nlp.token.enricher.impl
+import com.typesafe.scalalogging.LazyLogging
import org.apache.nlpcraft.*
import java.io.*
@@ -25,23 +26,26 @@ import scala.jdk.CollectionConverters.CollectionHasAsScala
/**
*
*/
-class NCEnBracketsImpl extends NCTokenEnricher:
+class NCEnBracketsImpl extends NCTokenEnricher with LazyLogging:
override def enrich(req: NCRequest, cfg: NCModelConfig, toks:
java.util.List[NCToken]): Unit =
- val s = new java.util.Stack[String]()
+ val stack = new java.util.Stack[String]()
+ val map = mutable.HashMap.empty[NCToken, Boolean]
+ var ok = true
- // TODO: text
- def mkError: NCException = new NCException(s"Invalid brackets in text:
${req.getOriginalText}")
- def check(expected: String): Unit = if s.empty() || s.pop() !=
expected then throw mkError
- def mark(t: NCToken): Unit = t.put("brackets:en", !s.isEmpty)
+ def check(expected: String): Unit = if stack.empty() || stack.pop() !=
expected then ok = false
+ def mark(t: NCToken): Unit = map += t -> !stack.isEmpty
- toks.forEach(t =>
+ for (t <- toks.asScala if ok)
t.getOriginalText match {
- case "(" | "{" | "[" => mark(t); s.push(t.getOriginalText)
+ case "(" | "{" | "[" | "<" => mark(t);
stack.push(t.getOriginalText)
case ")" => check("("); mark(t)
case "}" => check("{"); mark(t)
case "]" => check("["); mark(t)
+ case ">" => check("<"); mark(t)
case _ => mark(t)
}
- )
- if !s.isEmpty then throw mkError
\ No newline at end of file
+ if ok && stack.isEmpty then
+ map.foreach { (tok, b) => tok.put("brackets:en", b) }
+ else
+ logger.trace(s"Invalid brackets: ${req.getOriginalText}")
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnQuotesImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnQuotesImpl.scala
index b3d8893..8935864 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnQuotesImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnQuotesImpl.scala
@@ -17,6 +17,7 @@
package org.apache.nlpcraft.internal.nlp.token.enricher.impl
+import com.typesafe.scalalogging.LazyLogging
import org.apache.nlpcraft.*
import scala.jdk.CollectionConverters.*
@@ -29,8 +30,7 @@ import NCEnQuotesImpl.*
/**
*
*/
-class NCEnQuotesImpl extends NCTokenEnricher:
-
+class NCEnQuotesImpl extends NCTokenEnricher with LazyLogging:
/**
*
* @param req
@@ -41,15 +41,14 @@ class NCEnQuotesImpl extends NCTokenEnricher:
val toksSeq = toks.asScala
val quotes = toksSeq.filter(isQuote)
- // TODO:
- if quotes.size % 2 != 0 then throw new NCException(s"Invalid quotes in
text: ${req.getOriginalText}")
-
- val m = toksSeq.zipWithIndex.toMap
- val pairs = quotes.zipWithIndex.drop(1).flatMap {
- case (t, idx) => if idx % 2 != 0 then Some(m(t) -> m(quotes(idx -
1))) else None
- }
-
- // Do not rewrite it with case, it works only such way, with tuples
(scala 3.1 error?)
- toksSeq.zipWithIndex.foreach(p =>
- p._1.put("quoted:en", pairs.exists { case (from, to) => from >
p._2 && to < p._2 })
- )
\ No newline at end of file
+ if quotes.size % 2 == 0 then
+ val m = toksSeq.zipWithIndex.toMap
+ val pairs = quotes.zipWithIndex.drop(1).flatMap {
+ (t, idx) => if idx % 2 != 0 then Some(m(t) -> m(quotes(idx -
1))) else None
+ }
+
+ toksSeq.zipWithIndex.foreach { (tok, idx) =>
+ tok.put("quoted:en", pairs.exists { case (from, to) => from >
idx && to < idx })
+ }
+ else
+ logger.trace(s"Invalid quotes: ${req.getOriginalText}")
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnSwearWordsImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnSwearWordsImpl.scala
index 1bf4ba3..cd645f5 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnSwearWordsImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnSwearWordsImpl.scala
@@ -17,10 +17,12 @@
package org.apache.nlpcraft.internal.nlp.token.enricher.impl
+import com.typesafe.scalalogging.LazyLogging
import opennlp.tools.stemmer.PorterStemmer
import org.apache.nlpcraft.*
import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl.NCEnOpenNlpImpl
import org.apache.nlpcraft.internal.util.NCUtils
+import org.apache.nlpcraft.internal.util.NCUtils.getStream
import java.io.*
@@ -33,24 +35,27 @@ object NCEnSwearWordsImpl:
* @param mdlFile
* @return
*/
- def apply(mdlFile: File): NCEnSwearWordsImpl = new NCEnSwearWordsImpl(new
BufferedInputStream(new FileInputStream(mdlFile)))
+ def apply(mdlFile: File): NCEnSwearWordsImpl = new NCEnSwearWordsImpl(
+ new BufferedInputStream(new FileInputStream(mdlFile)), mdlFile.getPath
+ )
/**
*
* @param mdlSrc
* @return
*/
- def apply(mdlSrc: String): NCEnSwearWordsImpl = new
NCEnSwearWordsImpl(NCUtils.getStream(mdlSrc))
+ def apply(mdlSrc: String): NCEnSwearWordsImpl = new
NCEnSwearWordsImpl(NCUtils.getStream(mdlSrc), mdlSrc)
/**
*
*/
-class NCEnSwearWordsImpl(is: InputStream) extends NCTokenEnricher:
+class NCEnSwearWordsImpl(is: InputStream, res: String) extends NCTokenEnricher
with LazyLogging:
@volatile private var swearWords: Set[String] = _
override def start(): Unit =
val stemmer = new PorterStemmer
swearWords = NCUtils.readTextStream(is,
"UTF-8").map(stemmer.stem).toSet
+ logger.trace(s"Loaded resource: $res")
override def stop(): Unit = swearWords = null
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnOpenNlpImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnOpenNlpImpl.scala
index 91fd44f..38db56a 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnOpenNlpImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnOpenNlpImpl.scala
@@ -155,10 +155,10 @@ class NCEnOpenNlpImpl(
lemmatize(suspIdxs.map(i => words(i)).toArray,
suspIdxs.map(_ => "NNN").toArray).
zipWithIndex.
flatMap {
- case (lemma, i) => if lemma != "0" then
Some(suspIdxs(i) -> lemma) else None
+ (lemma, i) => if lemma != "0" then Some(suspIdxs(i) ->
lemma) else None
}.toMap
lemmas = lemmas.zipWithIndex.map {
- case (lemma, idx) => fixes.getOrElse(idx, lemma)
+ (lemma, idx) => fixes.getOrElse(idx, lemma)
}
val res: Seq[NCToken] =
holders.zip(posTags).zip(lemmas).toIndexedSeq.map { case ((h, pos), lemma) =>
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
index ecbe1c1..fc8cf6a 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
@@ -119,17 +119,17 @@ private[impl] object NCEnStopWordsFinder:
includes: Map[String, Set[Wildcard]],
excludes: Map[String, Set[Wildcard]]
):
- require(!any.exists { case (begin, end) => begin.isEmpty &&
end.isEmpty })
+ require(!any.exists { (begin, end) => begin.isEmpty && end.isEmpty })
// Optimization for full wildcard cases (configurations like * | DT)
private val inclPoses = filterPoses(includes)
private val exclPoses = filterPoses(excludes)
private def filterPoses(m: Map[String, Set[Wildcard]]): Set[String] =
- m.filter { case(_, pair) => pair.exists { case (begin, end) =>
begin.isEmpty && end.isEmpty } }.keySet
+ m.filter { case(_, pair) => pair.exists { (begin, end) =>
begin.isEmpty && end.isEmpty } }.keySet
private def matches(s: String, set: Set[Wildcard]): Boolean =
- set.exists { case (b, e) => (b.isEmpty || s.startsWith(b)) &&
(e.isEmpty || s.endsWith(e)) }
+ set.exists { (b, e) => (b.isEmpty || s.startsWith(b)) &&
(e.isEmpty || s.endsWith(e)) }
def matches(s: String, posOpt: Option[String]): Boolean =
if s.contains(' ') then
@@ -544,12 +544,36 @@ private[impl] class NCEnStopWordsFinder(addStems:
Set[String], exclStems: Set[St
if quotes.nonEmpty then
val m = toks.zipWithIndex.toMap
val pairs = quotes.zipWithIndex.drop(1).flatMap {
- case (t, idx) => if idx % 2 != 0 then Some(m(t) ->
m(quotes(idx - 1))) else None
+ (t, idx) => if idx % 2 != 0 then Some(m(t) -> m(quotes(idx -
1))) else None
}
- stops --= stops.filter(t => pairs.exists {
- case (from, to) =>
- val idx = m(t)
- from > idx && to < idx
+ stops --= stops.filter(t => pairs.exists { (from, to) =>
+ val idx = m(t)
+ from > idx && to < idx
})
+ // +-------------------------------------------------+
+ // | Pass #9. |
+ // | Deletes stop words if they are inside brackets. |
+ // +-------------------------------------------------+
+ val stack = new java.util.Stack[String]()
+ val set = mutable.HashSet.empty[NCToken]
+ var ok = true
+
+ def check(expected: String): Unit = if stack.empty() || stack.pop() !=
expected then ok = false
+ def mark(t: NCToken): Unit = if (!stack.isEmpty) set += t
+
+ for (t <- toks if ok)
+ t.getOriginalText match {
+ case "(" | "{" | "[" | "<" => mark(t);
stack.push(t.getOriginalText)
+ case ")" => check("("); mark(t)
+ case "}" => check("{"); mark(t)
+ case "]" => check("["); mark(t)
+ case ">" => check("<"); mark(t)
+ case _ => mark(t)
+ }
+
+ // Just ignore invalid brackets.
+ if ok && stack.isEmpty then
+ stops --= stops.intersect(set)
+
stops.toSeq.sortBy(_.getStartCharIndex)
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
index 6041741..e79caa1 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
@@ -38,7 +38,7 @@ import scala.util.control.Exception.ignoring
import scala.io.BufferedSource
/**
- * TODO: logs for all files reading methods (fix or remove)
+ *
*/
object NCUtils extends LazyLogging:
final val NL = System getProperty "line.separator"
@@ -870,33 +870,18 @@ object NCUtils extends LazyLogging:
* @param log Logger to use.
*/
def readResource(res: String, enc: String = "UTF-8", log: Logger =
logger): List[String] =
- readStream(getStream(res), enc, log)
+ val list =
+ try
+ Using.resource(Source.fromInputStream(getStream(res), enc)) {
src =>
+ src.getLines().toList
+ }
+ catch
+ case e: IOException => throw new NCException(s"Failed to read
stream.", e)
+
+ log.trace(s"Loaded resource: $res")
- /**
- * Reads lines from given stream.
- *
- * @param in Stream to read from.
- * @param enc Encoding.
- * @param log Logger to use.
- */
- def readStream(in: InputStream, enc: String = "UTF-8", log: Logger =
logger): List[String] =
- mapStream(in, enc, log, _.map(p => p).toList)
+ list
- /**
- * Maps lines from the given stream to an object.
- *
- * @param in Stream to read from.
- * @param enc Encoding.
- * @param log Logger to use.
- * @param mapper Function to read lines.
- */
- private def mapStream[T](in: InputStream, enc: String, log: Logger =
logger, mapper: Iterator[String] => T): T =
- try
- Using.resource(Source.fromInputStream(in, enc)) { src =>
- mapper(src.getLines())
- }
- catch
- case e: IOException => throw new NCException(s"Failed to read
stream.", e)
/**
*
@@ -915,12 +900,17 @@ object NCUtils extends LazyLogging:
* @param log Logger to use.
*/
def readTextGzipResource(res: String, enc: String, log: Logger = logger):
List[String] =
- try
- Using.resource(Source.fromInputStream(new
GZIPInputStream(getStream(res)), enc)) { src =>
- readLcTrimFilter(src)
- }
- catch
- case e: IOException => throw new NCException(s"Failed to read
stream.", e)
+ val list =
+ try
+ Using.resource(Source.fromInputStream(new
GZIPInputStream(getStream(res)), enc)) { src =>
+ readLcTrimFilter(src)
+ }
+ catch
+ case e: IOException => throw new NCException(s"Failed to read
stream.", e)
+
+ log.trace(s"Loaded resource: $res")
+
+ list
/**
* Reads lines from given stream converting to lower case, trimming, and
filtering
@@ -928,9 +918,8 @@ object NCUtils extends LazyLogging:
*
* @param in Stream to read from.
* @param enc Encoding.
- * @param log Logger to use.
*/
- def readTextStream(in: InputStream, enc: String, log: Logger = logger):
List[String] =
+ def readTextStream(in: InputStream, enc: String): List[String] =
try
Using.resource(Source.fromInputStream(in, enc)) { src =>
readLcTrimFilter(src)
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnOpenNlpTokenParserSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnOpenNlpTokenParserSpec.scala
index 88c807c..90ab8e6 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnOpenNlpTokenParserSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnOpenNlpTokenParserSpec.scala
@@ -75,4 +75,24 @@ class NCEnOpenNlpTokenParserSpec:
require(toks.head.isStopWord);
require(toks.last.isStopWord);
require(toks.drop(1).reverse.drop(1).forall(!_.isStopWord))
- )
\ No newline at end of file
+ )
+ test(
+ // First and last tokens are stop words,
+ // third and fourth are not because they are inside brackets.
+ "A ( A A ) A",
+ toks =>
+ require(toks.length == 6);
+ require(toks.head.isStopWord);
+ require(toks.last.isStopWord);
+ require(toks.drop(1).reverse.drop(1).forall(!_.isStopWord))
+ )
+ test(
+ // Invalid brackets.
+ "A ( A A A",
+ toks => toks.filter(_.getNormalizedText !=
"(").forall(_.isStopWord)
+ )
+ test(
+ // Nested brackets.
+ "< < [ A ] > >",
+ toks => require(!toks.find(_.getNormalizedText ==
"a").get.isStopWord)
+ )