This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-469
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-469 by this push:
new 9005fa7 WIP.
9005fa7 is described below
commit 9005fa77495828b65652c5b2744cc7c7558be42e
Author: Sergey Kamov <[email protected]>
AuthorDate: Tue Dec 21 11:33:20 2021 +0300
WIP.
---
.../token/parser/opennlp/NCEnStopWordsFinder.java | 63 +++++++
.../token/parser/opennlp/NCOpenNlpTokenParser.java | 53 ++----
.../token/parser/opennlp/NCStopWordsFinder.java | 35 ++++
...Generator.scala => NCEnStopWordGenerator.scala} | 2 +-
...ocessor.scala => NCEnStopWordsFinderImpl.scala} | 194 ++++++++-------------
.../token/parser/opennlp/impl/NCOpenNlpImpl.scala | 84 +++++----
.../apache/nlpcraft/internal/util/NCUtils.scala | 10 --
.../parser/opennlp/NCOpenNlpTokenParserSpec.scala | 3 +-
8 files changed, 238 insertions(+), 206 deletions(-)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnStopWordsFinder.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnStopWordsFinder.java
new file mode 100644
index 0000000..f631fc2
--- /dev/null
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnStopWordsFinder.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.internal.nlp.token.parser.opennlp;
+
+import org.apache.nlpcraft.NCToken;
+import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl.NCEnStopWordsFinderImpl;
+
+import java.util.Collections;
+import java.util.Set;
+import java.util.List;
+
+/**
+ * {@link NCStopWordsFinder} implementation that forwards every call to an {@link NCEnStopWordsFinderImpl} instance.
+ */
+public class NCEnStopWordsFinder implements NCStopWordsFinder {
+ private final NCEnStopWordsFinderImpl impl;
+
+ /**
+ * Creates a finder whose implementation receives empty additional and excluded stop word sets.
+ */
+ public NCEnStopWordsFinder() {
+ impl = new NCEnStopWordsFinderImpl(Collections.emptySet(),
Collections.emptySet());
+ }
+
+ /**
+ * Creates a finder whose implementation receives both sets as-is; NOTE(review): it keeps them as "stems" without stemming on construction - confirm.
+ * @param addStopWords Additional stop words (presumably extending the built-in list - confirm in NCEnStopWordsFinderImpl).
+ * @param exclStopWords Excluded stop words (presumably never reported as stop words - confirm in NCEnStopWordsFinderImpl).
+ */
+ public NCEnStopWordsFinder(Set<String> addStopWords, Set<String>
exclStopWords) {
+ impl = new NCEnStopWordsFinderImpl(addStopWords, exclStopWords);
+ }
+
+ @Override
+ public void start() {
+ impl.start();
+ }
+
+ @Override
+ public void stop() {
+ impl.stop();
+ }
+
+ @Override
+ public List<NCToken> find(List<NCToken> sen) {
+ return impl.find(sen);
+ }
+}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java
index 118ef86..075b030 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java
@@ -39,16 +39,27 @@ import java.util.Set;
public class NCOpenNlpTokenParser implements NCTokenParser {
private final NCOpenNlpImpl impl;
+ @Override
+ public void start() {
+ impl.start();
+ }
+
+ @Override
+ public void stop() {
+ impl.stop();
+ }
+
/**
*
* @param tokMdl
* @param posMdl
* @param lemmaDic
+ * @param swFinder
* @throws NCException
*/
- public NCOpenNlpTokenParser(File tokMdl, File posMdl, File lemmaDic) {
+ public NCOpenNlpTokenParser(File tokMdl, File posMdl, File lemmaDic,
NCStopWordsFinder swFinder) {
try {
- impl = NCOpenNlpImpl.apply(tokMdl, posMdl, lemmaDic);
+ impl = NCOpenNlpImpl.apply(tokMdl, posMdl, lemmaDic, swFinder);
}
catch (Exception e) {
throw new NCException("Failed to create OpenNLP token parser.", e);
@@ -60,11 +71,12 @@ public class NCOpenNlpTokenParser implements NCTokenParser {
* @param tokMdlSrc Local filesystem path, resources file path or URL for
OpenNLP tokenizer model.
* @param posMdlSrc Local filesystem path, resources file path or URL for
OpenNLP tagger model.
* @param lemmaDicSrc Local filesystem path, resources file path or URL
for OpenNLP lemmatizer dictionary.
+ * @param swFinder
* @throws NCException
*/
- public NCOpenNlpTokenParser(String tokMdlSrc, String posMdlSrc, String
lemmaDicSrc) {
+ public NCOpenNlpTokenParser(String tokMdlSrc, String posMdlSrc, String
lemmaDicSrc, NCStopWordsFinder swFinder) {
try {
- impl = NCOpenNlpImpl.apply(tokMdlSrc, posMdlSrc, lemmaDicSrc);
+ impl = NCOpenNlpImpl.apply(tokMdlSrc, posMdlSrc, lemmaDicSrc,
swFinder);
}
catch (Exception e) {
throw new NCException("Failed to create OpenNLP token parser.", e);
@@ -73,39 +85,6 @@ public class NCOpenNlpTokenParser implements NCTokenParser {
@Override
public List<NCToken> parse(NCRequest req) {
- assert impl != null;
return impl.parse(req);
}
-
- /**
- *
- * @return
- */
- public Set<String> getAdditionalStopWords() {
- return impl.getAdditionalStopWords();
- }
-
- /**
- *
- * @param addStopWords
- */
- public void setAdditionalStopWords(Set<String> addStopWords) {
- impl.setAdditionalStopWords(addStopWords);
- }
-
- /**
- *
- * @return
- */
- public Set<String> getExcludedStopWords() {
- return impl.getExcludedStopWords();
- }
-
- /**
- *
- * @param exclStopWords
- */
- public void setExcludedStopWords(Set<String> exclStopWords) {
- impl.setExcludedStopWords(exclStopWords);
- }
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCStopWordsFinder.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCStopWordsFinder.java
new file mode 100644
index 0000000..8a8a6d2
--- /dev/null
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCStopWordsFinder.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.internal.nlp.token.parser.opennlp;
+
+import org.apache.nlpcraft.NCLifecycle;
+import org.apache.nlpcraft.NCToken;
+
+import java.util.List;
+
+/**
+ * Component with {@link NCLifecycle} start/stop semantics that finds stop words among the tokens of a sentence.
+ */
+public interface NCStopWordsFinder extends NCLifecycle {
+ /**
+ * Finds the stop words in the given sentence.
+ * @param sen Tokens of the sentence to inspect.
+ * @return Tokens found to be stop words; NOTE(review): NCOpenNlpImpl matches them via Set.contains, so input instances are presumably expected - confirm.
+ */
+ List<NCToken> find(List<NCToken> sen);
+}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCStopWordGenerator.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordGenerator.scala
similarity index 99%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCStopWordGenerator.scala
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordGenerator.scala
index 17b6bba..c470d51 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCStopWordGenerator.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordGenerator.scala
@@ -25,7 +25,7 @@ import scala.collection.mutable
/**
* Generates first word sequences.
*/
-object NCStopWordGenerator extends App:
+object NCEnStopWordGenerator extends App:
private final lazy val stemmer = new PorterStemmer
// Output files.
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCStopWordsProcessor.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinderImpl.scala
similarity index 84%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCStopWordsProcessor.scala
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinderImpl.scala
index 408cdbe..4bb9e85 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCStopWordsProcessor.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinderImpl.scala
@@ -19,20 +19,22 @@ package
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl
import com.typesafe.scalalogging.LazyLogging
import opennlp.tools.stemmer.PorterStemmer
+import org.apache.nlpcraft.internal.nlp.token.parser.opennlp.NCStopWordsFinder
+import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl.NCEnStopWordsFinderImpl.*
import org.apache.nlpcraft.internal.util.NCUtils
-import org.apache.nlpcraft.{NCException, NCLifecycle, NCParameterizedAdapter,
NCToken}
+import org.apache.nlpcraft.{NCException, NCParameterizedAdapter, NCToken}
+import scala.jdk.CollectionConverters.SetHasAsScala
+import java.util
import scala.annotation.tailrec
import scala.collection.{Seq, mutable}
+import java.util.Set as JSet
+import java.util.List as JList
-/**
- * Stop-word and stop-sentence enricher.
- */
-private[impl] object NCStopWordsProcessor extends LazyLogging:
+object NCEnStopWordsFinderImpl:
// Condition types.
type Wildcard = (String, String)
type Word = String
- type Sentence = Seq[NCToken]
/** All POSes set. http://www.clips.ua.ac.be/pages/mbsp-tags */
private final val POSES = Set(
@@ -82,16 +84,6 @@ private[impl] object NCStopWordsProcessor extends
LazyLogging:
private final val STOP_BEFORE_STOP: Seq[Word] = Seq("DT", "PRP", "PRP$",
"WDT", "WP", "WP$", "WRB")
- private var percents: Set[String] = _
- private var possessiveWords: Set[String] = _
- private var firstWords: Set[String] = _
- private var nounWords: Set[String] = _
- private var stopWords: StopWordHolder = _
- private var exceptions: StopWordHolder = _
-
- // TODO: lifecycle.
- start()
-
/**
* Stop words holder, used for hash search.
*
@@ -108,7 +100,7 @@ private[impl] object NCStopWordsProcessor extends
LazyLogging:
posOpt match
case Some(pos) =>
!excludes.getOrElse(pos, Set.empty).contains(s) &&
- (any.contains(s) || includes.getOrElse(pos,
Set.empty).contains(s))
+ (any.contains(s) || includes.getOrElse(pos,
Set.empty).contains(s))
case _ => any.contains(s)
/**
@@ -145,9 +137,9 @@ private[impl] object NCStopWordsProcessor extends
LazyLogging:
!matches(s, excludes.getOrElse(pos, Set.empty)) &&
(
inclPoses.contains(pos) ||
- matches(s, any) ||
- matches(s, includes.getOrElse(pos, Set.empty))
- )
+ matches(s, any) ||
+ matches(s, includes.getOrElse(pos,
Set.empty))
+ )
case _ => throw new AssertionError(s"Unexpected missed
POS.")
/**
@@ -186,22 +178,35 @@ private[impl] object NCStopWordsProcessor extends
LazyLogging:
private def toValueKey(toks: Seq[NCToken]): String =
toks.map(_.getOriginalText.toLowerCase).mkString(" ")
private def toOriginalKey(toks: Seq[NCToken]): String =
toks.map(_.getOriginalText).mkString(" ")
+import NCEnStopWordsFinderImpl.*
+
+class NCEnStopWordsFinderImpl(addStopWords: JSet[String], exclStopWords:
JSet[String]) extends NCStopWordsFinder with LazyLogging:
+ private val addStopWordsStems = addStopWords.asScala
+ private val exclStopWordsStems = exclStopWords.asScala
+
+ private val stemmer = new PorterStemmer
+
+ @volatile private var percents: Set[String] = _
+ @volatile private var possessiveWords: Set[String] = _
+ @volatile private var firstWords: Set[String] = _
+ @volatile private var nounWords: Set[String] = _
+ @volatile private var stopWords: StopWordHolder = _
+ @volatile private var exceptions: StopWordHolder = _
+
/**
* Parses configuration template.
*
- * @param stemmer Stemmer.
* @param lines Configuration file content.
* @return Holder and `is-exception` flag.
*/
- @throws[NCException]
- private def readStopWords(stemmer: PorterStemmer, lines: Seq[String]):
Map[Boolean, StopWordHolder] =
+ private def readStopWords(lines: Seq[String]): Map[Boolean,
StopWordHolder] =
// 1. Prepares accumulation data structure.
object WordForm extends Enumeration:
type WordForm = Value
val STEM, LEM, ORIG = Value
- import WordForm._
+ import WordForm.*
class Condition[T]:
val any = mutable.HashSet.empty[T]
@@ -219,9 +224,9 @@ private[impl] object NCStopWordsProcessor extends
LazyLogging:
case _ =>
val set = mutable.HashSet.empty[T]
- set += cond
+ set += cond
- m += pos -> set
+ m += pos -> set
)
add(includes, incl = true)
@@ -238,7 +243,7 @@ private[impl] object NCStopWordsProcessor extends
LazyLogging:
WordForm.values.foreach(f =>
add(f, mkT, isExc = true)
- add(f, mkT, isExc = false)
+ add(f, mkT, isExc = false)
)
m.toMap
@@ -249,7 +254,6 @@ private[impl] object NCStopWordsProcessor extends
LazyLogging:
// 2. Accumulates data of each parsed line.
for (line <- lines)
- @throws[NCException]
def throwError(msg: String): Unit =
throw new NCException(s"Invalid stop word configuration
[line=$line, reason=$msg]")
@@ -259,7 +263,6 @@ private[impl] object NCStopWordsProcessor extends
LazyLogging:
if (s.length == 1 && !s.head.isLetter)
throwError("Invalid stop word")
- @throws[NCException]
def checkSingle(ch: Char): Unit = if (s.count(_ == ch) > 1)
throwError(s"Unexpected symbols count: $ch")
// Confusing special symbols.
@@ -348,12 +351,12 @@ private[impl] object NCStopWordsProcessor extends
LazyLogging:
val incl = toImmutable(m((isExc, form)).includes)
val excl = toImmutable(m((isExc, form)).excludes)
- mkInstance(any ++ excl.values.flatten, incl, excl)
+ mkInstance(any ++ excl.values.flatten, incl, excl)
def mkHash(form: WordForm): HashHolder = mkHolder(mHash, form,
HashHolder.apply)
def mkScan(form: WordForm): ScanHolder = mkHolder(mScan, form,
ScanHolder.apply)
- isExc -> StopWordHolder(mkHash(STEM), mkHash(LEM), mkHash(ORIG),
mkScan(LEM), mkScan(ORIG))
+ isExc -> StopWordHolder(mkHash(STEM), mkHash(LEM),
mkHash(ORIG), mkScan(LEM), mkScan(ORIG))
).toMap
private def isVerb(pos: String): Boolean = pos.head == 'V'
@@ -369,7 +372,7 @@ private[impl] object NCStopWordsProcessor extends
LazyLogging:
*/
@tailrec
private def markBefore(
- ns: Sentence,
+ ns: Seq[NCToken],
stopPoses: Seq[String],
lastIdx: Int,
isException: Seq[NCToken] => Boolean,
@@ -406,15 +409,16 @@ private[impl] object NCStopWordsProcessor extends
LazyLogging:
cache += toks -> b
b
+
/**
* Marks as stopwords, words with POS from configured list, which also
placed before another stop words.
*/
- private def processCommonStops(ns: Sentence, stops:
mutable.HashSet[NCToken], exclStopWordsStems: Set[String]): Unit =
+ private def processCommonStops(ns: Seq[NCToken], stops:
mutable.HashSet[NCToken]): Unit =
/**
* Marks as stopwords, words with POS from configured list, which
also placed before another stop words.
*/
@tailrec
- def processCommonStops0(ns: Sentence): Unit =
+ def processCommonStops0(ns: Seq[NCToken]): Unit =
val max = ns.size - 1
var stop = true
@@ -435,82 +439,15 @@ private[impl] object NCStopWordsProcessor extends
LazyLogging:
processCommonStops0(ns)
- private def start(): Unit =
- val stemmer = new PorterStemmer
-
- percents = Set(
- "%",
- "pct",
- "pc",
- "percentage",
- "proportion",
- "interest",
- "rate",
- "percent"
- ).map(stemmer.stem)
-
- // Stemmatization is done already by generator.
- possessiveWords =
NCUtils.readTextGzipResource("stopwords/possessive_words.txt.gz", "UTF-8",
logger).toSet
- firstWords =
NCUtils.readTextGzipResource("stopwords/first_words.txt.gz", "UTF-8",
logger).toSet
- nounWords =
NCUtils.readTextGzipResource("stopwords/noun_words.txt.gz", "UTF-8",
logger).toSet
-
- // Case sensitive.
- val m =
- readStopWords(
- stemmer,
- NCUtils.readResource("stopwords/stop_words.txt", "UTF-8",
logger).
- map(_.strip).filter(s => s.nonEmpty && !s.startsWith("#"))
- )
-
- stopWords = m(false)
- exceptions = m(true)
-
-import NCStopWordsProcessor._
-
-private[impl] class NCStopWordsProcessor(stemmer: PorterStemmer) extends
LazyLogging:
- private var addStopWords = Set.empty[String]
- private var addStopWordsStems = Set.empty[String]
- private var exclStopWords = Set.empty[String]
- private var exclStopWordsStems = Set.empty[String]
-
/**
*
- * @param addStopWords
+ * @param toks
*/
- def setAdditionalStopWords(addStopWords: Set[String]): Unit =
- require(addStopWords != null)
-
- this.addStopWords = addStopWords
- this.addStopWordsStems = stemmer.synchronized {
addStopWords.map(stemmer.stem) }
+ override def find(toks: JList[NCToken]): JList[NCToken] =
+ import scala.jdk.CollectionConverters.*
- /**
- *
- * @return
- */
- def getAdditionalStopWords: Set[String] = addStopWordsStems
+ val ns = toks.asScala
- /**
- *
- * @param exclStopWords
- */
- def setExcludedStopWords(exclStopWords: Set[String]): Unit =
- require(exclStopWords != null)
-
- this.exclStopWords = exclStopWords
- this.exclStopWordsStems = stemmer.synchronized {
exclStopWords.map(stemmer.stem) }
-
- /**
- *
- * @return
- */
- def getExcludedStopWords: Set[String] = exclStopWords
-
- /**
- *
- * @param ns
- */
- @throws[NCException]
- def process(ns: Sentence): Sentence =
// Stop words and exceptions caches for this sentence.
val cacheSw = mutable.HashMap.empty[Seq[NCToken], Boolean]
val cacheEx = mutable.HashMap.empty[Seq[NCToken], Boolean]
@@ -627,20 +564,33 @@ private[impl] class NCStopWordsProcessor(stemmer:
PorterStemmer) extends LazyLog
// | configured list, which also placed before |
// | another stop words. |
// +-------------------------------------------------+
- processCommonStops(ns, stops, exclStopWordsStems)
-
- ns.map(tok =>
- if (stops.contains(tok))
- new NCParameterizedAdapter with NCToken:
- override def getOriginalText: String = tok.getOriginalText
- override def getNormalizedText: String =
tok.getNormalizedText
- override def getLemma: String = tok.getLemma
- override def getStem: String = tok.getStem
- override def getPos: String = tok.getPos
- override def isStopWord: Boolean = true
- override def getStartCharIndex: Int = tok.getStartCharIndex
- override def getEndCharIndex: Int = tok.getEndCharIndex
- override def getLength: Int = tok.getLength
- else
- tok
- )
\ No newline at end of file
+ processCommonStops(ns, stops)
+
+ stops.toSeq.sortBy(_.getStartCharIndex).asJava
+
+ override def start(): Unit =
+ percents = Set(
+ "%",
+ "pct",
+ "pc",
+ "percentage",
+ "proportion",
+ "interest",
+ "rate",
+ "percent"
+ ).map(stemmer.stem)
+
+ // Stemmatization is done already by generator.
+ possessiveWords =
NCUtils.readTextGzipResource("stopwords/possessive_words.txt.gz", "UTF-8",
logger).toSet
+ firstWords =
NCUtils.readTextGzipResource("stopwords/first_words.txt.gz", "UTF-8",
logger).toSet
+ nounWords =
NCUtils.readTextGzipResource("stopwords/noun_words.txt.gz", "UTF-8",
logger).toSet
+
+ // Case sensitive.
+ val m =
+ readStopWords(
+ NCUtils.readResource("stopwords/stop_words.txt", "UTF-8",
logger).
+ map(_.strip).filter(s => s.nonEmpty && !s.startsWith("#"))
+ )
+
+ stopWords = m(false)
+ exceptions = m(true)
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala
index 7efdb84..058b95d 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala
@@ -20,13 +20,16 @@ package
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl
import org.apache.nlpcraft.*
import java.io.*
-import java.util.Set as JSet
-import java.util.List as JList
+import java.util.{Collections, List as JList, Set as JSet}
import opennlp.tools.lemmatizer.*
import opennlp.tools.postag.*
import opennlp.tools.stemmer.*
import opennlp.tools.tokenize.*
+import org.apache.nlpcraft.internal.nlp.token.parser.opennlp.NCStopWordsFinder
import org.apache.nlpcraft.internal.util.NCUtils
+
+import java.util
+import java.util.stream.Collectors
import scala.jdk.CollectionConverters.*
object NCOpenNlpImpl:
@@ -35,22 +38,26 @@ object NCOpenNlpImpl:
* @param tokMdlSrc Local filesystem path, resources file path or URL for
OpenNLP tokenizer model.
* @param posMdlSrc Local filesystem path, resources file path or URL for
OpenNLP tagger model.
* @param lemmaDicSrc Local filesystem path, resources file path or URL
for OpenNLP lemmatizer dictionary.
+ * @param swFinder Stop words finder.
* @return
*/
- def apply(tokMdlSrc: String, posMdlSrc: String, lemmaDicSrc: String):
NCOpenNlpImpl =
- new NCOpenNlpImpl(NCUtils.getStream(tokMdlSrc),
NCUtils.getStream(posMdlSrc), NCUtils.getStream(lemmaDicSrc))
+ def apply(tokMdlSrc: String, posMdlSrc: String, lemmaDicSrc: String,
swFinder: NCStopWordsFinder): NCOpenNlpImpl =
+ new NCOpenNlpImpl(
+ NCUtils.getStream(tokMdlSrc), NCUtils.getStream(posMdlSrc),
NCUtils.getStream(lemmaDicSrc), swFinder
+ )
/**
*
* @param tokMdlFile Local file for OpenNLP tokenizer model.
* @param posMdlFile Local file for OpenNLP tagger model.
* @param lemmaDicFile Local file for OpenNLP lemmatizer dictionary.
+ * @param swFinder Stop words finder.
* @return
*/
- def apply(tokMdlFile: File, posMdlFile: File, lemmaDicFile: File):
NCOpenNlpImpl =
+ def apply(tokMdlFile: File, posMdlFile: File, lemmaDicFile: File,
swFinder: NCStopWordsFinder): NCOpenNlpImpl =
def toStream(f: File) = new BufferedInputStream(new FileInputStream(f))
- new NCOpenNlpImpl(toStream(tokMdlFile), toStream(posMdlFile),
toStream(lemmaDicFile))
+ new NCOpenNlpImpl(toStream(tokMdlFile), toStream(posMdlFile),
toStream(lemmaDicFile), swFinder)
/**
*
@@ -58,43 +65,31 @@ object NCOpenNlpImpl:
* @param posMdlIn
* @param lemmaDicIn
*/
-class NCOpenNlpImpl(tokMdlIn: InputStream, posMdlIn: InputStream, lemmaDicIn:
InputStream):
- private val tokenizer = new TokenizerME(new TokenizerModel(tokMdlIn))
- private val tagger = new POSTaggerME(new POSModel(posMdlIn))
- private val lemmatizer = new DictionaryLemmatizer(lemmaDicIn)
+class NCOpenNlpImpl(tokMdlIn: InputStream, posMdlIn: InputStream, lemmaDicIn:
InputStream, swFinder: NCStopWordsFinder) extends NCTokenParser :
private val stemmer = new PorterStemmer
- private val stopProc = new NCStopWordsProcessor(stemmer)
- /**
- *``
- * @return
- */
- def getAdditionalStopWords: JSet[String] =
stopProc.getAdditionalStopWords.asJava
+ @volatile var tokenizer: TokenizerME = _
+ @volatile var tagger: POSTaggerME = _
+ @volatile var lemmatizer: DictionaryLemmatizer = _
- /**
- *
- * @return
- */
- def getExcludedStopWords: JSet[String] =
stopProc.getExcludedStopWords.asJava
+ override def start(): Unit =
+ tokenizer = new TokenizerME(new TokenizerModel(tokMdlIn))
+ tagger = new POSTaggerME(new POSModel(posMdlIn))
+ lemmatizer = new DictionaryLemmatizer(lemmaDicIn)
- /**
- *
- * @param addStopWords
- */
- def setAdditionalStopWords(addStopWords: JSet[String]): Unit =
stopProc.setAdditionalStopWords(addStopWords.asScala.toSet)
+ if (swFinder != null)
+ swFinder.start()
- /**
- *
- * @param exclStopWords
- */
- def setExcludedStopWords(exclStopWords: JSet[String]): Unit =
stopProc.setExcludedStopWords(exclStopWords.asScala.toSet)
+ override def stop(): Unit =
+ if (swFinder != null)
+ swFinder.stop()
/**
*
* @param req
* @return
*/
- def parse(req: NCRequest): JList[NCToken] =
+ override def parse(req: NCRequest): JList[NCToken] =
// OpenNLP classes are not thread-safe.
this.synchronized {
val sen = req.getNormalizedText
@@ -130,8 +125,8 @@ class NCOpenNlpImpl(tokMdlIn: InputStream, posMdlIn:
InputStream, lemmaDicIn: In
case (lemma, idx) => fixes.getOrElse(idx, lemma)
}
- stopProc.process(
- holders.zip(posTags).zip(lemmas).map { case ((h, pos), lemma)
=>
+ val res: Seq[NCToken] =
+ holders.zip(posTags).zip(lemmas).toIndexedSeq.map { case ((h,
pos), lemma) =>
new NCParameterizedAdapter with NCToken:
override def getOriginalText: String = h.origin
override def getNormalizedText: String = h.normalized
@@ -143,5 +138,24 @@ class NCOpenNlpImpl(tokMdlIn: InputStream, posMdlIn:
InputStream, lemmaDicIn: In
override def getEndCharIndex: Int = h.end
override def getLength: Int = h.length
}
- ).asJava
+
+ val resJava = res.asJava
+ val stops = if (swFinder != null) swFinder.find(resJava) else null
+ val stopsSet = if (stops != null) new util.HashSet(stops) else
Collections.emptySet
+
+ resJava.stream().map(tok =>
+ if (stopsSet.contains(tok))
+ new NCParameterizedAdapter with NCToken:
+ override def getOriginalText: String =
tok.getOriginalText
+ override def getNormalizedText: String =
tok.getNormalizedText
+ override def getLemma: String = tok.getLemma
+ override def getStem: String = tok.getStem
+ override def getPos: String = tok.getPos
+ override def isStopWord: Boolean = true
+ override def getStartCharIndex: Int =
tok.getStartCharIndex
+ override def getEndCharIndex: Int = tok.getEndCharIndex
+ override def getLength: Int = tok.getLength
+ else
+ tok
+ ).collect(Collectors.toList)
}
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
index 66d2e13..1784da9 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
@@ -733,7 +733,6 @@ object NCUtils extends LazyLogging:
* @param lines Text data.
* @param sort Whether to sort output or not.
*/
- @throws[IOException]
def mkTextFile(path: String, lines: scala.Iterable[Any], sort: Boolean =
true): Unit =
val file = new File(path)
@@ -789,7 +788,6 @@ object NCUtils extends LazyLogging:
* @param enc Encoding.
* @param log Logger to use.
*/
- @throws[NCException]
def readGzipPath(path: String, enc: String = "UTF-8", log: Logger =
logger): List[String] =
readGzipFile(new File(path), enc, log)
@@ -800,7 +798,6 @@ object NCUtils extends LazyLogging:
* @param enc Encoding.
* @param log Logger to use.
*/
- @throws[NCException]
def readGzipFile(f: File, enc: String, log: Logger = logger): List[String]
=
try
Using.resource(Source.fromInputStream(new GZIPInputStream(new
FileInputStream(f)), enc)) { src =>
@@ -815,7 +812,6 @@ object NCUtils extends LazyLogging:
* @param f File.
* @param log Logger.
*/
- @throws[NCException]
def readFileBytes(f: File, log: Logger = logger): Array[Byte] =
try
val arr = new Array[Byte](f.length().toInt)
@@ -835,7 +831,6 @@ object NCUtils extends LazyLogging:
* @param path File path.
* @param log Logger.
*/
- @throws[NCException]
def gzipPath(path: String, log: Logger = logger): Unit = gzipFile(new
File(path), log)
/**
@@ -844,7 +839,6 @@ object NCUtils extends LazyLogging:
* @param f File.
* @param log Logger.
*/
- @throws[NCException]
def gzipFile(f: File, log: Logger = logger): Unit =
val gz = s"${f.getAbsolutePath}.gz"
@@ -881,7 +875,6 @@ object NCUtils extends LazyLogging:
* @param enc Encoding.
* @param log Logger to use.
*/
- @throws[NCException]
def readResource(res: String, enc: String = "UTF-8", log: Logger =
logger): List[String] = readStream(getStream(res), enc, log)
/**
@@ -891,7 +884,6 @@ object NCUtils extends LazyLogging:
* @param enc Encoding.
* @param log Logger to use.
*/
- @throws[NCException]
def readStream(in: InputStream, enc: String = "UTF-8", log: Logger =
logger): List[String] =
mapStream(in, enc, log, _.map(p => p).toList)
@@ -903,7 +895,6 @@ object NCUtils extends LazyLogging:
* @param log Logger to use.
* @param mapper Function to read lines.
*/
- @throws[NCException]
def mapStream[T](in: InputStream, enc: String, log: Logger = logger,
mapper: Iterator[String] => T): T =
try
Using.resource(Source.fromInputStream(in, enc)) { src =>
@@ -928,7 +919,6 @@ object NCUtils extends LazyLogging:
* @param enc Encoding.
* @param log Logger to use.
*/
- @throws[NCException]
def readTextGzipResource(res: String, enc: String, log: Logger = logger):
List[String] =
try
Using.resource(Source.fromInputStream(new
GZIPInputStream(getStream(res)), enc)) { src =>
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserSpec.scala
index 2f92878..79665e7 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserSpec.scala
@@ -29,7 +29,8 @@ class NCOpenNlpTokenParserSpec {
new NCOpenNlpTokenParser(
"opennlp/en-token.bin",
"opennlp/en-pos-maxent.bin",
- "opennlp/en-lemmatizer.dict"
+ "opennlp/en-lemmatizer.dict",
+ new NCEnStopWordsFinder()
)
parser.start()