This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-469
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-469 by this push:
new 42c6826 WIP.
42c6826 is described below
commit 42c6826aef2598dd79fc28b8e03bb58138974a71
Author: Sergey Kamov <[email protected]>
AuthorDate: Wed Dec 22 23:19:07 2021 +0300
WIP.
---
.../token/parser/opennlp/NCEnStopWordsFinder.java | 66 -----------
.../token/parser/opennlp/NCOpenNlpTokenParser.java | 56 +++++++++-
.../token/parser/opennlp/NCStopWordsFinder.java | 35 ------
...sFinderImpl.scala => NCEnStopWordsFinder.scala} | 124 +++++++++------------
.../token/parser/opennlp/impl/NCOpenNlpImpl.scala | 73 ++++++++----
.../opennlp/NCOpenNlpTokenParserBenchmark.java | 8 +-
.../parser/opennlp/NCOpenNlpTokenParserSpec.scala | 10 +-
7 files changed, 159 insertions(+), 213 deletions(-)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnStopWordsFinder.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnStopWordsFinder.java
deleted file mode 100644
index 80d11e3..0000000
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnStopWordsFinder.java
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nlpcraft.internal.nlp.token.parser.opennlp;
-
-import org.apache.nlpcraft.NCToken;
-import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl.NCEnStopWordsFinderImpl;
-
-import java.util.Collections;
-import java.util.Set;
-import java.util.List;
-
-/**
- *
- */
-public class NCEnStopWordsFinder implements NCStopWordsFinder {
- private final NCEnStopWordsFinderImpl impl;
-
- /**
- *
- */
- public NCEnStopWordsFinder() {
- this(Collections.emptySet(), Collections.emptySet());
- }
-
- /**
- *
- * @param addStopWords
- * @param exclStopWords
- */
- public NCEnStopWordsFinder(Set<String> addStopWords, Set<String>
exclStopWords) {
- impl = new NCEnStopWordsFinderImpl(
- addStopWords == null ? Collections.emptySet() : addStopWords,
- exclStopWords == null ? Collections.emptySet() : exclStopWords
- );
- }
-
- @Override
- public void start() {
- impl.start();
- }
-
- @Override
- public void stop() {
- impl.stop();
- }
-
- @Override
- public List<NCToken> find(List<NCToken> sen) {
- return impl.find(sen);
- }
-}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java
index 7e6a639..8213e78 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java
@@ -26,6 +26,7 @@ import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl.NCOpenNlpImpl;
import java.io.File;
import java.util.List;
import java.util.Objects;
+import java.util.Set;
/*
* Models can be downloaded from the following resources:
@@ -37,6 +38,7 @@ import java.util.Objects;
*
*/
public class NCOpenNlpTokenParser implements NCTokenParser {
+
private final NCOpenNlpImpl impl;
@Override
@@ -50,20 +52,31 @@ public class NCOpenNlpTokenParser implements NCTokenParser {
}
/**
+ * TODO: defaults.
+ * @throws NCException
+ */
+ public NCOpenNlpTokenParser() {
+ this(
+ "opennlp/en-token.bin",
+ "opennlp/en-pos-maxent.bin",
+ "opennlp/en-lemmatizer.dict"
+ );
+ }
+
+ /**
*
* @param tokMdl
* @param posMdl
* @param lemmaDic
- * @param swFinder
* @throws NCException
*/
- public NCOpenNlpTokenParser(File tokMdl, File posMdl, File lemmaDic,
NCStopWordsFinder swFinder) {
+ public NCOpenNlpTokenParser(File tokMdl, File posMdl, File lemmaDic) {
        Objects.requireNonNull(tokMdl, "Tokenizer model cannot be null");
Objects.requireNonNull(posMdl, "POS model cannot be null");
Objects.requireNonNull(lemmaDic, "Lemmatizer model cannot be null");
try {
- impl = NCOpenNlpImpl.apply(tokMdl, posMdl, lemmaDic, swFinder);
+ impl = NCOpenNlpImpl.apply(tokMdl, posMdl, lemmaDic);
}
catch (Exception e) {
throw new NCException("Failed to create OpenNLP token parser.", e);
@@ -75,22 +88,53 @@ public class NCOpenNlpTokenParser implements NCTokenParser {
* @param tokMdlSrc Local filesystem path, resources file path or URL for
OpenNLP tokenizer model.
* @param posMdlSrc Local filesystem path, resources file path or URL for
OpenNLP tagger model.
* @param lemmaDicSrc Local filesystem path, resources file path or URL
for OpenNLP lemmatizer dictionary.
- * @param swFinder
* @throws NCException
*/
- public NCOpenNlpTokenParser(String tokMdlSrc, String posMdlSrc, String
lemmaDicSrc, NCStopWordsFinder swFinder) {
+ public NCOpenNlpTokenParser(String tokMdlSrc, String posMdlSrc, String
lemmaDicSrc) {
        Objects.requireNonNull(tokMdlSrc, "Tokenizer model cannot be null");
Objects.requireNonNull(posMdlSrc, "POS model cannot be null");
Objects.requireNonNull(lemmaDicSrc, "Lemmatizer model cannot be null");
try {
- impl = NCOpenNlpImpl.apply(tokMdlSrc, posMdlSrc, lemmaDicSrc,
swFinder);
+ impl = NCOpenNlpImpl.apply(tokMdlSrc, posMdlSrc, lemmaDicSrc);
}
catch (Exception e) {
throw new NCException("Failed to create OpenNLP token parser.", e);
}
}
+ /**
+ *
+ * @return
+ */
+ public Set<String> getAdditionalStopWords() {
+ return impl.getAdditionalStopWords();
+ }
+
+ /**
+ *
+ * @param addStopWords
+ */
+ public void setAdditionalStopWords(Set<String> addStopWords) {
+ impl.setAdditionalStopWords(addStopWords);
+ }
+
+ /**
+ *
+ * @return
+ */
+    public Set<String> getExcludedStopWords() {
+        return impl.getExcludedStopWords();
+    }
+
+ /**
+ *
+ * @param exclStopWords
+ */
+ public void setExcludedStopWords(Set<String> exclStopWords) {
+ impl.setExcludedStopWords(exclStopWords);
+ }
+
@Override
public List<NCToken> parse(NCRequest req) {
return impl.parse(req);
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCStopWordsFinder.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCStopWordsFinder.java
deleted file mode 100644
index 8a8a6d2..0000000
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCStopWordsFinder.java
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nlpcraft.internal.nlp.token.parser.opennlp;
-
-import org.apache.nlpcraft.NCLifecycle;
-import org.apache.nlpcraft.NCToken;
-
-import java.util.List;
-
-/**
- *
- */
-public interface NCStopWordsFinder extends NCLifecycle {
- /**
- *
- * @param sen
- * @return
- */
- List<NCToken> find(List<NCToken> sen);
-}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinderImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
similarity index 89%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinderImpl.scala
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
index 01f8d13..db36d9a 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinderImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
@@ -19,20 +19,21 @@ package
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl
import com.typesafe.scalalogging.LazyLogging
import opennlp.tools.stemmer.PorterStemmer
-import org.apache.nlpcraft.internal.nlp.token.parser.opennlp.NCStopWordsFinder
-import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl.NCEnStopWordsFinderImpl.*
+import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl.NCEnStopWordsFinder.*
import org.apache.nlpcraft.internal.util.NCUtils
import org.apache.nlpcraft.{NCException, NCParameterizedAdapter, NCToken}
-import scala.jdk.CollectionConverters.SetHasAsScala
import java.util
+import java.util.{List as JList, Set as JSet}
import scala.annotation.tailrec
import scala.collection.{Seq, mutable}
-import java.util.Set as JSet
-import java.util.List as JList
import scala.concurrent.ExecutionContext
+import scala.jdk.CollectionConverters.SetHasAsScala
-object NCEnStopWordsFinderImpl:
+/**
+ *
+ */
+private[impl] object NCEnStopWordsFinder:
// Condition types.
type Wildcard = (String, String)
type Word = String
@@ -183,23 +184,47 @@ object NCEnStopWordsFinderImpl:
private def toValueKey(toks: Seq[NCToken]): String =
toks.map(_.getOriginalText.toLowerCase).mkString(" ")
private def toOriginalKey(toks: Seq[NCToken]): String =
toks.map(_.getOriginalText).mkString(" ")
-import NCEnStopWordsFinderImpl.*
-
-class NCEnStopWordsFinderImpl(addStopWords: JSet[String], exclStopWords:
JSet[String]) extends NCStopWordsFinder with LazyLogging:
- require(addStopWords != null)
- require(exclStopWords != null)
+import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl.NCEnStopWordsFinder.*
- private val addStopWordsStems = addStopWords.asScala
- private val exclStopWordsStems = exclStopWords.asScala
+private[impl] class NCEnStopWordsFinder(addStopWordsStems: Set[String],
exclStopWordsStems: Set[String]) extends LazyLogging:
+ require(addStopWordsStems != null)
+ require(exclStopWordsStems != null)
private val stemmer = new PorterStemmer
- @volatile private var percents: Set[String] = _
+ private val percents = Set(
+ "%",
+ "pct",
+ "pc",
+ "percentage",
+ "proportion",
+ "interest",
+ "rate",
+ "percent"
+ ).map(stemmer.stem)
+
@volatile private var possessiveWords: Set[String] = _
@volatile private var firstWords: Set[String] = _
@volatile private var nounWords: Set[String] = _
- @volatile private var stopWords: StopWordHolder = _
- @volatile private var exceptions: StopWordHolder = _
+
+    // Stemming is already done by the generator.
+ NCUtils.executeParallel(
+ () => possessiveWords = read("stopwords/possessive_words.txt.gz"),
+ () => firstWords = read("stopwords/first_words.txt.gz"),
+ () => nounWords = read("stopwords/noun_words.txt.gz")
+ )(ExecutionContext.Implicits.global)
+
+ // Case sensitive.
+ private val (stopWords, exceptions) =
+ val m =
+ readStopWords(
+ NCUtils.readResource("stopwords/stop_words.txt", "UTF-8",
logger).
+ map(_.strip).filter(s => s.nonEmpty && !s.startsWith("#"))
+ )
+ (m(false), m(true))
+
+
+ private def read(path: String): Set[String] =
NCUtils.readTextGzipResource(path, "UTF-8", logger).toSet
/**
* Parses configuration template.
@@ -451,15 +476,7 @@ class NCEnStopWordsFinderImpl(addStopWords: JSet[String],
exclStopWords: JSet[St
*
* @param toks
*/
- override def find(toks: JList[NCToken]): JList[NCToken] =
- // TODO: check started? clear on stop?
- if (percents == null)
- throw new IllegalStateException(s"${this.getClass.getName} is not
started.")
-
- import scala.jdk.CollectionConverters.*
-
- val ns = toks.asScala
-
+ def find(toks: Seq[NCToken]): Seq[NCToken] =
// Stop words and exceptions caches for this sentence.
val cacheSw = mutable.HashMap.empty[Seq[NCToken], Boolean]
val cacheEx = mutable.HashMap.empty[Seq[NCToken], Boolean]
@@ -469,7 +486,7 @@ class NCEnStopWordsFinderImpl(addStopWords: JSet[String],
exclStopWords: JSet[St
val stops = mutable.HashSet.empty[NCToken]
- for (p <- ns.zipWithIndex)
+ for (p <- toks.zipWithIndex)
val tok = p._1
val idx = p._2
val pos = tok.getPos
@@ -477,10 +494,10 @@ class NCEnStopWordsFinderImpl(addStopWords: JSet[String],
exclStopWords: JSet[St
val stem = tok.getStem
def isFirst: Boolean = idx == 0
- def isLast: Boolean = idx == ns.length - 1
+ def isLast: Boolean = idx == toks.length - 1
- def next(): NCToken = ns(idx + 1)
- def prev(): NCToken = ns(idx - 1)
+ def next(): NCToken = toks(idx + 1)
+ def prev(): NCToken = toks(idx - 1)
def isCommonVerbs(firstVerb: String, secondVerb: String): Boolean =
isVerb(pos) && lemma == secondVerb ||
@@ -512,7 +529,7 @@ class NCEnStopWordsFinderImpl(addStopWords: JSet[String],
exclStopWords: JSet[St
// | Find all words from predefined list. |
// +--------------------------------------+
val buf = mutable.Buffer.empty[Seq[NCToken]]
- val mix = NCUtils.tokenMixWithStopWords(ns)
+ val mix = NCUtils.tokenMixWithStopWords(toks)
for (toks <- mix if !buf.exists(_.containsSlice(toks)) && isStop(toks)
&& !isException(toks))
toks.foreach(tok => stops += tok)
@@ -537,7 +554,7 @@ class NCEnStopWordsFinderImpl(addStopWords: JSet[String],
exclStopWords: JSet[St
val foundKeys = new mutable.HashSet[String]()
// All sentence first stop words + first non stop word.
- val startToks = ns.takeWhile(_.isStopWord) ++
ns.find(!_.isStopWord).map(p => p)
+ val startToks = toks.takeWhile(_.isStopWord) ++
toks.find(!_.isStopWord).map(p => p)
for (startTok <- startToks; tup <- origToks.filter(_._1.head ==
startTok); key = tup._2
if firstWords.contains(key) && !isException(tup._1))
tup._1.foreach(tok => stops += tok)
@@ -558,13 +575,13 @@ class NCEnStopWordsFinderImpl(addStopWords: JSet[String],
exclStopWords: JSet[St
// | Pass #6. |
// | Mark words with POSes before stop-words. |
// +-------------------------------------------------+
- markBefore(ns, STOP_BEFORE_STOP, ns.size - 1, isException, stops)
+ markBefore(toks, STOP_BEFORE_STOP, toks.size - 1, isException, stops)
// +-------------------------------------------------+
// | Pass #7. |
// | Processing additional and excluded stop words. |
// +-------------------------------------------------+
- for (t <- ns if addStopWordsStems.contains(t.getStem))
+ for (t <- toks if addStopWordsStems.contains(t.getStem))
stops += t
for (t <- stops.filter(t => exclStopWordsStems.contains(t.getStem)))
@@ -576,16 +593,16 @@ class NCEnStopWordsFinderImpl(addStopWords: JSet[String],
exclStopWords: JSet[St
// | configured list, which also placed before |
// | another stop words. |
// +-------------------------------------------------+
- processCommonStops(ns, stops)
+ processCommonStops(toks, stops)
// +-------------------------------------------------+
// | Pass #9. |
// | Deletes stop words if they are marked as quoted.|
// +-------------------------------------------------+
- val quotes = ns.filter(isQuote)
+ val quotes = toks.filter(isQuote)
if (quotes.nonEmpty && quotes.size % 2 == 0)
- val m = ns.zipWithIndex.toMap
+ val m = toks.zipWithIndex.toMap
val pairs =
quotes.zipWithIndex.
@@ -600,37 +617,6 @@ class NCEnStopWordsFinderImpl(addStopWords: JSet[String],
exclStopWords: JSet[St
})
else
// TODO:
- logger.debug(s"Unexpected quotes count, stop words processing
updating skipped for text: ${ns.map(_.getOriginalText).mkString(" ")}")
-
- stops.toSeq.sortBy(_.getStartCharIndex).asJava
-
- override def start(): Unit =
- percents = Set(
- "%",
- "pct",
- "pc",
- "percentage",
- "proportion",
- "interest",
- "rate",
- "percent"
- ).map(stemmer.stem)
-
- def read(path: String): Set[String] =
NCUtils.readTextGzipResource(path, "UTF-8", logger).toSet
-
- // Stemmatization is done already by generator.
- NCUtils.executeParallel(
- () => possessiveWords = read("stopwords/possessive_words.txt.gz"),
- () => firstWords = read("stopwords/first_words.txt.gz"),
- () => nounWords = read("stopwords/noun_words.txt.gz")
- )(ExecutionContext.Implicits.global)
-
- // Case sensitive.
- val m =
- readStopWords(
- NCUtils.readResource("stopwords/stop_words.txt", "UTF-8",
logger).
- map(_.strip).filter(s => s.nonEmpty && !s.startsWith("#"))
- )
+ logger.debug(s"Unexpected quotes count, stop words processing
updating skipped for text: ${toks.map(_.getOriginalText).mkString(" ")}")
- stopWords = m(false)
- exceptions = m(true)
\ No newline at end of file
+ stops.toSeq.sortBy(_.getStartCharIndex)
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala
index 3dabaf0..6451d09 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala
@@ -17,19 +17,17 @@
package org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl
-import org.apache.nlpcraft.*
-
-import java.io.*
-import java.util.{Collections, List as JList, Set as JSet}
import opennlp.tools.lemmatizer.*
import opennlp.tools.postag.*
import opennlp.tools.stemmer.*
import opennlp.tools.tokenize.*
-import org.apache.nlpcraft.internal.nlp.token.parser.opennlp.NCStopWordsFinder
+import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.util.NCUtils
+import java.io.*
import java.util
import java.util.stream.Collectors
+import java.util.{Collections, List as JList, Set as JSet}
import scala.concurrent.ExecutionContext
import scala.jdk.CollectionConverters.*
@@ -39,26 +37,22 @@ object NCOpenNlpImpl:
* @param tokMdlSrc Local filesystem path, resources file path or URL for
OpenNLP tokenizer model.
* @param posMdlSrc Local filesystem path, resources file path or URL for
OpenNLP tagger model.
* @param lemmaDicSrc Local filesystem path, resources file path or URL
for OpenNLP lemmatizer dictionary.
- * @param swFinder Stop words finder.
* @return
*/
- def apply(tokMdlSrc: String, posMdlSrc: String, lemmaDicSrc: String,
swFinder: NCStopWordsFinder): NCOpenNlpImpl =
- new NCOpenNlpImpl(
- NCUtils.getStream(tokMdlSrc), NCUtils.getStream(posMdlSrc),
NCUtils.getStream(lemmaDicSrc), swFinder
- )
+ def apply(tokMdlSrc: String, posMdlSrc: String, lemmaDicSrc: String):
NCOpenNlpImpl =
+ new NCOpenNlpImpl(NCUtils.getStream(tokMdlSrc),
NCUtils.getStream(posMdlSrc), NCUtils.getStream(lemmaDicSrc))
/**
*
* @param tokMdlFile Local file for OpenNLP tokenizer model.
* @param posMdlFile Local file for OpenNLP tagger model.
* @param lemmaDicFile Local file for OpenNLP lemmatizer dictionary.
- * @param swFinder Stop words finder.
* @return
*/
- def apply(tokMdlFile: File, posMdlFile: File, lemmaDicFile: File,
swFinder: NCStopWordsFinder): NCOpenNlpImpl =
+ def apply(tokMdlFile: File, posMdlFile: File, lemmaDicFile: File):
NCOpenNlpImpl =
def toStream(f: File) = new BufferedInputStream(new FileInputStream(f))
- new NCOpenNlpImpl(toStream(tokMdlFile), toStream(posMdlFile),
toStream(lemmaDicFile), swFinder)
+ new NCOpenNlpImpl(toStream(tokMdlFile), toStream(posMdlFile),
toStream(lemmaDicFile))
/**
*
@@ -66,22 +60,59 @@ object NCOpenNlpImpl:
* @param posMdlIn
* @param lemmaDicIn
*/
-class NCOpenNlpImpl(tokMdlIn: InputStream, posMdlIn: InputStream, lemmaDicIn:
InputStream, swFinder: NCStopWordsFinder) extends NCTokenParser :
+class NCOpenNlpImpl(
+ tokMdlIn: InputStream,
+ posMdlIn: InputStream,
+ lemmaDicIn: InputStream
+) extends NCTokenParser :
private val stemmer = new PorterStemmer
@volatile var tokenizer: TokenizerME = _
@volatile var tagger: POSTaggerME = _
@volatile var lemmatizer: DictionaryLemmatizer = _
+ @volatile var sw: NCEnStopWordsFinder = _
+
+ private var addStopWords: JSet[String] = _
+ private var exclStopWords: JSet[String] = _
override def start(): Unit =
NCUtils.executeParallel(
() => tokenizer = new TokenizerME(new TokenizerModel(tokMdlIn)),
() => tagger = new POSTaggerME(new POSModel(posMdlIn)),
() => lemmatizer = new DictionaryLemmatizer(lemmaDicIn),
- () => if (swFinder != null) swFinder.start()
+ () => sw = new NCEnStopWordsFinder(stem(addStopWords),
stem(exclStopWords))
)(ExecutionContext.Implicits.global)
- override def stop(): Unit = if (swFinder != null) swFinder.stop()
+ /**
+ *
+ * @param addStopWords
+ */
+ def setAdditionalStopWords(addStopWords: JSet[String]): Unit =
this.addStopWords = addStopWords
+
+ /**
+ *
+ * @return
+ */
+ def getAdditionalStopWords(): JSet[String] = this.addStopWords
+
+ /**
+ *
+ * @param exclStopWords
+ */
+ def setExcludedStopWords(exclStopWords: JSet[String]): Unit =
this.exclStopWords = exclStopWords
+
+ /**
+ *
+ * @return
+ */
+    def getExcludedStopWords(): JSet[String] = this.exclStopWords
+
+ /**
+ *
+ * @param set
+ */
+ private def stem(set: JSet[String]): Set[String] =
+ if (set == null) Set.empty else set.asScala.toSet.map(stemmer.stem)
/**
*
@@ -142,12 +173,10 @@ class NCOpenNlpImpl(tokMdlIn: InputStream, posMdlIn:
InputStream, lemmaDicIn: In
override def getLength: Int = h.length
}
- val resJava = res.asJava
- val stops = if (swFinder != null) swFinder.find(resJava) else null
- val stopsSet = if (stops != null) new util.HashSet(stops) else
Collections.emptySet
+ val stops = sw.find(res)
- resJava.stream().map(tok =>
- if (stopsSet.contains(tok))
+ res.map(tok =>
+ if (stops.contains(tok))
new NCParameterizedAdapter with NCToken:
override def getOriginalText: String =
tok.getOriginalText
override def getNormalizedText: String =
tok.getNormalizedText
@@ -160,5 +189,5 @@ class NCOpenNlpImpl(tokMdlIn: InputStream, posMdlIn:
InputStream, lemmaDicIn: In
override def getLength: Int = tok.getLength
else
tok
- ).collect(Collectors.toList)
+ ).asJava
}
\ No newline at end of file
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/benchmark/token/parser/opennlp/NCOpenNlpTokenParserBenchmark.java
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/benchmark/token/parser/opennlp/NCOpenNlpTokenParserBenchmark.java
index b7501eb..ad6ac79 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/benchmark/token/parser/opennlp/NCOpenNlpTokenParserBenchmark.java
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/benchmark/token/parser/opennlp/NCOpenNlpTokenParserBenchmark.java
@@ -18,7 +18,6 @@
package org.apache.nlpcraft.internal.nlp.benchmark.token.parser.opennlp;
import org.apache.nlpcraft.internal.nlp.benchmark.NCBenchmarkAdapter;
-import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.NCEnStopWordsFinder;
import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.NCOpenNlpTokenParser;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.Setup;
@@ -47,12 +46,7 @@ public class NCOpenNlpTokenParserBenchmark extends
NCBenchmarkAdapter {
* @return
*/
private static NCOpenNlpTokenParser prepareParser() {
- NCOpenNlpTokenParser p = new NCOpenNlpTokenParser(
- "opennlp/en-token.bin",
- "opennlp/en-pos-maxent.bin",
- "opennlp/en-lemmatizer.dict",
- new NCEnStopWordsFinder()
- );
+ NCOpenNlpTokenParser p = new NCOpenNlpTokenParser();
p.start();
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserSpec.scala
index 88c919c..f6d520a 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserSpec.scala
@@ -35,13 +35,7 @@ class NCOpenNlpTokenParserSpec:
def start(): Unit =
val t1 = System.currentTimeMillis()
- parser =
- new NCOpenNlpTokenParser(
- "opennlp/en-token.bin",
- "opennlp/en-pos-maxent.bin",
- "opennlp/en-lemmatizer.dict",
- new NCEnStopWordsFinder()
- )
+ parser = new NCOpenNlpTokenParser()
val t2 = System.currentTimeMillis()
@@ -94,7 +88,7 @@ class NCOpenNlpTokenParserSpec:
validate(res)
@Test
- def benchmark(): Unit =
+ def test(): Unit =
test(
"Test requests!",
toks =>