This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-469
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-469 by this push:
new 60db649 WIP.
60db649 is described below
commit 60db6493b2367d5c8e4e4e4e70645f0d3664fd03
Author: Sergey Kamov <[email protected]>
AuthorDate: Wed Dec 22 12:19:28 2021 +0300
WIP.
---
.../parser/opennlp/impl/NCEnStopWordsFinderImpl.scala | 13 +++++++++----
.../nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala | 19 +++++++++----------
.../org/apache/nlpcraft/internal/util/NCUtils.scala | 11 ++++++++++-
3 files changed, 28 insertions(+), 15 deletions(-)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinderImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinderImpl.scala
index b7fb2f3..01f8d13 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinderImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinderImpl.scala
@@ -23,13 +23,14 @@ import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.NCStopWordsFinder
import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl.NCEnStopWordsFinderImpl.*
import org.apache.nlpcraft.internal.util.NCUtils
import org.apache.nlpcraft.{NCException, NCParameterizedAdapter, NCToken}
-import scala.jdk.CollectionConverters.SetHasAsScala
+import scala.jdk.CollectionConverters.SetHasAsScala
import java.util
import scala.annotation.tailrec
import scala.collection.{Seq, mutable}
import java.util.Set as JSet
import java.util.List as JList
+import scala.concurrent.ExecutionContext
object NCEnStopWordsFinderImpl:
// Condition types.
@@ -615,10 +616,14 @@ class NCEnStopWordsFinderImpl(addStopWords: JSet[String],
exclStopWords: JSet[St
"percent"
).map(stemmer.stem)
+ def read(path: String): Set[String] =
NCUtils.readTextGzipResource(path, "UTF-8", logger).toSet
+
// Stemmatization is done already by generator.
- possessiveWords =
NCUtils.readTextGzipResource("stopwords/possessive_words.txt.gz", "UTF-8",
logger).toSet
- firstWords =
NCUtils.readTextGzipResource("stopwords/first_words.txt.gz", "UTF-8",
logger).toSet
- nounWords =
NCUtils.readTextGzipResource("stopwords/noun_words.txt.gz", "UTF-8",
logger).toSet
+ NCUtils.executeParallel(
+ () => possessiveWords = read("stopwords/possessive_words.txt.gz"),
+ () => firstWords = read("stopwords/first_words.txt.gz"),
+ () => nounWords = read("stopwords/noun_words.txt.gz")
+ )(ExecutionContext.Implicits.global)
// Case sensitive.
val m =
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala
index 0f54e7e..3dabaf0 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala
@@ -30,6 +30,7 @@ import org.apache.nlpcraft.internal.util.NCUtils
import java.util
import java.util.stream.Collectors
+import scala.concurrent.ExecutionContext
import scala.jdk.CollectionConverters.*
object NCOpenNlpImpl:
@@ -73,16 +74,14 @@ class NCOpenNlpImpl(tokMdlIn: InputStream, posMdlIn:
InputStream, lemmaDicIn: In
@volatile var lemmatizer: DictionaryLemmatizer = _
override def start(): Unit =
- tokenizer = new TokenizerME(new TokenizerModel(tokMdlIn))
- tagger = new POSTaggerME(new POSModel(posMdlIn))
- lemmatizer = new DictionaryLemmatizer(lemmaDicIn)
-
- if (swFinder != null)
- swFinder.start()
-
- override def stop(): Unit =
- if (swFinder != null)
- swFinder.stop()
+ NCUtils.executeParallel(
+ () => tokenizer = new TokenizerME(new TokenizerModel(tokMdlIn)),
+ () => tagger = new POSTaggerME(new POSModel(posMdlIn)),
+ () => lemmatizer = new DictionaryLemmatizer(lemmaDicIn),
+ () => if (swFinder != null) swFinder.start()
+ )(ExecutionContext.Implicits.global)
+
+ override def stop(): Unit = if (swFinder != null) swFinder.stop()
/**
*
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
index 9c13081..3bacd33 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
@@ -36,8 +36,9 @@ import scala.sys.SystemProperties
import scala.util.Using
import scala.util.control.Exception.ignoring
import scala.io.BufferedSource
+
/**
- *
+ * TODO: add logging to all file-reading methods.
*/
object NCUtils extends LazyLogging:
final val NL = System getProperty "line.separator"
@@ -927,6 +928,14 @@ object NCUtils extends LazyLogging:
catch
case e: IOException => throw new NCException(s"Failed to read
stream.", e)
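+ /**
+ * Starts every body as a `Future` on `ec`, then blocks (no timeout) until each has finished.
+ * @param bodies Tasks to run; their return values are discarded.
+ * @param ec Execution context the tasks are submitted to.
+ */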
+ def executeParallel(bodies: (() => Any)*)(ec: ExecutionContext): Unit =
+ bodies.map(body => Future { body() } (ec)).foreach(Await.result(_,
Duration.Inf))
+
// TODO: is it suitable place for methods related to tokens manipulations?
/**
* Gets all sequential permutations of tokens in this NLP sentence.