This is an automated email from the ASF dual-hosted git repository.
aradzinski pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-520 by this push:
new 59aa488f WIP
59aa488f is described below
commit 59aa488fd3c2806940926e72cba26c96dd7d58bd
Author: Aaron Radzinski <[email protected]>
AuthorDate: Tue Dec 13 16:12:19 2022 -0800
WIP
---
.../src/main/resources/stopwords/stop_words.txt | 2 +-
.../org/apache/nlpcraft/NCPipelineBuilder.scala | 6 +-
.../nlp/enrichers/NCDictionaryTokenEnricher.scala | 2 +-
.../nlp/enrichers/NCEnStopWordsTokenEnricher.scala | 117 ++++++++++-----------
.../apache/nlpcraft/nlp/stemmer/NCEnStemmer.scala | 5 +-
.../nlp/parsers/NCOpenNLPTokenParserSpec.scala | 4 +-
6 files changed, 66 insertions(+), 70 deletions(-)
diff --git a/nlpcraft/src/main/resources/stopwords/stop_words.txt
b/nlpcraft/src/main/resources/stopwords/stop_words.txt
index 5644efd4..1cff382e 100644
--- a/nlpcraft/src/main/resources/stopwords/stop_words.txt
+++ b/nlpcraft/src/main/resources/stopwords/stop_words.txt
@@ -15,7 +15,7 @@
# limitations under the License.
#
-# Basic predefined stop-words.
+# Basic predefined stopwords.
#
# Configuration contains:
# - Words (processed as stem)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
index 69f1b6d3..08a3886e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
@@ -235,7 +235,7 @@ class NCPipelineBuilder:
* and
*
[[https://raw.githubusercontent.com/richardwilly98/elasticsearch-opennlp-auto-tagging/master/src/main/resources/models/en-lemmatizer.dict
en-lemmatizer.dict]] model for
*
[[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/lemmatizer/DictionaryLemmatizer.html
DictionaryLemmatizer]].
- * - [[NCEnStopWordsTokenEnricher Stop-word]] token enricher.
+ * - [[NCEnStopWordsTokenEnricher Stopword]] token enricher.
* - [[NCSwearWordsTokenEnricher Swear-word]] token enricher initialized
by
*
[[https://raw.githubusercontent.com/apache/incubator-nlpcraft/external_config/external/badfilter/swear_words.txt
swear_words.txt]] dictionary.
* - [[NCQuotesTokenEnricher Quotes]] token enricher.
@@ -277,7 +277,7 @@ class NCPipelineBuilder:
* and
*
[[https://raw.githubusercontent.com/richardwilly98/elasticsearch-opennlp-auto-tagging/master/src/main/resources/models/en-lemmatizer.dict
en-lemmatizer.dict]] model for
*
[[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/lemmatizer/DictionaryLemmatizer.html
DictionaryLemmatizer]].
- * - [[NCEnStopWordsTokenEnricher Stop-word]] token enricher.
+ * - [[NCEnStopWordsTokenEnricher Stopword]] token enricher.
* - [[NCSwearWordsTokenEnricher Swear-word]] token enricher initialized
by
*
[[https://raw.githubusercontent.com/apache/incubator-nlpcraft/external_config/external/badfilter/swear_words.txt
swear_words.txt]] dictionary.
* - [[NCQuotesTokenEnricher Quotes]] token enricher.
@@ -306,7 +306,7 @@ class NCPipelineBuilder:
* and
*
[[https://raw.githubusercontent.com/richardwilly98/elasticsearch-opennlp-auto-tagging/master/src/main/resources/models/en-lemmatizer.dict
en-lemmatizer.dict]] model for
*
[[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/lemmatizer/DictionaryLemmatizer.html
DictionaryLemmatizer]].
- * - [[NCEnStopWordsTokenEnricher Stop-word]] token enricher.
+ * - [[NCEnStopWordsTokenEnricher Stopword]] token enricher.
* - [[NCSwearWordsTokenEnricher Swear-word]] token enricher initialized
by
*
[[https://raw.githubusercontent.com/apache/incubator-nlpcraft/external_config/external/badfilter/swear_words.txt
swear_words.txt]] dictionary.
* - [[NCQuotesTokenEnricher Quotes]] token enricher.
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
index cf17817e..697fcbe1 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
@@ -33,7 +33,7 @@ import org.apache.nlpcraft.internal.util.NCUtils
* metadata property before this enricher in your [[NCPipeline pipeline]].
*
* @param dictRes Relative path, absolute path or URL to the dictionary file.
The dictionary should have a simple
- * plain text format with *one lemma per line* with no empty line,
header or comments allowed.
+ * plain text format with *one lemma per line*; no empty lines,
headers, or comments are allowed.
*/
//noinspection DuplicatedCode,ScalaWeakerAccess
class NCDictionaryTokenEnricher(dictRes: String) extends NCTokenEnricher:
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
index 3fc1e57a..64137cda 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
@@ -29,9 +29,9 @@ import scala.collection.*
import scala.concurrent.ExecutionContext
/**
- * [[NCEnStopWordsTokenEnricher]] helper.
+ * Companion helper.
*/
-object NCEnStopWordsTokenEnricher:
+private object NCEnStopWordsTokenEnricher:
// Condition types.
private type Wildcard = (String, String)
private type Word = String
@@ -104,14 +104,14 @@ object NCEnStopWordsTokenEnricher:
/**
* Gets all sequential permutations of tokens in this NLP sentence.
- * This method is like a 'tokenMix', but with all combinations of
stop-words (with and without)
+ * This method is like a 'tokenMix', but with all combinations of
stopwords (with and without)
*
* @param tokens Tokens.
* @param maxLen Maximum number of tokens in the sequence.
*/
private[enrichers] def tokenMixWithStopWords(tokens: Seq[NCToken], maxLen:
Int = Integer.MAX_VALUE): Seq[Seq[NCToken]] =
/**
- * Gets all combinations for sequence of mandatory tokens with
stop-words and without.
+ * Gets all combinations for sequence of mandatory tokens with
stopwords and without.
*
* Example:
* 'A (stop), B, C(stop) -> [A, B, C]; [A, B]; [B, C], [B]
@@ -163,31 +163,34 @@ object NCEnStopWordsTokenEnricher:
import org.apache.nlpcraft.nlp.enrichers.NCEnStopWordsTokenEnricher.*
/**
- * "Stop-word" [[NCTokenEnricher token enricher]] for English (EN) language.
Stop words are the words
+ * Stopword [[NCTokenEnricher token enricher]] for English (EN) language.
Stopwords are the words
* which are filtered out (i.e. stopped) before processing of natural
language text because they are
* insignificant.
*
* This enricher adds `stopword` boolean [[NCPropertyMap metadata]] property
to the [[NCToken token]]
- * instance if word it represents is an English stop-word. The value `true`
of the metadata property indicates that
- * this word is detected as a stop-word, `false` value indicates otherwise.
+ * instance if the word it represents is an English stopword. The value
`true` of this metadata property indicates that
+ * this word is detected as a stopword, `false` value indicates otherwise.
This implementation works off the
+ * algorithm that uses an internal list of English stopwords as well as a
procedural logic to determine the stopword
+ * status of the token. This algorithm should work fine for most of the
general use cases. The user, however, can add
+ * additional stopwords or exceptions for the existing ones using
corresponding parameters in [[NCEnStopWordsTokenEnricher]]
+ * constructor.
*
- * More information about stop-words can be found at
[[https://en.wikipedia.org/wiki/Stop_word]].
+ * More information about stopwords can be found at
[[https://en.wikipedia.org/wiki/Stop_word]].
*
* **NOTE:** this implementation requires `lemma` and `pos` string
[[NCPropertyMap metadata]] properties that
- * contain token's lemma and part of speech. You can configure
[[NCOpenNLPTokenEnricher]] for English language
- * that provides this metadata properties before this enricher in your
[[NCPipeline pipeline]].
+ * contain token's lemma and part of speech accordingly. You can configure
[[NCOpenNLPTokenEnricher]] with the model
+ * for English language that would provide these metadata properties before
this enricher in your [[NCPipeline pipeline]].
*
- * @see [[NCEnStemmer]]
- *
- * @param addStopsSet User defined collection of additional stop-words.
- * These word will be tried to match based on `stemmer` implementation.
- * @param exclStopsSet User defined collection of exceptions, that is words
which should not be marked as stop-words during processing.
- * These word will be tried to match based on `stemmer` implementation.
- * @param stemmer English stemmer implementation.
+ * @param addSet User defined collection of additional stopwords. These words
will be stemmatized by the given `stemmer`
+ * before attempting to find a match. Default value is an empty set.
+ * @param exclSet User defined collection of exceptions, i.e. the words which
should not be marked as stopwords during
+ * processing. These words will be stemmatized by the given `stemmer`
before attempting to find a match.
+ * Default value is an empty set.
+ * @param stemmer English stemmer implementation. Default value is the
instance of [[org.apache.nlpcraft.nlp.stemmer.NCEnStemmer]].
*/
class NCEnStopWordsTokenEnricher(
- addStopsSet: Set[String] = Set.empty,
- exclStopsSet: Set[String] = Set.empty,
+ addSet: Set[String] = Set.empty,
+ exclSet: Set[String] = Set.empty,
stemmer: NCStemmer = new NCEnStemmer
) extends NCTokenEnricher with LazyLogging:
private var addStems: Set[String] = _
@@ -205,7 +208,7 @@ class NCEnStopWordsTokenEnricher(
private def toStemKey(toks: Seq[NCToken]): String =
toks.map(_.getText).map(stem).mkString(" ")
/**
- * Stop words holder, used for hash search.
+ * Stopword holder, used for hash search.
*
* @param any Any POSes container.
* @param includes Included by POS container.
@@ -224,7 +227,7 @@ class NCEnStopWordsTokenEnricher(
case _ => any.contains(s)
/**
- * Stop words holder, used for scanning.
+ * Stopword holder, used for scanning.
*
* @param any Any POSes container.
* @param includes Included by POS container.
@@ -263,7 +266,7 @@ class NCEnStopWordsTokenEnricher(
case _ => throw new AssertionError("Unexpected POS.")
/**
- * Stop words data holder.
+ * Stopword data holder.
*
* @param stems Stems data holder.
* @param lemmas Lemmas data holder.
@@ -286,18 +289,18 @@ class NCEnStopWordsTokenEnricher(
// Hash access.
stems.matches(toStemKey(toks), posOpt) ||
- lemmas.matches(toLemmaKey(toks), posOpt) ||
- origins.matches(toOriginalKey(toks), posOpt) ||
- // Scan access.
- wildcardsLemmas.matches(toLemmaKey(toks), posOpt) ||
- wildcardsOrigins.matches(toOriginalKey(toks), posOpt)
+ lemmas.matches(toLemmaKey(toks), posOpt) ||
+ origins.matches(toOriginalKey(toks), posOpt) ||
+ // Scan access.
+ wildcardsLemmas.matches(toLemmaKey(toks), posOpt) ||
+ wildcardsOrigins.matches(toOriginalKey(toks), posOpt)
/**
- *
+ *
*/
private def init(): Unit =
- addStems = if addStopsSet == null then Set.empty else
addStopsSet.map(stem)
- exclStems = if exclStopsSet == null then Set.empty else
exclStopsSet.map(stem)
+ addStems = if addSet == null then Set.empty else addSet.map(stem)
+ exclStems = if exclSet == null then Set.empty else exclSet.map(stem)
def check(name: String, set: Set[String]): Unit =
if set.exists(_.exists(_.isWhitespace)) then throw E(s"$name
contain a string with whitespaces.")
@@ -355,8 +358,8 @@ class NCEnStopWordsTokenEnricher(
case Some(set) => set.add(cond)
case _ =>
val set = mutable.HashSet.empty[T]
- set += cond
- m += pos -> set
+ set += cond
+ m += pos -> set
)
add(incls, incl = true)
@@ -370,7 +373,7 @@ class NCEnStopWordsTokenEnricher(
m += tuple._1 -> tuple._2
WordForm.values.foreach(f =>
add(f, mkT, isExc = true)
- add(f, mkT, isExc = false)
+ add(f, mkT, isExc = false)
)
m.toMap
@@ -416,7 +419,7 @@ class NCEnStopWordsTokenEnricher(
val isMultiWord = s.contains(' ')
// Confusing POSes.
- if poses.nonEmpty && isMultiWord then throwError("POSes cannot be
defined for multiple stop words.")
+ if poses.nonEmpty && isMultiWord then throwError("POSes cannot be
defined for multiple stopwords.")
var isCase = false
if s.head == '@' then
s = s.drop(1)
@@ -424,12 +427,11 @@ class NCEnStopWordsTokenEnricher(
if s.isEmpty then throwError("Empty word.")
isCase = true
val idxWild = s.indexOf("*")
- if idxWild >= 0 && isMultiWord then throwError("Wildcard cannot be
defined for multiple stop words.")
+ if idxWild >= 0 && isMultiWord then throwError("Wildcard cannot be
defined for multiple stopwords.")
if idxWild < 0 then
val (word, form) =
if isCase then (s, ORIG)
- else
- if !hasPoses then (stem(s), STEM) else (stem(s), LEM)
+ else if !hasPoses then (stem(s), STEM) else (stem(s), LEM)
mHash((isExc, form)).addCondition(word, poses)
else
val b = s.take(idxWild)
@@ -450,20 +452,18 @@ class NCEnStopWordsTokenEnricher(
val any = m((isExc, form)).any.toSet
val incl = toImmutable(m((isExc, form)).incls)
val excl = toImmutable(m((isExc, form)).excls)
+ mkInstance(any ++ excl.values.flatten, incl, excl)
- mkInstance(any ++ excl.values.flatten, incl, excl)
- end mkHolder
def mkHash(form: WordForm): HashHolder = mkHolder(mHash, form,
HashHolder.apply)
- def mkScan(form: WordForm):
- ScanHolder = mkHolder(mScan, form, ScanHolder.apply)
+ def mkScan(form: WordForm): ScanHolder = mkHolder(mScan, form,
ScanHolder.apply)
- isExc -> StopWordHolder(mkHash(STEM), mkHash(LEM),
mkHash(ORIG), mkScan(LEM), mkScan(ORIG))
+ isExc -> StopWordHolder(mkHash(STEM), mkHash(LEM), mkHash(ORIG),
mkScan(LEM), mkScan(ORIG))
).toMap
private def isVerb(pos: String): Boolean = pos.head == 'V'
/**
- * Marks words before stop words.
+ * Marks words before stopwords.
*
* @param ns Sentence.
* @param stopPoses Stop POSes.
@@ -480,12 +480,10 @@ class NCEnStopWordsTokenEnricher(
stops: mutable.HashSet[NCToken]
): Boolean =
var stop = true
-
for ((tok, idx) <- ns.zipWithIndex if idx != lastIdx &&
!isStopWord(tok) && !isException(Seq(tok)) &&
stopPoses.contains(getPos(tok)) && isStopWord(ns(idx + 1)))
stops += tok
stop = false
-
if stop then true else markBefore(ns, stopPoses, lastIdx, isException,
stops)
/**
@@ -504,11 +502,11 @@ class NCEnStopWordsTokenEnricher(
b
/**
- * Marks as stopwords, words with POS from configured list, which also
placed before another stop words.
+ * Marks as stopwords, words with POS from configured list, which are also
placed before another stopword.
*/
private def processCommonStops(ns: Seq[NCToken], stops:
mutable.HashSet[NCToken]): Unit =
/**
- * Marks as stopwords, words with POS from configured list, which
also placed before another stop words.
+ * Marks as stopwords, words with POS from configured list, which are
also placed before another stopword.
*/
@tailrec
def processCommonStops0(ns: Seq[NCToken]): Unit =
@@ -526,7 +524,7 @@ class NCEnStopWordsTokenEnricher(
/** @inheritdoc */
override def enrich(req: NCRequest, cfg: NCModelConfig, toks:
List[NCToken]): Unit =
- // Stop words and exceptions caches for this sentence.
+ // Stopword and exception caches for this sentence.
val cacheSw = mutable.HashMap.empty[Seq[NCToken], Boolean]
val cacheEx = mutable.HashMap.empty[Seq[NCToken], Boolean]
@@ -547,7 +545,7 @@ class NCEnStopWordsTokenEnricher(
def prev(): NCToken = toks(idx - 1)
def isCommonVerbs(firstVerb: String, secondVerb: String): Boolean =
isVerb(pos) && lemma == secondVerb ||
- (isVerb(pos) && lemma == firstVerb && !isLast &&
isVerb(getPos(next())) && getLemma(next()) == secondVerb)
+ (isVerb(pos) && lemma == firstVerb && !isLast &&
isVerb(getPos(next())) && getLemma(next()) == secondVerb)
// +---------------------------------+
// | Pass #1. |
@@ -579,7 +577,7 @@ class NCEnStopWordsTokenEnricher(
toks.foreach(tok => stops += tok)
buf += toks
- // Capture the token mix at this point minus the initial stop words
found up to this point.
+ // Capture the token mix at this point minus the initial stopwords
found up to this point.
val origToks: Seq[(Seq[NCToken], String)] =
(for (toks <- mix) yield toks.toSeq).map(s => s ->
toStemKey(s)).toSeq
@@ -590,7 +588,7 @@ class NCEnStopWordsTokenEnricher(
val foundKeys = new mutable.HashSet[String]()
- // All sentence first stop words + first non stop word.
+ // All sentence-initial stopwords + the first non-stopword.
val startToks = toks.takeWhile(isStopWord) ++ toks.find(p =>
!isStopWord(p)).map(p => p)
for (startTok <- startToks; tup <- origToks.filter(_._1.head ==
startTok); key = tup._2 if firstWords.contains(key) && !isException(tup._1))
tup._1.foreach(tok => stops += tok)
@@ -607,31 +605,28 @@ class NCEnStopWordsTokenEnricher(
// +-------------------------------------------------+
// | Pass #5. |
- // | Mark words with POSes before stop-words. |
+ // | Mark words with POSes before stopwords. |
// +-------------------------------------------------+
markBefore(toks, STOP_BEFORE_STOP, toks.size - 1, isException, stops)
// +-------------------------------------------------+
// | Pass #6. |
- // | Processing additional and excluded stop words. |
+ // | Processing additional and excluded stopwords. |
// +-------------------------------------------------+
- for (t <- toks if addStems.contains(stem(t.getText)))
- stops += t
-
- for (t <- stops.filter(t => exclStems.contains(stem(t.getText))))
- stops -= t
+ for (t <- toks if addStems.contains(stem(t.getText))) stops += t
+ for (t <- stops.filter(t => exclStems.contains(stem(t.getText))))
stops -= t
// +-------------------------------------------------+
// | Pass #7. |
// | Marks as stopwords, words with POS from |
// | configured list, which also placed before |
- // | another stop words. |
+ // | another stopword. |
// +-------------------------------------------------+
processCommonStops(toks, stops)
// +-------------------------------------------------+
// | Pass #8. |
- // | Deletes stop words if they are marked as quoted.|
+ // | Deletes stopwords if they are marked as quoted.|
// +-------------------------------------------------+
var quotes = toks.filter(isQuote)
@@ -651,7 +646,7 @@ class NCEnStopWordsTokenEnricher(
// +-------------------------------------------------+
// | Pass #9. |
- // | Deletes stop words if they are brackets. |
+ // | Deletes stopwords if they are brackets. |
// +-------------------------------------------------+
val stack = new java.util.Stack[String]()
val set = mutable.HashSet.empty[NCToken]
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/stemmer/NCEnStemmer.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/stemmer/NCEnStemmer.scala
index cf0dd30d..b794f931 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/stemmer/NCEnStemmer.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/stemmer/NCEnStemmer.scala
@@ -22,8 +22,9 @@ import org.apache.nlpcraft.nlp.parsers.*
/**
* Stemmer implementation for the English language that delegates to
- * [[https://opennlp.apache.org/ OpenNLP]] Porter Stemmer. You can find more
information about this
- * stemmer algorithm at [[https://tartarus.org/martin/PorterStemmer]].
+ * [[https://opennlp.apache.org/ OpenNLP]] Porter Stemmer.
+ *
+ * @see More information about this stemmer algorithm can be found at
[[https://tartarus.org/martin/PorterStemmer]].
*/
class NCEnStemmer extends NCStemmer:
private val stemmer = new PorterStemmer
diff --git
a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParserSpec.scala
b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParserSpec.scala
index dae3e961..1f640cd2 100644
---
a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParserSpec.scala
+++
b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParserSpec.scala
@@ -59,7 +59,7 @@ class NCOpenNLPTokenParserSpec extends AnyFunSuite:
}
)
test(
- // First and last are stop words,
+ // First and last are stopwords,
// Third and fourth are not because quoted.
// Note that "a ` a a` a" parsed as 5 tokens ("a", "`", ""a, "a`",
"a") because OpenNLP tokenizer logic,
// So we use spaces around quotes to simplify test.
@@ -72,7 +72,7 @@ class NCOpenNLPTokenParserSpec extends AnyFunSuite:
}
)
test(
- // First and last are stop words,
+ // First and last are stopwords,
// Third and fourth are not because brackets.
"a ( a a ) a",
toks => {