This is an automated email from the ASF dual-hosted git repository.
aradzinski pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-520 by this push:
new 59aa488f WIP
59aa488f is described below
commit 59aa488fd3c2806940926e72cba26c96dd7d58bd
Author: Aaron Radzinski <[email protected]>
AuthorDate: Tue Dec 13 16:12:19 2022 -0800
WIP
---
.../src/main/resources/stopwords/stop_words.txt | 2 +-
.../org/apache/nlpcraft/NCPipelineBuilder.scala | 6 +-
.../nlp/enrichers/NCDictionaryTokenEnricher.scala | 2 +-
.../nlp/enrichers/NCEnStopWordsTokenEnricher.scala | 117 ++++++++++-----------
.../apache/nlpcraft/nlp/stemmer/NCEnStemmer.scala | 5 +-
.../nlp/parsers/NCOpenNLPTokenParserSpec.scala | 4 +-
6 files changed, 66 insertions(+), 70 deletions(-)
diff --git a/nlpcraft/src/main/resources/stopwords/stop_words.txt
b/nlpcraft/src/main/resources/stopwords/stop_words.txt
index 5644efd4..1cff382e 100644
--- a/nlpcraft/src/main/resources/stopwords/stop_words.txt
+++ b/nlpcraft/src/main/resources/stopwords/stop_words.txt
@@ -15,7 +15,7 @@
# limitations under the License.
#
-# Basic predefined stop-words.
+# Basic predefined stopwords.
#
# Configuration contains:
# - Words (processed as stem)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
index 69f1b6d3..08a3886e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
@@ -235,7 +235,7 @@ class NCPipelineBuilder:
* and
*
[[https://raw.githubusercontent.com/richardwilly98/elasticsearch-opennlp-auto-tagging/master/src/main/resources/models/en-lemmatizer.dict
en-lemmatizer.dict]] model for
*
[[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/lemmatizer/DictionaryLemmatizer.html
DictionaryLemmatizer]].
- * - [[NCEnStopWordsTokenEnricher Stop-word]] token enricher.
+ * - [[NCEnStopWordsTokenEnricher Stopword]] token enricher.
* - [[NCSwearWordsTokenEnricher Swear-word]] token enricher initialized
by
*
[[https://raw.githubusercontent.com/apache/incubator-nlpcraft/external_config/external/badfilter/swear_words.txt
swear_words.txt]] dictionary.
* - [[NCQuotesTokenEnricher Quotes]] token enricher.
@@ -277,7 +277,7 @@ class NCPipelineBuilder:
* and
*
[[https://raw.githubusercontent.com/richardwilly98/elasticsearch-opennlp-auto-tagging/master/src/main/resources/models/en-lemmatizer.dict
en-lemmatizer.dict]] model for
*
[[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/lemmatizer/DictionaryLemmatizer.html
DictionaryLemmatizer]].
- * - [[NCEnStopWordsTokenEnricher Stop-word]] token enricher.
+ * - [[NCEnStopWordsTokenEnricher Stopword]] token enricher.
* - [[NCSwearWordsTokenEnricher Swear-word]] token enricher initialized
by
*
[[https://raw.githubusercontent.com/apache/incubator-nlpcraft/external_config/external/badfilter/swear_words.txt
swear_words.txt]] dictionary.
* - [[NCQuotesTokenEnricher Quotes]] token enricher.
@@ -306,7 +306,7 @@ class NCPipelineBuilder:
* and
*
[[https://raw.githubusercontent.com/richardwilly98/elasticsearch-opennlp-auto-tagging/master/src/main/resources/models/en-lemmatizer.dict
en-lemmatizer.dict]] model for
*
[[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/lemmatizer/DictionaryLemmatizer.html
DictionaryLemmatizer]].
- * - [[NCEnStopWordsTokenEnricher Stop-word]] token enricher.
+ * - [[NCEnStopWordsTokenEnricher Stopword]] token enricher.
* - [[NCSwearWordsTokenEnricher Swear-word]] token enricher initialized
by
*
[[https://raw.githubusercontent.com/apache/incubator-nlpcraft/external_config/external/badfilter/swear_words.txt
swear_words.txt]] dictionary.
* - [[NCQuotesTokenEnricher Quotes]] token enricher.
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
index cf17817e..697fcbe1 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
@@ -33,7 +33,7 @@ import org.apache.nlpcraft.internal.util.NCUtils
* metadata property before this enricher in your [[NCPipeline pipeline]].
*
* @param dictRes Relative path, absolute path or URL to the dictionary file.
The dictionary should have a simple
- * plain text format with *one lemma per line* with no empty line,
header or comments allowed.
+ * plain text format with *one lemma per line*; no empty lines,
headers, or comments are allowed.
*/
//noinspection DuplicatedCode,ScalaWeakerAccess
class NCDictionaryTokenEnricher(dictRes: String) extends NCTokenEnricher:
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
index 3fc1e57a..64137cda 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
@@ -29,9 +29,9 @@ import scala.collection.*
import scala.concurrent.ExecutionContext
/**
- * [[NCEnStopWordsTokenEnricher]] helper.
+ * Companion helper.
*/
-object NCEnStopWordsTokenEnricher:
+private object NCEnStopWordsTokenEnricher:
// Condition types.
private type Wildcard = (String, String)
private type Word = String
@@ -104,14 +104,14 @@ object NCEnStopWordsTokenEnricher:
/**
* Gets all sequential permutations of tokens in this NLP sentence.
- * This method is like a 'tokenMix', but with all combinations of
stop-words (with and without)
+ * This method is like a 'tokenMix', but with all combinations of
stopwords (with and without)
*
* @param tokens Tokens.
* @param maxLen Maximum number of tokens in the sequence.
*/
private[enrichers] def tokenMixWithStopWords(tokens: Seq[NCToken], maxLen:
Int = Integer.MAX_VALUE): Seq[Seq[NCToken]] =
/**
- * Gets all combinations for sequence of mandatory tokens with
stop-words and without.
+ * Gets all combinations for sequence of mandatory tokens with
stopwords and without.
*
* Example:
* 'A (stop), B, C(stop) -> [A, B, C]; [A, B]; [B, C], [B]
@@ -163,31 +163,34 @@ object NCEnStopWordsTokenEnricher:
import org.apache.nlpcraft.nlp.enrichers.NCEnStopWordsTokenEnricher.*
/**
- * "Stop-word" [[NCTokenEnricher token enricher]] for English (EN) language.
Stop words are the words
+ * Stopword [[NCTokenEnricher token enricher]] for English (EN) language.
Stopwords are the words
* which are filtered out (i.e. stopped) before processing of natural
language text because they are
* insignificant.
*
* This enricher adds `stopword` boolean [[NCPropertyMap metadata]] property
to the [[NCToken token]]
- * instance if word it represents is an English stop-word. The value `true`
of the metadata property indicates that
- * this word is detected as a stop-word, `false` value indicates otherwise.
+ * instance if the word it represents is an English stopword. The value
`true` of this metadata property indicates that
+ * this word is detected as a stopword, `false` value indicates otherwise.
This implementation works off the
+ * algorithm that uses an internal list of English stopwords as well as a
procedural logic to determine the stopword
+ * status of the token. This algorithm should work fine for most of the
general use cases. The user, however, can add
+ * additional stopwords or exceptions for the existing ones using
corresponding parameters in [[NCEnStopWordsTokenEnricher]]
+ * constructor.
*
- * More information about stop-words can be found at
[[https://en.wikipedia.org/wiki/Stop_word]].
+ * More information about stopwords can be found at
[[https://en.wikipedia.org/wiki/Stop_word]].
*
* **NOTE:** this implementation requires `lemma` and `pos` string
[[NCPropertyMap metadata]] properties that
- * contain token's lemma and part of speech. You can configure
[[NCOpenNLPTokenEnricher]] for English language
- * that provides this metadata properties before this enricher in your
[[NCPipeline pipeline]].
+ * contain token's lemma and part of speech accordingly. You can configure
[[NCOpenNLPTokenEnricher]] with the model
+ * for English language that would provide these metadata properties before
this enricher in your [[NCPipeline pipeline]].
*
- * @see [[NCEnStemmer]]
- *
- * @param addStopsSet User defined collection of additional stop-words.
- * These word will be tried to match based on `stemmer` implementation.
- * @param exclStopsSet User defined collection of exceptions, that is words
which should not be marked as stop-words during processing.
- * These word will be tried to match based on `stemmer` implementation.
- * @param stemmer English stemmer implementation.
+ * @param addSet User defined collection of additional stopwords. These words
will be stemmatized by the given `stemmer`
+ * before attempting to find a match. Default value is an empty set.
+ * @param exclSet User defined collection of exceptions, i.e. the words which
should not be marked as stopwords during
+ * processing. These words will be stemmatized by the given `stemmer`
before attempting to find a match.
+ * Default value is an empty set.
+ * @param stemmer English stemmer implementation. Default value is the
instance of [[org.apache.nlpcraft.nlp.stemmer.NCEnStemmer]].
*/
class NCEnStopWordsTokenEnricher(
- addStopsSet: Set[String] = Set.empty,
- exclStopsSet: Set[String] = Set.empty,
+ addSet: Set[String] = Set.empty,
+ exclSet: Set[String] = Set.empty,
stemmer: NCStemmer = new NCEnStemmer
) extends NCTokenEnricher with LazyLogging:
private var addStems: Set[String] = _
@@ -205,7 +208,7 @@ class NCEnStopWordsTokenEnricher(
private def toStemKey(toks: Seq[NCToken]): String =
toks.map(_.getText).map(stem).mkString(" ")
/**
- * Stop words holder, used for hash search.
+ * Stopword holder, used for hash search.
*
* @param any Any POSes container.
* @param includes Included by POS container.
@@ -224,7 +227,7 @@ class NCEnStopWordsTokenEnricher(
case _ => any.contains(s)
/**
- * Stop words holder, used for scanning.
+ * Stopword holder, used for scanning.
*
* @param any Any POSes container.
* @param includes Included by POS container.
@@ -263,7 +266,7 @@ class NCEnStopWordsTokenEnricher(
case _ => throw new AssertionError("Unexpected POS.")
/**
- * Stop words data holder.
+ * Stopword data holder.
*
* @param stems Stems data holder.
* @param lemmas Lemmas data holder.
@@ -286,18 +289,18 @@ class NCEnStopWordsTokenEnricher(
// Hash access.
stems.matches(toStemKey(toks), posOpt) ||
- lemmas.matches(toLemmaKey(toks), posOpt) ||
- origins.matches(toOriginalKey(toks), posOpt) ||
- // Scan access.
- wildcardsLemmas.matches(toLemmaKey(toks), posOpt) ||
- wildcardsOrigins.matches(toOriginalKey(toks), posOpt)
+ lemmas.matches(toLemmaKey(toks), posOpt) ||
+ origins.matches(toOriginalKey(toks), posOpt) ||
+ // Scan access.
+ wildcardsLemmas.matches(toLemmaKey(toks), posOpt) ||
+ wildcardsOrigins.matches(toOriginalKey(toks), posOpt)
/**
- *
+ *
*/
private def init(): Unit =
- addStems = if addStopsSet == null then Set.empty else
addStopsSet.map(stem)
- exclStems = if exclStopsSet == null then Set.empty else
exclStopsSet.map(stem)
+ addStems = if addSet == null then Set.empty else addSet.map(stem)
+ exclStems = if exclSet == null then Set.empty else exclSet.map(stem)
def check(name: String, set: Set[String]): Unit =
if set.exists(_.exists(_.isWhitespace)) then throw E(s"$name
contain a string with whitespaces.")
@@ -355,8 +358,8 @@ class NCEnStopWordsTokenEnricher(
case Some(set) => set.add(cond)
case _ =>
val set = mutable.HashSet.empty[T]
- set += cond
- m += pos -> set
+ set += cond
+ m += pos -> set
)
add(incls, incl = true)
@@ -370,7 +373,7 @@ class NCEnStopWordsTokenEnricher(
m += tuple._1 -> tuple._2
WordForm.values.foreach(f =>
add(f, mkT, isExc = true)
- add(f, mkT, isExc = false)
+ add(f, mkT, isExc = false)
)
m.toMap
@@ -416,7 +419,7 @@ class NCEnStopWordsTokenEnricher(
val isMultiWord = s.contains(' ')
// Confusing POSes.
- if poses.nonEmpty && isMultiWord then throwError("POSes cannot be
defined for multiple stop words.")
+ if poses.nonEmpty && isMultiWord then throwError("POSes cannot be
defined for multiple stopwords.")
var isCase = false
if s.head == '@' then
s = s.drop(1)
@@ -424,12 +427,11 @@ class NCEnStopWordsTokenEnricher(
if s.isEmpty then throwError("Empty word.")
isCase = true
val idxWild = s.indexOf("*")
- if idxWild >= 0 && isMultiWord then throwError("Wildcard cannot be
defined for multiple stop words.")
+ if idxWild >= 0 && isMultiWord then throwError("Wildcard cannot be
defined for multiple stopwords.")
if idxWild < 0 then
val (word, form) =
if isCase then (s, ORIG)
- else
- if !hasPoses then (stem(s), STEM) else (stem(s), LEM)
+ else if !hasPoses then (stem(s), STEM) else (stem(s), LEM)
mHash((isExc, form)).addCondition(word, poses)
else
val b = s.take(idxWild)
@@ -450,20 +452,18 @@ class NCEnStopWordsTokenEnricher(
val any = m((isExc, form)).any.toSet
val incl = toImmutable(m((isExc, form)).incls)
val excl = toImmutable(m((isExc, form)).excls)
+ mkInstance(any ++ excl.values.flatten, incl, excl)
- mkInstance(any ++ excl.values.flatten, incl, excl)
- end mkHolder
def mkHash(form: WordForm): HashHolder = mkHolder(mHash, form,
HashHolder.apply)
- def mkScan(form: WordForm):
- ScanHolder = mkHolder(mScan, form, ScanHolder.apply)
+ def mkScan(form: WordForm): ScanHolder = mkHolder(mScan, form,
ScanHolder.apply)
- isExc -> StopWordHolder(mkHash(STEM), mkHash(LEM),
mkHash(ORIG), mkScan(LEM), mkScan(ORIG))
+ isExc -> StopWordHolder(mkHash(STEM), mkHash(LEM), mkHash(ORIG),
mkScan(LEM), mkScan(ORIG))
).toMap
private def isVerb(pos: String): Boolean = pos.head == 'V'
/**
- * Marks words before stop words.
+ * Marks words before stopwords.
*
* @param ns Sentence.
* @param stopPoses Stop POSes.
@@ -480,12 +480,10 @@ class NCEnStopWordsTokenEnricher(
stops: mutable.HashSet[NCToken]
): Boolean =
var stop = true
-
for ((tok, idx) <- ns.zipWithIndex if idx != lastIdx &&
!isStopWord(tok) && !isException(Seq(tok)) &&
stopPoses.contains(getPos(tok)) && isStopWord(ns(idx + 1)))
stops += tok
stop = false
-
if stop then true else markBefore(ns, stopPoses, lastIdx, isException,
stops)
/**
@@ -504,11 +502,11 @@ class NCEnStopWordsTokenEnricher(
b
/**
- * Marks as stopwords, words with POS from configured list, which also
placed before another stop words.
+ * Marks as stopwords, words with POS from configured list, which are also
placed before another stopword.
*/
private def processCommonStops(ns: Seq[NCToken], stops:
mutable.HashSet[NCToken]): Unit =
/**
- * Marks as stopwords, words with POS from configured list, which
also placed before another stop words.
+ * Marks as stopwords, words with POS from configured list, which are
also placed before another stopword.
*/
@tailrec
def processCommonStops0(ns: Seq[NCToken]): Unit =
@@ -526,7 +524,7 @@ class NCEnStopWordsTokenEnricher(
/** @inheritdoc */
override def enrich(req: NCRequest, cfg: NCModelConfig, toks:
List[NCToken]): Unit =
- // Stop words and exceptions caches for this sentence.
+ // Stopword and exception caches for this sentence.
val cacheSw = mutable.HashMap.empty[Seq[NCToken], Boolean]
val cacheEx = mutable.HashMap.empty[Seq[NCToken], Boolean]
@@ -547,7 +545,7 @@ class NCEnStopWordsTokenEnricher(
def prev(): NCToken = toks(idx - 1)
def isCommonVerbs(firstVerb: String, secondVerb: String): Boolean =
isVerb(pos) && lemma == secondVerb ||
- (isVerb(pos) && lemma == firstVerb && !isLast &&
isVerb(getPos(next())) && getLemma(next()) == secondVerb)
+ (isVerb(pos) && lemma == firstVerb && !isLast &&
isVerb(getPos(next())) && getLemma(next()) == secondVerb)
// +---------------------------------+
// | Pass #1. |
@@ -579,7 +577,7 @@ class NCEnStopWordsTokenEnricher(
toks.foreach(tok => stops += tok)
buf += toks
- // Capture the token mix at this point minus the initial stop words
found up to this point.
+ // Capture the token mix at this point minus the initial stopwords
found up to this point.
val origToks: Seq[(Seq[NCToken], String)] =
(for (toks <- mix) yield toks.toSeq).map(s => s ->
toStemKey(s)).toSeq
@@ -590,7 +588,7 @@ class NCEnStopWordsTokenEnricher(
val foundKeys = new mutable.HashSet[String]()
- // All sentence first stop words + first non stop word.
+ // All sentence-initial stopwords + the first non-stopword.
val startToks = toks.takeWhile(isStopWord) ++ toks.find(p =>
!isStopWord(p)).map(p => p)
for (startTok <- startToks; tup <- origToks.filter(_._1.head ==
startTok); key = tup._2 if firstWords.contains(key) && !isException(tup._1))
tup._1.foreach(tok => stops += tok)
@@ -607,31 +605,28 @@ class NCEnStopWordsTokenEnricher(
// +-------------------------------------------------+
// | Pass #5. |
- // | Mark words with POSes before stop-words. |
+ // | Mark words with POSes before stopwords. |
// +-------------------------------------------------+
markBefore(toks, STOP_BEFORE_STOP, toks.size - 1, isException, stops)
// +-------------------------------------------------+
// | Pass #6. |
- // | Processing additional and excluded stop words. |
+ // | Processing additional and excluded stopwords. |
// +-------------------------------------------------+
- for (t <- toks if addStems.contains(stem(t.getText)))
- stops += t
-
- for (t <- stops.filter(t => exclStems.contains(stem(t.getText))))
- stops -= t
+ for (t <- toks if addStems.contains(stem(t.getText))) stops += t
+ for (t <- stops.filter(t => exclStems.contains(stem(t.getText))))
stops -= t
// +-------------------------------------------------+
// | Pass #7. |
// | Marks as stopwords, words with POS from |
// | configured list, which also placed before |
- // | another stop words. |
+ // | another stopword. |
// +-------------------------------------------------+
processCommonStops(toks, stops)
// +-------------------------------------------------+
// | Pass #8. |
- // | Deletes stop words if they are marked as quoted.|
+ // | Deletes stopwords if they are marked as quoted.|
// +-------------------------------------------------+
var quotes = toks.filter(isQuote)
@@ -651,7 +646,7 @@ class NCEnStopWordsTokenEnricher(
// +-------------------------------------------------+
// | Pass #9. |
- // | Deletes stop words if they are brackets. |
+ // | Deletes stopwords if they are brackets. |
// +-------------------------------------------------+
val stack = new java.util.Stack[String]()
val set = mutable.HashSet.empty[NCToken]
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/stemmer/NCEnStemmer.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/stemmer/NCEnStemmer.scala
index cf0dd30d..b794f931 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/stemmer/NCEnStemmer.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/stemmer/NCEnStemmer.scala
@@ -22,8 +22,9 @@ import org.apache.nlpcraft.nlp.parsers.*
/**
* Stemmer implementation for the English language that delegates to
- * [[https://opennlp.apache.org/ OpenNLP]] Porter Stemmer. You can find more
information about this
- * stemmer algorithm at [[https://tartarus.org/martin/PorterStemmer]].
+ * [[https://opennlp.apache.org/ OpenNLP]] Porter Stemmer.
+ *
+ * @see More information about this stemmer algorithm can be found at
[[https://tartarus.org/martin/PorterStemmer]].
*/
class NCEnStemmer extends NCStemmer:
private val stemmer = new PorterStemmer
diff --git
a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParserSpec.scala
b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParserSpec.scala
index dae3e961..1f640cd2 100644
---
a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParserSpec.scala
+++
b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParserSpec.scala
@@ -59,7 +59,7 @@ class NCOpenNLPTokenParserSpec extends AnyFunSuite:
}
)
test(
- // First and last are stop words,
+ // First and last are stopwords,
// Third and fourth are not because quoted.
// Note that "a ` a a` a" parsed as 5 tokens ("a", "`", ""a, "a`",
"a") because OpenNLP tokenizer logic,
// So we use spaces around quotes to simplify test.
@@ -72,7 +72,7 @@ class NCOpenNLPTokenParserSpec extends AnyFunSuite:
}
)
test(
- // First and last are stop words,
+ // First and last are stopwords,
// Third and fourth are not because brackets.
"a ( a a ) a",
toks => {