This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-520 in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 077ec10e47617f9b9cff67939d441f61105f9c47 Merge: b9f683a1 4c27312e Author: Sergey Kamov <[email protected]> AuthorDate: Wed Dec 14 14:24:40 2022 +0400 Merge remote-tracking branch 'origin/NLPCRAFT-520' into NLPCRAFT-520 # Conflicts: # nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala .../src/main/resources/stopwords/stop_words.txt | 2 +- .../org/apache/nlpcraft/NCPipelineBuilder.scala | 6 +- .../nlp/enrichers/NCDictionaryTokenEnricher.scala | 2 +- .../nlp/enrichers/NCEnStopWordsTokenEnricher.scala | 114 ++++++++++----------- .../nlp/enrichers/NCOpenNLPTokenEnricher.scala | 21 ++-- .../apache/nlpcraft/nlp/stemmer/NCEnStemmer.scala | 5 +- .../apache/nlpcraft/nlp/stemmer/NCStemmer.scala | 7 +- .../nlp/parsers/NCOpenNLPTokenParserSpec.scala | 4 +- 8 files changed, 80 insertions(+), 81 deletions(-) diff --cc nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala index 3ac14f84,64137cda..108fe3aa --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala @@@ -204,11 -204,11 +207,11 @@@ class NCEnStopWordsTokenEnricher init() private def read(path: String): Set[String] = NCUtils.readTextGzipResource(path, "UTF-8", logger).toSet - private def stem(s: String): String = stemmer.stem(s.toLowerCase) - private def toStemKey(toks: Seq[NCToken]): String = toks.map(_.getText).map(stem).mkString(" ") + private def getStem(s: String): String = stemmer.stem(s.toLowerCase) + private def toStemKey(toks: Seq[NCToken]): String = toks.map(_.getText).map(getStem).mkString(" ") /** - * Stop words holder, used for hash search. + * stopword holder, used for hash search. * * @param any Any POSes container. * @param includes Included by POS container. 
@@@ -299,8 -299,8 +302,8 @@@ * */ private def init(): Unit = - addStems = if addStopsSet == null then Set.empty else addStopsSet.map(getStem) - exclStems = if exclStopsSet == null then Set.empty else exclStopsSet.map(getStem) - addStems = if addSet == null then Set.empty else addSet.map(stem) - exclStems = if exclSet == null then Set.empty else exclSet.map(stem) ++ addStems = if addSet == null then Set.empty else addSet.map(getStem) ++ exclStems = if exclSet == null then Set.empty else exclSet.map(getStem) def check(name: String, set: Set[String]): Unit = if set.exists(_.exists(_.isWhitespace)) then throw E(s"$name contain a string with whitespaces.") @@@ -431,8 -431,7 +434,7 @@@ if idxWild < 0 then val (word, form) = if isCase then (s, ORIG) - else - if !hasPoses then (getStem(s), STEM) else (getStem(s), LEM) - else if !hasPoses then (stem(s), STEM) else (stem(s), LEM) ++ else if !hasPoses then (getStem(s), STEM) else (getStem(s), LEM) mHash((isExc, form)).addCondition(word, poses) else val b = s.take(idxWild) @@@ -507,14 -502,14 +505,14 @@@ b /** - * Marks as stopwords, words with POS from configured list, which also placed before another stop words. + * Marks as stopwords, words with POS from configured list, which also placed before another stopword. */ - private def processCommonStops(ns: Seq[NCToken], stops: mutable.HashSet[NCToken]): Unit = + private def processCommonStops(ns: Seq[NCToken], extraToks: Map[NCToken, TokenExtra], stops: mutable.HashSet[NCToken]): Unit = /** - * Marks as stopwords, words with POS from configured list, which also placed before another stop words. + * Marks as stopwords, words with POS from configured list, which also placed before another stopword. 
*/ @tailrec - def processCommonStops0(ns: Seq[NCToken]): Unit = + def processCommonStops0(ns: Seq[NCToken], extraToks: Map[NCToken, TokenExtra]): Unit = val max = ns.size - 1 var stop = true @@@ -626,24 -611,18 +624,22 @@@ // +-------------------------------------------------+ // | Pass #6. | - // | Processing additional and excluded stop words. | + // | Processing additional and excluded stopword. | // +-------------------------------------------------+ - for ((t, extra) <- extraToks if addStems.contains(extra.stem) || addStems.contains(extra.lemmaStem)) - stops += t - for (t <- toks if addStems.contains(stem(t.getText))) stops += t - for (t <- stops.filter(t => exclStems.contains(stem(t.getText)))) stops -= t ++ for ((t, extra) <- extraToks if addStems.contains(extra.stem) || addStems.contains(extra.lemmaStem)) stops += t + + for (t <- stops.filter( t => + val extra = extraToks(t) + exclStems.contains(extra.stem) || exclStems.contains(extra.lemmaStem)) - ) - stops -= t ++ ) stops -= t // +-------------------------------------------------+ // | Pass #7. | // | Marks as stopwords, words with POS from | // | configured list, which also placed before | - // | another stop words. | + // | another stopword. | // +-------------------------------------------------+ - processCommonStops(toks, stops) + processCommonStops(toks, extraToks, stops) // +-------------------------------------------------+ // | Pass #8. |
