This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit 077ec10e47617f9b9cff67939d441f61105f9c47
Merge: b9f683a1 4c27312e
Author: Sergey Kamov <[email protected]>
AuthorDate: Wed Dec 14 14:24:40 2022 +0400

    Merge remote-tracking branch 'origin/NLPCRAFT-520' into NLPCRAFT-520
    
    # Conflicts:
    #       
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala

 .../src/main/resources/stopwords/stop_words.txt    |   2 +-
 .../org/apache/nlpcraft/NCPipelineBuilder.scala    |   6 +-
 .../nlp/enrichers/NCDictionaryTokenEnricher.scala  |   2 +-
 .../nlp/enrichers/NCEnStopWordsTokenEnricher.scala | 114 ++++++++++-----------
 .../nlp/enrichers/NCOpenNLPTokenEnricher.scala     |  21 ++--
 .../apache/nlpcraft/nlp/stemmer/NCEnStemmer.scala  |   5 +-
 .../apache/nlpcraft/nlp/stemmer/NCStemmer.scala    |   7 +-
 .../nlp/parsers/NCOpenNLPTokenParserSpec.scala     |   4 +-
 8 files changed, 80 insertions(+), 81 deletions(-)

diff --cc 
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
index 3ac14f84,64137cda..108fe3aa
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
@@@ -204,11 -204,11 +207,11 @@@ class NCEnStopWordsTokenEnricher
      init()
  
      private def read(path: String): Set[String] = 
NCUtils.readTextGzipResource(path, "UTF-8", logger).toSet
 -    private def stem(s: String): String = stemmer.stem(s.toLowerCase)
 -    private def toStemKey(toks: Seq[NCToken]): String = 
toks.map(_.getText).map(stem).mkString(" ")
 +    private def getStem(s: String): String = stemmer.stem(s.toLowerCase)
 +    private def toStemKey(toks: Seq[NCToken]): String = 
toks.map(_.getText).map(getStem).mkString(" ")
  
      /**
-       * Stop words holder, used for hash search.
+       * Stopword holder, used for hash search.
        *
        * @param any Any POSes container.
        * @param includes Included by POS container.
@@@ -299,8 -299,8 +302,8 @@@
        *
        */
      private def init(): Unit =
-         addStems = if addStopsSet == null then Set.empty else 
addStopsSet.map(getStem)
-         exclStems = if exclStopsSet == null then Set.empty else 
exclStopsSet.map(getStem)
 -        addStems = if addSet == null then Set.empty else addSet.map(stem)
 -        exclStems = if exclSet == null then Set.empty else exclSet.map(stem)
++        addStems = if addSet == null then Set.empty else addSet.map(getStem)
++        exclStems = if exclSet == null then Set.empty else 
exclSet.map(getStem)
  
          def check(name: String, set: Set[String]): Unit =
              if set.exists(_.exists(_.isWhitespace)) then throw E(s"$name 
contain a string with whitespaces.")
@@@ -431,8 -431,7 +434,7 @@@
              if idxWild < 0 then
                  val (word, form) =
                      if isCase then (s, ORIG)
-                     else
-                         if !hasPoses then (getStem(s), STEM) else 
(getStem(s), LEM)
 -                    else if !hasPoses then (stem(s), STEM) else (stem(s), LEM)
++                    else if !hasPoses then (getStem(s), STEM) else 
(getStem(s), LEM)
                  mHash((isExc, form)).addCondition(word, poses)
              else
                  val b = s.take(idxWild)
@@@ -507,14 -502,14 +505,14 @@@
                  b
  
      /**
-       * Marks as stopwords, words with POS from configured list, which also 
placed before another stop words.
+       * Marks as stopwords, words with POS from configured list, which are also 
placed before another stopword.
        */
 -    private def processCommonStops(ns: Seq[NCToken], stops: 
mutable.HashSet[NCToken]): Unit =
 +    private def processCommonStops(ns: Seq[NCToken], extraToks: Map[NCToken, 
TokenExtra], stops: mutable.HashSet[NCToken]): Unit =
          /**
-           * Marks as stopwords, words with POS from configured list, which 
also placed before another stop words.
+           * Marks as stopwords, words with POS from configured list, which are 
also placed before another stopword.
            */
          @tailrec
 -        def processCommonStops0(ns: Seq[NCToken]): Unit =
 +        def processCommonStops0(ns: Seq[NCToken], extraToks: Map[NCToken, 
TokenExtra]): Unit =
              val max = ns.size - 1
              var stop = true
  
@@@ -626,24 -611,18 +624,22 @@@
  
          // +-------------------------------------------------+
          // | Pass #6.                                        |
-         // | Processing additional and excluded stop words.  |
+         // | Processing additional and excluded stopwords.   |
          // +-------------------------------------------------+
-         for ((t, extra) <- extraToks if addStems.contains(extra.stem) || 
addStems.contains(extra.lemmaStem))
-             stops += t
 -        for (t <- toks if addStems.contains(stem(t.getText))) stops += t
 -        for (t <- stops.filter(t => exclStems.contains(stem(t.getText)))) 
stops -= t
++        for ((t, extra) <- extraToks if addStems.contains(extra.stem) || 
addStems.contains(extra.lemmaStem)) stops += t
 +
 +        for (t <- stops.filter( t =>
 +            val extra = extraToks(t)
 +            exclStems.contains(extra.stem) || 
exclStems.contains(extra.lemmaStem))
-         )
-             stops -= t
++        ) stops -= t
  
          // +-------------------------------------------------+
          // | Pass #7.                                        |
          // | Marks as stopwords, words with POS from         |
          // | configured list, which also placed before       |
-         // | another stop words.                             |
+         // | another stopword.                               |
          // +-------------------------------------------------+
 -        processCommonStops(toks, stops)
 +        processCommonStops(toks, extraToks, stops)
  
          // +-------------------------------------------------+
          // | Pass #8.                                        |

Reply via email to