[incubator-nlpcraft] branch NLPCRAFT-520 updated: WIP.

sergeykamov Mon, 19 Dec 2022 02:29:03 -0800

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git



The following commit(s) were added to refs/heads/NLPCRAFT-520 by this push:
     new 19b208cf WIP.
19b208cf is described below

commit 19b208cf87d6bb497ff5c549f4668f002c99476e
Author: Sergey Kamov <[email protected]>
AuthorDate: Mon Dec 19 14:29:07 2022 +0400

    WIP.
---
 .../nlp/enrichers/NCDictionaryTokenEnricher.scala  |  8 ++++---
 .../nlp/enrichers/NCEnStopWordsTokenEnricher.scala |  3 +++
 .../nlp/enrichers/NCOpenNLPTokenEnricher.scala     |  6 ++---
 .../nlp/enrichers/NCSwearWordsTokenEnricher.scala  |  9 +++++---
 .../nlpcraft/nlp/parsers/NCNLPEntityParser.scala   |  2 +-
 .../nlp/parsers/NCOpenNLPEntityParser.scala        |  6 ++---
 .../nlp/parsers/NCOpenNLPTokenParser.scala         |  4 ++--
 .../nlp/parsers/NCSemanticEntityParser.scala       | 26 ++++++++++++----------
 8 files changed, 37 insertions(+), 27 deletions(-)

diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
index 2efc5468..0d28f3ad 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
@@ -33,12 +33,14 @@ import org.apache.nlpcraft.internal.util.NCUtils as U
   * token's lemma. You can configure [[NCOpenNLPTokenEnricher]] for required 
language that provides this
   * metadata property before this enricher in your [[NCPipeline pipeline]].
   *
-  * @param dictRes Relative path, absolute path or URL to the dictionary file. 
The dictionary should have a simple
-  *         plain text format with *one lemma per line*, empty lines are 
skipped, duplicates ignored, header or other comments allowed.
-  *         Headers are lines started by **#** symbol. Search in the 
dictionary is implemented by input words **lemms**, case is ignored.
+  * @param dictRes Relative path, absolute path, classpath resource or URL to 
the dictionary.
+  *         The dictionary should have a simple plain text format with *one 
lemma per line*, empty lines are skipped, duplicates ignored, header or other 
comments allowed.
+  *         Headers are lines starting with the **#** symbol. Search in the 
dictionary is performed by input word **lemmas**, case is ignored.
   */
 //noinspection DuplicatedCode,ScalaWeakerAccess
 class NCDictionaryTokenEnricher(dictRes: String) extends NCTokenEnricher with 
LazyLogging:
+    require(dictRes != null, "Dictonary resource cannot be null.")
+
     private var dict: Set[String] = _
 
     init()
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
index cb1baae2..28446481 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
@@ -178,6 +178,9 @@ import 
org.apache.nlpcraft.nlp.enrichers.NCEnStopWordsTokenEnricher.*
   *
   * More information about stopwords can be found at 
[[https://en.wikipedia.org/wiki/Stop_word]].
   *
+  * * The `stemmer` implementation language should correspond to the language of the other 
components of [[NCPipeline]], but
+  * the required `stemmer` implementation is independent of other components' 
stemmers.
+  *
   * **NOTE:** this implementation requires `lemma` and `pos` string 
[[NCPropertyMap metadata]] properties that
   * contain token's lemma and part of speech accordingly. You can configure 
[[NCOpenNLPTokenEnricher]] with the model
   * for English language that would provide these metadata properties before 
this enricher in your [[NCPipeline pipeline]].
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala
index 0321fab0..a51284da 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala
@@ -37,17 +37,17 @@ import scala.concurrent.ExecutionContext
   * This OpenNLP enricher requires PoS and lemma models. Some of OpenNLP 
community models can be found
   * [[https://opennlp.sourceforge.net/models-1.5/ here]].
   *
-  * @param posMdlRes Relative path, absolute path or URL to
+  * @param posMdlRes Relative path, absolute path, classpath resource or URL to
   *         
[[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/postag/POSTaggerME.html
 POSTaggerME]] model.
   *         Can be `null` if **part-of-speech** model is not configured, so 
`pos` property will not be set.
   *         Note that at least one of the model must be provided.
-  * @param lemmaDicRes Relative path, absolute path or URL to
+  * @param lemmaDicRes Relative path, absolute path, classpath resource or URL 
to
   *         
[[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/lemmatizer/DictionaryLemmatizer.html
 DictionaryLemmatizer]] model.
   *         Can be `null` if **lemmatizer** model is not configured, so 
`lemma` property will not be set.
   *         Note that at least one of the model must be provided.
   */
 class NCOpenNLPTokenEnricher(posMdlRes: String = null, lemmaDicRes: String = 
null) extends NCTokenEnricher with LazyLogging:
-    require(posMdlRes != null || lemmaDicRes != null, "At least one model must 
be defined")
+    require(posMdlRes != null || lemmaDicRes != null, "At least one model must 
be defined.")
 
     private var tagger: POSTaggerME = _
     private var lemmatizer: DictionaryLemmatizer = _
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
index 85efc02d..ff0ebc98 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
@@ -34,16 +34,19 @@ import java.util.Objects
   * `false` value indicates otherwise.
   *
   * Read more about stemming [[https://en.wikipedia.org/wiki/Stemming here]].
+  * The dictionary language and the `stemmer` implementation language should 
correspond to the language of the other components of [[NCPipeline]], but
+  * the required `stemmer` implementation is independent of other components' 
stemmers.
+  *
   * Stemming is used here because it is too difficult to be based on more 
accurate `lemma` approach for swear words.
   *
-  * @param dictRes Path to the swear dictionary. The dictionary should have a 
simple
+  * @param dictRes Relative path, absolute path, classpath resource or URL to 
the swear dictionary. The dictionary should have a simple
   *         plain text format with *one word per line*, empty lines are 
skipped, duplicates ignored, header or other comments allowed.
-  *         Headers are lines started by **#** symbol. Search in the 
dictionary is implemented by input words **stems**, case is ignored.
+  *         Headers are lines starting with the **#** symbol. Search in the 
dictionary is performed by input word **stems**, case is ignored.
   * @param stemmer Stemmer implementation for the dictionary language.
   */
 //noinspection ScalaWeakerAccess
 class NCSwearWordsTokenEnricher(dictRes: String, stemmer: NCStemmer) extends 
NCTokenEnricher with LazyLogging:
-    require(dictRes != null, "Swear words model file cannot be null.")
+    require(dictRes != null, "Swear words dictonary resource cannot be null.")
     require(stemmer != null, "Stemmer cannot be null.")
 
     private var swearWords: Set[String] = _
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala
index b21f34ec..c805c21b 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala
@@ -48,7 +48,7 @@ import org.apache.nlpcraft.nlp.parsers.NCNLPEntityParser.*
   *  By default all [[NCToken]] instances converted.
   */
 class NCNLPEntityParser(predicate: NCToken => Boolean = _ => true) extends 
NCEntityParser:
-    require(predicate != null)
+    require(predicate != null, "Predicate cannot be null.")
 
     /** @inheritdoc */
     override def parse(req: NCRequest, cfg: NCModelConfig, toks: 
List[NCToken]): List[NCEntity] =
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala
index d29e2daa..e931695f 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala
@@ -39,7 +39,7 @@ object NCOpenNLPEntityParser:
     /**
       * Creates [[NCOpenNLPEntityParser]] instance.
       *
-      * @param mdl Path to 
[[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/namefind/TokenNameFinderModel.html
 model]].
+      * @param mdl Relative path, absolute path, classpath resource or URL to 
[[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/namefind/TokenNameFinderModel.html
 model]].
       * @return [[NCOpenNLPEntityParser]] instance.
       */
     def apply(mdl: String): NCOpenNLPEntityParser =
@@ -59,10 +59,10 @@ object NCOpenNLPEntityParser:
   *
   * **NOTE:** that each input [[NCToken]] can be included into several output 
[[NCEntity]] instances.
   *
-  * @param findersMdlsRes Paths to 
[[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/namefind/TokenNameFinderModel.html
 models]].
+  * @param findersMdlsRes Relative paths, absolute paths, classpath resources or URLs to 
[[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/namefind/TokenNameFinderModel.html
 models]].
   */
 class NCOpenNLPEntityParser(findersMdlsRes: List[String]) extends 
NCEntityParser with LazyLogging:
-    require(findersMdlsRes != null && findersMdlsRes.nonEmpty, "Models paths 
cannot be null or empty.")
+    require(findersMdlsRes != null && findersMdlsRes.nonEmpty, "Models 
resources cannot be null or empty.")
 
     private var finders: Seq[NameFinderME] = _
     private case class Holder(start: Int, end: Int, name: String, probability: 
Double)
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParser.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParser.scala
index a148b3bb..2c1dc7ef 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParser.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParser.scala
@@ -32,10 +32,10 @@ import java.util.Objects
   *
   * Some of OpenNLP prepared models can be found 
[[https://opennlp.sourceforge.net/models-1.5/ here]].
   *
-  * @param tokMdlRes Path to 
[[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/tokenize/TokenizerModel.html
 model]].
+  * @param tokMdlRes Relative path, absolute path, classpath resource or URL 
to 
[[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/tokenize/TokenizerModel.html
 model]].
   */
 class NCOpenNLPTokenParser(tokMdlRes: String) extends NCTokenParser with 
LazyLogging:
-    require(tokMdlRes != null, "Tokenizer model path cannot be null.")
+    require(tokMdlRes != null, "Tokenizer model resource cannot be null.")
 
     @volatile private var tokenizer: TokenizerME = _
 
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
index 8de55ac1..4ef25fc3 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
@@ -51,9 +51,9 @@ object NCSemanticEntityParser:
         elements: List[NCSemanticElement]
     ): NCSemanticEntityParser =
         require(stemmer != null, "Stemmer cannot be null.")
-        require(parser != null, "Parser cannot be null.")
+        require(parser != null, "Token parser cannot be null.")
         require(macros != null, "Macros cannot be null.")
-        require(elements != null, "Elements cannot be null.")
+        require(elements != null && elements.nonEmpty, "Elements cannot be 
null or empty.")
 
         new NCSemanticEntityParser(stemmer, parser, macros = macros, elements 
= elements)
 
@@ -71,8 +71,8 @@ object NCSemanticEntityParser:
         elements: List[NCSemanticElement]
     ): NCSemanticEntityParser =
         require(stemmer != null, "Stemmer cannot be null.")
-        require(parser != null, "Parser cannot be null.")
-        require(elements != null, "Elements cannot be null.")
+        require(parser != null, "Token parser cannot be null.")
+        require(elements != null && elements.nonEmpty, "Elements cannot be 
null or empty.")
 
         new NCSemanticEntityParser(stemmer, parser, macros = Map.empty, 
elements = elements)
 
@@ -82,12 +82,12 @@ object NCSemanticEntityParser:
       *
       * @param stemmer [[NCStemmer]] implementation for synonyms language.
       * @param parser  [[NCTokenParser]] implementation.
-      * @param mdlRes  Classpath resource, file path or URL for YAML or JSON 
semantic model definition file.
+      * @param mdlRes Relative path, absolute path, classpath resource or URL 
to YAML or JSON semantic model definition.
       */
     def apply(stemmer: NCStemmer, parser: NCTokenParser, mdlRes: String): 
NCSemanticEntityParser =
         require(stemmer != null, "Stemmer cannot be null.")
-        require(parser != null, "Parser cannot be null.")
-        require(mdlRes != null, "Model path cannot be null.")
+        require(parser != null, "Token parser cannot be null.")
+        require(mdlRes != null, "Model resource cannot be null.")
 
         new NCSemanticEntityParser(stemmer, parser, mdlResOpt = mdlRes.?)
 
@@ -183,13 +183,15 @@ import 
org.apache.nlpcraft.nlp.parsers.NCSemanticEntityParser.*
   * See detailed description on the website 
[[https://nlpcraft.apache.org/built-in-entity-parser.html#parser-semantic 
Semantic Parser]].
   *
   * **NOTE:** [[NCSemanticElement]] synonyms, **stemmer** and **parser** 
parameters must be configured for the same language.
+  * The `stemmer` implementation language should correspond to the language of the other 
components of [[NCPipeline]], but
+  * the required `stemmer` implementation is independent of other components' 
stemmers.
   *
   * @see [[NCSemanticElement]]
   * @param stemmer   [[NCStemmer]] implementation for synonyms language.
   * @param parser    [[NCTokenParser]] implementation.
   * @param macros    Macros map. Empty by default.
   * @param elements  [[NCSemanticElement]] list.
-  * @param mdlResOpt Optional classpath resource, file path or URL for YAML or 
JSON semantic model definition file.
+  * @param mdlResOpt Optional relative path, absolute path, classpath resource 
or URL to YAML or JSON semantic model definition.
   */
 class NCSemanticEntityParser(
     stemmer: NCStemmer,
@@ -198,10 +200,10 @@ class NCSemanticEntityParser(
     elements: List[NCSemanticElement] = List.empty,
     mdlResOpt: Option[String] = None
 ) extends NCEntityParser with LazyLogging:
-    require(stemmer != null)
-    require(parser != null)
-    require(macros != null)
-    require(elements != null && elements.nonEmpty || mdlResOpt.isDefined)
+    require(stemmer != null, "Stemmer cannot be null.")
+    require(parser != null, "Token parser cannot be null.")
+    require(macros != null, "Macroses cannot be null.")
+    require(elements != null && elements.nonEmpty || mdlResOpt.isDefined, 
"Elements cannot be null or empty or model resource cannot be empty.")
 
     private lazy val scrType =
         require(mdlResOpt.isDefined)

[incubator-nlpcraft] branch NLPCRAFT-520 updated: WIP.

Reply via email to