This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-520 by this push:
new 19b208cf WIP.
19b208cf is described below
commit 19b208cf87d6bb497ff5c549f4668f002c99476e
Author: Sergey Kamov <[email protected]>
AuthorDate: Mon Dec 19 14:29:07 2022 +0400
WIP.
---
.../nlp/enrichers/NCDictionaryTokenEnricher.scala | 8 ++++---
.../nlp/enrichers/NCEnStopWordsTokenEnricher.scala | 3 +++
.../nlp/enrichers/NCOpenNLPTokenEnricher.scala | 6 ++---
.../nlp/enrichers/NCSwearWordsTokenEnricher.scala | 9 +++++---
.../nlpcraft/nlp/parsers/NCNLPEntityParser.scala | 2 +-
.../nlp/parsers/NCOpenNLPEntityParser.scala | 6 ++---
.../nlp/parsers/NCOpenNLPTokenParser.scala | 4 ++--
.../nlp/parsers/NCSemanticEntityParser.scala | 26 ++++++++++++----------
8 files changed, 37 insertions(+), 27 deletions(-)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
index 2efc5468..0d28f3ad 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
@@ -33,12 +33,14 @@ import org.apache.nlpcraft.internal.util.NCUtils as U
* token's lemma. You can configure [[NCOpenNLPTokenEnricher]] for required
language that provides this
* metadata property before this enricher in your [[NCPipeline pipeline]].
*
- * @param dictRes Relative path, absolute path or URL to the dictionary file.
The dictionary should have a simple
- * plain text format with *one lemma per line*, empty lines are
skipped, duplicates ignored, header or other comments allowed.
- * Headers are lines started by **#** symbol. Search in the
dictionary is implemented by input words **lemms**, case is ignored.
+ * @param dictRes Relative path, absolute path, classpath resource or URL to
the dictionary.
+ * The dictionary should have a simple plain text format with *one
lemma per line*, empty lines are skipped, duplicates ignored, header or other
comments allowed.
+ * Headers are lines started with **#** symbol. Search in the
dictionary is implemented by input words **lemmas**, case is ignored.
*/
//noinspection DuplicatedCode,ScalaWeakerAccess
class NCDictionaryTokenEnricher(dictRes: String) extends NCTokenEnricher with
LazyLogging:
+ require(dictRes != null, "Dictonary resource cannot be null.")
+
private var dict: Set[String] = _
init()
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
index cb1baae2..28446481 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
@@ -178,6 +178,9 @@ import
org.apache.nlpcraft.nlp.enrichers.NCEnStopWordsTokenEnricher.*
*
* More information about stopwords can be found at
[[https://en.wikipedia.org/wiki/Stop_word]].
*
+ * * `stemmer` implementation language should correspond to the language of other
components of [[NCPipeline]], but
+ * the required `stemmer` implementation is independent of other components'
stemmers.
+ *
* **NOTE:** this implementation requires `lemma` and `pos` string
[[NCPropertyMap metadata]] properties that
* contain token's lemma and part of speech accordingly. You can configure
[[NCOpenNLPTokenEnricher]] with the model
* for English language that would provide these metadata properties before
this enricher in your [[NCPipeline pipeline]].
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala
index 0321fab0..a51284da 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala
@@ -37,17 +37,17 @@ import scala.concurrent.ExecutionContext
* This OpenNLP enricher requires PoS and lemma models. Some of OpenNLP
community models can be found
* [[https://opennlp.sourceforge.net/models-1.5/ here]].
*
- * @param posMdlRes Relative path, absolute path or URL to
+ * @param posMdlRes Relative path, absolute path, classpath resource or URL to
*
[[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/postag/POSTaggerME.html
POSTaggerME]] model.
* Can be `null` if **part-of-speech** model is not configured, so
`pos` property will not be set.
* Note that at least one of the models must be provided.
- * @param lemmaDicRes Relative path, absolute path or URL to
+ * @param lemmaDicRes Relative path, absolute path, classpath resource or URL
to
*
[[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/lemmatizer/DictionaryLemmatizer.html
DictionaryLemmatizer]] model.
* Can be `null` if **lemmatizer** model is not configured, so
`lemma` property will not be set.
* Note that at least one of the models must be provided.
*/
class NCOpenNLPTokenEnricher(posMdlRes: String = null, lemmaDicRes: String =
null) extends NCTokenEnricher with LazyLogging:
- require(posMdlRes != null || lemmaDicRes != null, "At least one model must
be defined")
+ require(posMdlRes != null || lemmaDicRes != null, "At least one model must
be defined.")
private var tagger: POSTaggerME = _
private var lemmatizer: DictionaryLemmatizer = _
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
index 85efc02d..ff0ebc98 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
@@ -34,16 +34,19 @@ import java.util.Objects
* `false` value indicates otherwise.
*
* Read more about stemming [[https://en.wikipedia.org/wiki/Stemming here]].
+ * Dictionary language and `stemmer` implementation language should
correspond to the language of other components of [[NCPipeline]], but
+ * the required `stemmer` implementation is independent of other components'
stemmers.
+ *
* Stemming is used here because it is too difficult to be based on more
accurate `lemma` approach for swear words.
*
- * @param dictRes Path to the swear dictionary. The dictionary should have a
simple
+ * @param dictRes Relative path, absolute path, classpath resource or URL to
the swear dictionary. The dictionary should have a simple
* plain text format with *one word per line*, empty lines are
skipped, duplicates ignored, header or other comments allowed.
- * Headers are lines started by **#** symbol. Search in the
dictionary is implemented by input words **stems**, case is ignored.
+ * Headers are lines started with **#** symbol. Search in the
dictionary is implemented by input words **stems**, case is ignored.
* @param stemmer Stemmer implementation for the dictionary language.
*/
//noinspection ScalaWeakerAccess
class NCSwearWordsTokenEnricher(dictRes: String, stemmer: NCStemmer) extends
NCTokenEnricher with LazyLogging:
- require(dictRes != null, "Swear words model file cannot be null.")
+ require(dictRes != null, "Swear words dictonary resource cannot be null.")
require(stemmer != null, "Stemmer cannot be null.")
private var swearWords: Set[String] = _
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala
index b21f34ec..c805c21b 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala
@@ -48,7 +48,7 @@ import org.apache.nlpcraft.nlp.parsers.NCNLPEntityParser.*
* By default all [[NCToken]] instances converted.
*/
class NCNLPEntityParser(predicate: NCToken => Boolean = _ => true) extends
NCEntityParser:
- require(predicate != null)
+ require(predicate != null, "Predicate cannot be null.")
/** @inheritdoc */
override def parse(req: NCRequest, cfg: NCModelConfig, toks:
List[NCToken]): List[NCEntity] =
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala
index d29e2daa..e931695f 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala
@@ -39,7 +39,7 @@ object NCOpenNLPEntityParser:
/**
* Creates [[NCOpenNLPEntityParser]] instance.
*
- * @param mdl Path to
[[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/namefind/TokenNameFinderModel.html
model]].
+ * @param mdl Relative path, absolute path, classpath resource or URL to
[[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/namefind/TokenNameFinderModel.html
model]].
* @return [[NCOpenNLPEntityParser]] instance.
*/
def apply(mdl: String): NCOpenNLPEntityParser =
@@ -59,10 +59,10 @@ object NCOpenNLPEntityParser:
*
* **NOTE:** that each input [[NCToken]] can be included into several output
[[NCEntity]] instances.
*
- * @param findersMdlsRes Paths to
[[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/namefind/TokenNameFinderModel.html
models]].
+ * @param findersMdlsRes Relative paths, absolute paths, resources or URLs to
[[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/namefind/TokenNameFinderModel.html
models]].
*/
class NCOpenNLPEntityParser(findersMdlsRes: List[String]) extends
NCEntityParser with LazyLogging:
- require(findersMdlsRes != null && findersMdlsRes.nonEmpty, "Models paths
cannot be null or empty.")
+ require(findersMdlsRes != null && findersMdlsRes.nonEmpty, "Models
resources cannot be null or empty.")
private var finders: Seq[NameFinderME] = _
private case class Holder(start: Int, end: Int, name: String, probability:
Double)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParser.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParser.scala
index a148b3bb..2c1dc7ef 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParser.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParser.scala
@@ -32,10 +32,10 @@ import java.util.Objects
*
* Some of OpenNLP prepared models can be found
[[https://opennlp.sourceforge.net/models-1.5/ here]].
*
- * @param tokMdlRes Path to
[[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/tokenize/TokenizerModel.html
model]].
+ * @param tokMdlRes Relative path, absolute path, classpath resource or URL
to
[[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/tokenize/TokenizerModel.html
model]].
*/
class NCOpenNLPTokenParser(tokMdlRes: String) extends NCTokenParser with
LazyLogging:
- require(tokMdlRes != null, "Tokenizer model path cannot be null.")
+ require(tokMdlRes != null, "Tokenizer model resource cannot be null.")
@volatile private var tokenizer: TokenizerME = _
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
index 8de55ac1..4ef25fc3 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
@@ -51,9 +51,9 @@ object NCSemanticEntityParser:
elements: List[NCSemanticElement]
): NCSemanticEntityParser =
require(stemmer != null, "Stemmer cannot be null.")
- require(parser != null, "Parser cannot be null.")
+ require(parser != null, "Token parser cannot be null.")
require(macros != null, "Macros cannot be null.")
- require(elements != null, "Elements cannot be null.")
+ require(elements != null && elements.nonEmpty, "Elements cannot be
null or empty.")
new NCSemanticEntityParser(stemmer, parser, macros = macros, elements
= elements)
@@ -71,8 +71,8 @@ object NCSemanticEntityParser:
elements: List[NCSemanticElement]
): NCSemanticEntityParser =
require(stemmer != null, "Stemmer cannot be null.")
- require(parser != null, "Parser cannot be null.")
- require(elements != null, "Elements cannot be null.")
+ require(parser != null, "Token parser cannot be null.")
+ require(elements != null && elements.nonEmpty, "Elements cannot be
null or empty.")
new NCSemanticEntityParser(stemmer, parser, macros = Map.empty,
elements = elements)
@@ -82,12 +82,12 @@ object NCSemanticEntityParser:
*
* @param stemmer [[NCStemmer]] implementation for synonyms language.
* @param parser [[NCTokenParser]] implementation.
- * @param mdlRes Classpath resource, file path or URL for YAML or JSON
semantic model definition file.
+ * @param mdlRes Relative path, absolute path, classpath resource or URL
to YAML or JSON semantic model definition.
*/
def apply(stemmer: NCStemmer, parser: NCTokenParser, mdlRes: String):
NCSemanticEntityParser =
require(stemmer != null, "Stemmer cannot be null.")
- require(parser != null, "Parser cannot be null.")
- require(mdlRes != null, "Model path cannot be null.")
+ require(parser != null, "Token parser cannot be null.")
+ require(mdlRes != null, "Model resource cannot be null.")
new NCSemanticEntityParser(stemmer, parser, mdlResOpt = mdlRes.?)
@@ -183,13 +183,15 @@ import
org.apache.nlpcraft.nlp.parsers.NCSemanticEntityParser.*
* See detailed description on the website
[[https://nlpcraft.apache.org/built-in-entity-parser.html#parser-semantic
Semantic Parser]].
*
* **NOTE:** [[NCSemanticElement]] synonyms, **stemmer** and **parser**
parameters must be configured for the same language.
+ * `stemmer` implementation language should correspond to the language of other
components of [[NCPipeline]], but
+ * the required `stemmer` implementation is independent of other components'
stemmers.
*
* @see [[NCSemanticElement]]
* @param stemmer [[NCStemmer]] implementation for synonyms language.
* @param parser [[NCTokenParser]] implementation.
* @param macros Macros map. Empty by default.
* @param elements [[NCSemanticElement]] list.
- * @param mdlResOpt Optional classpath resource, file path or URL for YAML or
JSON semantic model definition file.
+ * @param mdlResOpt Optional relative path, absolute path, classpath resource
or URL to YAML or JSON semantic model definition.
*/
class NCSemanticEntityParser(
stemmer: NCStemmer,
@@ -198,10 +200,10 @@ class NCSemanticEntityParser(
elements: List[NCSemanticElement] = List.empty,
mdlResOpt: Option[String] = None
) extends NCEntityParser with LazyLogging:
- require(stemmer != null)
- require(parser != null)
- require(macros != null)
- require(elements != null && elements.nonEmpty || mdlResOpt.isDefined)
+ require(stemmer != null, "Stemmer cannot be null.")
+ require(parser != null, "Token parser cannot be null.")
+ require(macros != null, "Macroses cannot be null.")
+ require(elements != null && elements.nonEmpty || mdlResOpt.isDefined,
"Elements cannot be null or empty or model resource cannot be empty.")
private lazy val scrType =
require(mdlResOpt.isDefined)