[incubator-nlpcraft] branch master updated: WIP

aradzinski Wed, 07 Dec 2022 16:29:43 -0800

This is an automated email from the ASF dual-hosted git repository.

aradzinski pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git



The following commit(s) were added to refs/heads/master by this push:
     new 9448a41b WIP
9448a41b is described below

commit 9448a41be9a25624a418132c8319f84325cfd397
Author: Aaron Radzinski <[email protected]>
AuthorDate: Wed Dec 7 16:29:29 2022 -0800

    WIP
---
 .../scala/org/apache/nlpcraft/NCEntityEnricher.scala    |  1 +
 .../scala/org/apache/nlpcraft/NCEntityValidator.scala   |  1 +
 .../scala/org/apache/nlpcraft/NCPipelineBuilder.scala   |  2 +-
 .../scala/org/apache/nlpcraft/NCTokenEnricher.scala     |  1 +
 .../scala/org/apache/nlpcraft/NCTokenValidator.scala    |  1 +
 .../nlp/enrichers/NCEnBracketsTokenEnricher.scala       | 12 ++++++------
 .../nlp/enrichers/NCEnDictionaryTokenEnricher.scala     | 17 +++++++++++------
 .../nlp/enrichers/NCEnQuotesTokenEnricher.scala         |  3 ++-
 .../nlp/enrichers/NCEnStopWordsTokenEnricher.scala      |  2 +-
 .../nlp/enrichers/NCEnSwearWordsTokenEnricher.scala     |  1 +
 ...TokenEnricher.scala => NCOpenNLPTokenEnricher.scala} |  2 +-
 .../apache/nlpcraft/nlp/parsers/NCSemanticStemmer.scala |  2 +-
 .../org/apache/nlpcraft/nlp/util/NCTestConfig.scala     |  2 +-
 13 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityEnricher.scala 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityEnricher.scala
index 84d98fc2..3dafa0f9 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityEnricher.scala
@@ -33,6 +33,7 @@ package org.apache.nlpcraft
   * @see [[NCEntityValidator]]
   * @see [[NCEntityMapper]]
   */
+//noinspection DuplicatedCode
 trait NCEntityEnricher extends NCLifecycle:
     /**
       * Enriches given list of entities by settings their properties.
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityValidator.scala 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityValidator.scala
index 359cfb5d..4798ca45 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityValidator.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityValidator.scala
@@ -37,6 +37,7 @@ package org.apache.nlpcraft
   * @see [[NCEntityValidator]]
   * @see [[NCEntityMapper]]
   */
+//noinspection DuplicatedCode
 trait NCEntityValidator extends NCLifecycle:
     /**
       * Validates the final list of parsed and enriched entities.
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
index 91aa2e09..0ca12729 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
@@ -217,7 +217,7 @@ class NCPipelineBuilder:
       */
     private def setEnComponents(): Unit =
         tokParser = mkEnOpenNLPTokenParser.?
-        tokEnrichers += new 
NCOpenNLPLemmaPosTokenEnricher(NCResourceReader.getPath("opennlp/en-pos-maxent.bin"),
 NCResourceReader.getPath("opennlp/en-lemmatizer.dict"))
+        tokEnrichers += new 
NCOpenNLPTokenEnricher(NCResourceReader.getPath("opennlp/en-pos-maxent.bin"), 
NCResourceReader.getPath("opennlp/en-lemmatizer.dict"))
         tokEnrichers += new NCEnStopWordsTokenEnricher
         tokEnrichers += new 
NCEnSwearWordsTokenEnricher(NCResourceReader.getPath("badfilter/swear_words.txt"))
         tokEnrichers += new NCEnQuotesTokenEnricher
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenEnricher.scala 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenEnricher.scala
index 334796bf..cc24f14e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenEnricher.scala
@@ -31,6 +31,7 @@ package org.apache.nlpcraft
   * @see [[NCEntityValidator]]
   * @see [[NCEntityMapper]]
   */
+//noinspection DuplicatedCode
 trait NCTokenEnricher extends NCLifecycle:
     /**
       * Enriches, or otherwise modifies, previously parsed [[NCToken tokens]].
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenValidator.scala 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenValidator.scala
index f29ced61..d938763d 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenValidator.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenValidator.scala
@@ -33,6 +33,7 @@ package org.apache.nlpcraft
   * @see [[NCEntityValidator]]
   * @see [[NCEntityMapper]]
   */
+//noinspection DuplicatedCode
 trait NCTokenValidator extends NCLifecycle:
     /**
       * Validates given list of tokens. If validation fails this method should 
throw an [[NCException]]. Note that
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnBracketsTokenEnricher.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnBracketsTokenEnricher.scala
index 9d6eceba..29e562e7 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnBracketsTokenEnricher.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnBracketsTokenEnricher.scala
@@ -24,15 +24,15 @@ import java.io.*
 import scala.collection.mutable
 
 /**
-  * [[NCTokenEnricher]] built-in English language implementation.
+  * Brackets [[NCTokenEnricher enricher]] for English language.
   *
-  * It adds <code>brackets</code> boolean property to [[NCToken]] instance if 
word which it represents is in brackets.
+  * This enricher adds `brackets` boolean [[NCPropertyMap metadata]] property 
to the [[NCToken token]]
+  * instance if the word it represents is enclosed in brackets. Supported 
brackets are: `()`, `{}`,
+  * `[]` and `<>`.
   *
-  * Supported brackets are: <code>()</code>, <code>{}</code>, <code>[]</code> 
and <code><></code>.
-  *
-  * Note that invalid enclosed brackets are ignored.
+  * **NOTE:** invalid enclosed brackets are ignored.
   */
-//noinspection DuplicatedCode
+//noinspection DuplicatedCode,ScalaWeakerAccess
 class NCEnBracketsTokenEnricher extends NCTokenEnricher with LazyLogging:
     override def enrich(req: NCRequest, cfg: NCModelConfig, toks: 
List[NCToken]): Unit =
         val stack = new java.util.Stack[String]()
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnDictionaryTokenEnricher.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnDictionaryTokenEnricher.scala
index 5f250c0e..67615aa1 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnDictionaryTokenEnricher.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnDictionaryTokenEnricher.scala
@@ -21,15 +21,20 @@ import org.apache.nlpcraft.*
 import org.apache.nlpcraft.internal.util.NCUtils
 
 /**
-  * [[NCTokenEnricher]] built-in English language implementation.
+  * "Known-word" [[NCTokenEnricher enricher]] for English language.
   *
-  * It adds <code>dict</code> boolean property to [[NCToken]] instance if word 
which it represents is
-  * valid English word. That means that English dictionary contains this word 
initial form.
-  * Look more about [[https://en.wikipedia.org/wiki/Moby_Project Moby 
Project]] EN dictonary used here.
+  * This enricher adds `dict` boolean [[NCPropertyMap metadata]] property to 
the [[NCToken token]]
+  * instance if the word it represents is a known English word, i.e. the English 
dictionary contains this word's
+  * lemma. The value `true` of the metadata property indicates that this 
word's lemma is found in the dictionary,
+  * `false` value indicates otherwise.
   *
-  * Note that this implementation requires <code>lemma</code> string property 
in [[NCToken]] instance.
-  * You can configure [[NCOpenNLPLemmaPosTokenEnricher]] before 
[[NCEnDictionaryTokenEnricher]] in your [[NCPipeline]].
+  * Implementation uses the [[https://en.wikipedia.org/wiki/Moby_Project Moby 
Project]] English dictionary.
+  *
+  * **NOTE:** this implementation requires `lemma` string [[NCPropertyMap 
metadata]] property that contains
+  * token's lemma. You can configure [[NCOpenNLPTokenEnricher]] that provides 
this metadata property before
+  * this enricher in your [[NCPipeline pipeline]].
   */
+//noinspection DuplicatedCode,ScalaWeakerAccess
 class NCEnDictionaryTokenEnricher extends NCTokenEnricher:
     private var dict: Set[String] = _
 
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnQuotesTokenEnricher.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnQuotesTokenEnricher.scala
index cc53bddf..ac6428c8 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnQuotesTokenEnricher.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnQuotesTokenEnricher.scala
@@ -26,8 +26,9 @@ import org.apache.nlpcraft.*
   * It adds <code>quoted</code> boolean property to [[NCToken]] instance if 
word which it represents is in quotes.
   *
   * Note that this implementation requires <code>pos</code> string property in 
[[NCToken]] instance.
-  * You can configure [[NCOpenNLPLemmaPosTokenEnricher]] before 
[[NCEnQuotesTokenEnricher]] in your [[NCPipeline]].
+  * You can configure [[NCOpenNLPTokenEnricher]] before 
[[NCEnQuotesTokenEnricher]] in your [[NCPipeline]].
   */
+//noinspection ScalaWeakerAccess
 class NCEnQuotesTokenEnricher extends NCTokenEnricher with LazyLogging:
     private final val Q_POS: Set[String] = Set("``", "''")
     private def getPos(t: NCToken): String = t.get("pos").getOrElse(throw new 
NCException("POS not found in token."))
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
index 0dca57bb..a6c052c4 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
@@ -169,7 +169,7 @@ import 
org.apache.nlpcraft.nlp.enrichers.NCEnStopWordsTokenEnricher.*
   * Look more about stop-words [[https://en.wikipedia.org/wiki/Stop_word 
here]].
   *
   * Note that this implementation requires <code>pos</code> and 
<code>lemma</code> string properties in [[NCToken]] instance.
-  * You can configure [[NCOpenNLPLemmaPosTokenEnricher]] before 
[[NCEnQuotesTokenEnricher]] in your [[NCPipeline]].
+  * You can configure [[NCOpenNLPTokenEnricher]] before 
[[NCEnStopWordsTokenEnricher]] in your [[NCPipeline]].
   *
   * @param addStopsSet User defined additional stop-words collection.
   * @param exclStopsSet Collection of words which should not be marked as 
stop-words during component processing.
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnSwearWordsTokenEnricher.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnSwearWordsTokenEnricher.scala
index 4191b13c..2072c435 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnSwearWordsTokenEnricher.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnSwearWordsTokenEnricher.scala
@@ -34,6 +34,7 @@ import java.util.Objects
   * Note that [[NCPipelineBuilder.withSemantic()]] methods use for English 
language
   * 
[[https://raw.githubusercontent.com/apache/incubator-nlpcraft/external_config/external/badfilter/swear_words.txt
 NlpCraft Swearword Dictionary]]
   */
+//noinspection ScalaWeakerAccess
 class NCEnSwearWordsTokenEnricher(res: String) extends NCTokenEnricher with 
LazyLogging:
     require(res != null, "Swear words model file cannot be null.")
 
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPLemmaPosTokenEnricher.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala
similarity index 97%
rename from 
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPLemmaPosTokenEnricher.scala
rename to 
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala
index 9127efaa..3467daea 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPLemmaPosTokenEnricher.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala
@@ -43,7 +43,7 @@ import scala.concurrent.ExecutionContext
   * @param lemmaDicSrc Path to 
[[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/lemmatizer/DictionaryLemmatizer.html
 DictionaryLemmatizer]] model.
   * Note that [[NCPipelineBuilder.withSemantic()]] methods use for English 
language 
[[https://raw.githubusercontent.com/richardwilly98/elasticsearch-opennlp-auto-tagging/master/src/main/resources/models/en-lemmatizer.dict
 en-lemmatizer.dict]].
   */
-class NCOpenNLPLemmaPosTokenEnricher(posMdlSrc: String = null, lemmaDicSrc: 
String = null) extends NCTokenEnricher with LazyLogging:
+class NCOpenNLPTokenEnricher(posMdlSrc: String = null, lemmaDicSrc: String = 
null) extends NCTokenEnricher with LazyLogging:
     private var tagger: POSTaggerME = _
     private var lemmatizer: DictionaryLemmatizer = _
 
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticStemmer.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticStemmer.scala
index 91f1bc54..eae2f643 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticStemmer.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticStemmer.scala
@@ -21,7 +21,7 @@ package org.apache.nlpcraft.nlp.parsers
   *
   * Stemmer trait. Read more about stemming 
[[https://en.wikipedia.org/wiki/Stemming here]].
   *
-  *  See detailed description 
[[https://nlpcraft.apache.org/built-in-entity-parser.html#parser-semantic 
Semantic Parser]].
+  * See detailed description 
[[https://nlpcraft.apache.org/built-in-entity-parser.html#parser-semantic 
Semantic Parser]].
   *
   * @see [[NCSemanticEntityParser]]
   * @see [[NCSemanticElement]]
diff --git 
a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestConfig.scala 
b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestConfig.scala
index 545a6403..03737491 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestConfig.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestConfig.scala
@@ -28,7 +28,7 @@ final val CFG = NCModelConfig("testId", "test", "1.0", desc = 
"Test description"
 final val EN_TOK_PARSER = new 
NCOpenNLPTokenParser(R.getPath("opennlp/en-token.bin"))
 final val EN_TOK_STOP_ENRICHER = new NCEnStopWordsTokenEnricher
 final val EN_TOK_LEMMA_POS_ENRICHER =
-    new NCOpenNLPLemmaPosTokenEnricher(R.getPath("opennlp/en-pos-maxent.bin"), 
R.getPath("opennlp/en-lemmatizer.dict"))
+    new NCOpenNLPTokenEnricher(R.getPath("opennlp/en-pos-maxent.bin"), 
R.getPath("opennlp/en-lemmatizer.dict"))
 final def mkEmptyEnPipeline: NCTestPipeline = NCTestPipeline(EN_TOK_PARSER)
 final def mkEnPipeline(ep: NCEntityParser): NCTestPipeline =
     val pl = mkEmptyEnPipeline

[incubator-nlpcraft] branch master updated: WIP

Reply via email to