This is an automated email from the ASF dual-hosted git repository.
aradzinski pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/master by this push:
new 9448a41b WIP
9448a41b is described below
commit 9448a41be9a25624a418132c8319f84325cfd397
Author: Aaron Radzinski <[email protected]>
AuthorDate: Wed Dec 7 16:29:29 2022 -0800
WIP
---
.../scala/org/apache/nlpcraft/NCEntityEnricher.scala | 1 +
.../scala/org/apache/nlpcraft/NCEntityValidator.scala | 1 +
.../scala/org/apache/nlpcraft/NCPipelineBuilder.scala | 2 +-
.../scala/org/apache/nlpcraft/NCTokenEnricher.scala | 1 +
.../scala/org/apache/nlpcraft/NCTokenValidator.scala | 1 +
.../nlp/enrichers/NCEnBracketsTokenEnricher.scala | 12 ++++++------
.../nlp/enrichers/NCEnDictionaryTokenEnricher.scala | 17 +++++++++++------
.../nlp/enrichers/NCEnQuotesTokenEnricher.scala | 3 ++-
.../nlp/enrichers/NCEnStopWordsTokenEnricher.scala | 2 +-
.../nlp/enrichers/NCEnSwearWordsTokenEnricher.scala | 1 +
...TokenEnricher.scala => NCOpenNLPTokenEnricher.scala} | 2 +-
.../apache/nlpcraft/nlp/parsers/NCSemanticStemmer.scala | 2 +-
.../org/apache/nlpcraft/nlp/util/NCTestConfig.scala | 2 +-
13 files changed, 29 insertions(+), 18 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityEnricher.scala
index 84d98fc2..3dafa0f9 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityEnricher.scala
@@ -33,6 +33,7 @@ package org.apache.nlpcraft
* @see [[NCEntityValidator]]
* @see [[NCEntityMapper]]
*/
+//noinspection DuplicatedCode
trait NCEntityEnricher extends NCLifecycle:
/**
* Enriches given list of entities by settings their properties.
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityValidator.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityValidator.scala
index 359cfb5d..4798ca45 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityValidator.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityValidator.scala
@@ -37,6 +37,7 @@ package org.apache.nlpcraft
* @see [[NCEntityValidator]]
* @see [[NCEntityMapper]]
*/
+//noinspection DuplicatedCode
trait NCEntityValidator extends NCLifecycle:
/**
* Validates the final list of parsed and enriched entities.
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
index 91aa2e09..0ca12729 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
@@ -217,7 +217,7 @@ class NCPipelineBuilder:
*/
private def setEnComponents(): Unit =
tokParser = mkEnOpenNLPTokenParser.?
- tokEnrichers += new
NCOpenNLPLemmaPosTokenEnricher(NCResourceReader.getPath("opennlp/en-pos-maxent.bin"),
NCResourceReader.getPath("opennlp/en-lemmatizer.dict"))
+ tokEnrichers += new
NCOpenNLPTokenEnricher(NCResourceReader.getPath("opennlp/en-pos-maxent.bin"),
NCResourceReader.getPath("opennlp/en-lemmatizer.dict"))
tokEnrichers += new NCEnStopWordsTokenEnricher
tokEnrichers += new
NCEnSwearWordsTokenEnricher(NCResourceReader.getPath("badfilter/swear_words.txt"))
tokEnrichers += new NCEnQuotesTokenEnricher
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenEnricher.scala
index 334796bf..cc24f14e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenEnricher.scala
@@ -31,6 +31,7 @@ package org.apache.nlpcraft
* @see [[NCEntityValidator]]
* @see [[NCEntityMapper]]
*/
+//noinspection DuplicatedCode
trait NCTokenEnricher extends NCLifecycle:
/**
* Enriches, or otherwise modifies, previously parsed [[NCToken tokens]].
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenValidator.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenValidator.scala
index f29ced61..d938763d 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenValidator.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenValidator.scala
@@ -33,6 +33,7 @@ package org.apache.nlpcraft
* @see [[NCEntityValidator]]
* @see [[NCEntityMapper]]
*/
+//noinspection DuplicatedCode
trait NCTokenValidator extends NCLifecycle:
/**
* Validates given list of tokens. If validation fails this method should
throw an [[NCException]]. Note that
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnBracketsTokenEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnBracketsTokenEnricher.scala
index 9d6eceba..29e562e7 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnBracketsTokenEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnBracketsTokenEnricher.scala
@@ -24,15 +24,15 @@ import java.io.*
import scala.collection.mutable
/**
- * [[NCTokenEnricher]] built-in English language implementation.
+ * Brackets [[NCTokenEnricher enricher]] for English language.
*
- * It adds <code>brackets</code> boolean property to [[NCToken]] instance if
word which it represents is in brackets.
+ * This enricher adds `brackets` boolean [[NCPropertyMap metadata]] property
to the [[NCToken token]]
+ * instance if the word it represents is enclosed in brackets. Supported
brackets are: `()`, `{}`,
+ * `[]` and `<>`.
*
- * Supported brackets are: <code>()</code>, <code>{}</code>, <code>[]</code>
and <code><></code>.
- *
- * Note that invalid enclosed brackets are ignored.
+ * **NOTE:** invalid enclosed brackets are ignored.
*/
-//noinspection DuplicatedCode
+//noinspection DuplicatedCode,ScalaWeakerAccess
class NCEnBracketsTokenEnricher extends NCTokenEnricher with LazyLogging:
override def enrich(req: NCRequest, cfg: NCModelConfig, toks:
List[NCToken]): Unit =
val stack = new java.util.Stack[String]()
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnDictionaryTokenEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnDictionaryTokenEnricher.scala
index 5f250c0e..67615aa1 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnDictionaryTokenEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnDictionaryTokenEnricher.scala
@@ -21,15 +21,20 @@ import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.util.NCUtils
/**
- * [[NCTokenEnricher]] built-in English language implementation.
+ * "Known-word" [[NCTokenEnricher enricher]] for English language.
*
- * It adds <code>dict</code> boolean property to [[NCToken]] instance if word
which it represents is
- * valid English word. That means that English dictionary contains this word
initial form.
- * Look more about [[https://en.wikipedia.org/wiki/Moby_Project Moby
Project]] EN dictonary used here.
+ * This enricher adds `dict` boolean [[NCPropertyMap metadata]] property to
the [[NCToken token]]
+ * instance if the word it represents is a known English word, i.e. the English
dictionary contains this word's
+ * lemma. The value `true` of the metadata property indicates that this
word's lemma is found in the dictionary,
+ * `false` value indicates otherwise.
*
- * Note that this implementation requires <code>lemma</code> string property
in [[NCToken]] instance.
- * You can configure [[NCOpenNLPLemmaPosTokenEnricher]] before
[[NCEnDictionaryTokenEnricher]] in your [[NCPipeline]].
+ * Implementation uses the [[https://en.wikipedia.org/wiki/Moby_Project Moby
Project]] English dictionary.
+ *
+ * **NOTE:** this implementation requires `lemma` string [[NCPropertyMap
metadata]] property that contains
+ * token's lemma. You can configure [[NCOpenNLPTokenEnricher]] that provides
this metadata property before
+ * this enricher in your [[NCPipeline pipeline]].
*/
+//noinspection DuplicatedCode,ScalaWeakerAccess
class NCEnDictionaryTokenEnricher extends NCTokenEnricher:
private var dict: Set[String] = _
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnQuotesTokenEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnQuotesTokenEnricher.scala
index cc53bddf..ac6428c8 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnQuotesTokenEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnQuotesTokenEnricher.scala
@@ -26,8 +26,9 @@ import org.apache.nlpcraft.*
* It adds <code>quoted</code> boolean property to [[NCToken]] instance if
word which it represents is in quotes.
*
* Note that this implementation requires <code>pos</code> string property in
[[NCToken]] instance.
- * You can configure [[NCOpenNLPLemmaPosTokenEnricher]] before
[[NCEnQuotesTokenEnricher]] in your [[NCPipeline]].
+ * You can configure [[NCOpenNLPTokenEnricher]] before
[[NCEnQuotesTokenEnricher]] in your [[NCPipeline]].
*/
+//noinspection ScalaWeakerAccess
class NCEnQuotesTokenEnricher extends NCTokenEnricher with LazyLogging:
private final val Q_POS: Set[String] = Set("``", "''")
private def getPos(t: NCToken): String = t.get("pos").getOrElse(throw new
NCException("POS not found in token."))
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
index 0dca57bb..a6c052c4 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
@@ -169,7 +169,7 @@ import
org.apache.nlpcraft.nlp.enrichers.NCEnStopWordsTokenEnricher.*
* Look more about stop-words [[https://en.wikipedia.org/wiki/Stop_word
here]].
*
* Note that this implementation requires <code>pos</code> and
<code>lemma</code> string properties in [[NCToken]] instance.
- * You can configure [[NCOpenNLPLemmaPosTokenEnricher]] before
[[NCEnQuotesTokenEnricher]] in your [[NCPipeline]].
+ * You can configure [[NCOpenNLPTokenEnricher]] before
[[NCEnStopWordsTokenEnricher]] in your [[NCPipeline]].
*
* @param addStopsSet User defined additional stop-words collection.
* @param exclStopsSet Collection of words which should not be marked as
stop-words during component processing.
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnSwearWordsTokenEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnSwearWordsTokenEnricher.scala
index 4191b13c..2072c435 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnSwearWordsTokenEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnSwearWordsTokenEnricher.scala
@@ -34,6 +34,7 @@ import java.util.Objects
* Note that [[NCPipelineBuilder.withSemantic()]] methods use for English
language
*
[[https://raw.githubusercontent.com/apache/incubator-nlpcraft/external_config/external/badfilter/swear_words.txt
NlpCraft Swearword Dictionary]]
*/
+//noinspection ScalaWeakerAccess
class NCEnSwearWordsTokenEnricher(res: String) extends NCTokenEnricher with
LazyLogging:
require(res != null, "Swear words model file cannot be null.")
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPLemmaPosTokenEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala
similarity index 97%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPLemmaPosTokenEnricher.scala
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala
index 9127efaa..3467daea 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPLemmaPosTokenEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala
@@ -43,7 +43,7 @@ import scala.concurrent.ExecutionContext
* @param lemmaDicSrc Path to
[[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/lemmatizer/DictionaryLemmatizer.html
DictionaryLemmatizer]] model.
* Note that [[NCPipelineBuilder.withSemantic()]] methods use for English
language
[[https://raw.githubusercontent.com/richardwilly98/elasticsearch-opennlp-auto-tagging/master/src/main/resources/models/en-lemmatizer.dict
en-lemmatizer.dict]].
*/
-class NCOpenNLPLemmaPosTokenEnricher(posMdlSrc: String = null, lemmaDicSrc:
String = null) extends NCTokenEnricher with LazyLogging:
+class NCOpenNLPTokenEnricher(posMdlSrc: String = null, lemmaDicSrc: String =
null) extends NCTokenEnricher with LazyLogging:
private var tagger: POSTaggerME = _
private var lemmatizer: DictionaryLemmatizer = _
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticStemmer.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticStemmer.scala
index 91f1bc54..eae2f643 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticStemmer.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticStemmer.scala
@@ -21,7 +21,7 @@ package org.apache.nlpcraft.nlp.parsers
*
* Stemmer trait. Read more about stemming
[[https://en.wikipedia.org/wiki/Stemming here]].
*
- * See detailed description
[[https://nlpcraft.apache.org/built-in-entity-parser.html#parser-semantic
Semantic Parser]].
+ * See detailed description
[[https://nlpcraft.apache.org/built-in-entity-parser.html#parser-semantic
Semantic Parser]].
*
* @see [[NCSemanticEntityParser]]
* @see [[NCSemanticElement]]
diff --git
a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestConfig.scala
b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestConfig.scala
index 545a6403..03737491 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestConfig.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestConfig.scala
@@ -28,7 +28,7 @@ final val CFG = NCModelConfig("testId", "test", "1.0", desc =
"Test description"
final val EN_TOK_PARSER = new
NCOpenNLPTokenParser(R.getPath("opennlp/en-token.bin"))
final val EN_TOK_STOP_ENRICHER = new NCEnStopWordsTokenEnricher
final val EN_TOK_LEMMA_POS_ENRICHER =
- new NCOpenNLPLemmaPosTokenEnricher(R.getPath("opennlp/en-pos-maxent.bin"),
R.getPath("opennlp/en-lemmatizer.dict"))
+ new NCOpenNLPTokenEnricher(R.getPath("opennlp/en-pos-maxent.bin"),
R.getPath("opennlp/en-lemmatizer.dict"))
final def mkEmptyEnPipeline: NCTestPipeline = NCTestPipeline(EN_TOK_PARSER)
final def mkEnPipeline(ep: NCEntityParser): NCTestPipeline =
val pl = mkEmptyEnPipeline