This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-520 by this push:
new e172d5a5 WIP.
e172d5a5 is described below
commit e172d5a568716dceb63d51cb01fee840e7630842
Author: Sergey Kamov <[email protected]>
AuthorDate: Fri Dec 23 12:27:42 2022 +0400
WIP.
---
.../entity/parser/NCFrSemanticEntityParser.scala | 2 +-
.../entity/parser/NCRuSemanticEntityParser.scala | 2 +-
.../components/PizzeriaModelPipeline.scala | 4 +-
.../scala/org/apache/nlpcraft/NCEntityParser.scala | 14 +++
.../org/apache/nlpcraft/NCPipelineBuilder.scala | 4 +-
.../nlp/enrichers/NCEnStopWordsTokenEnricher.scala | 2 +-
.../nlpcraft/nlp/parsers/NCNLPEntityParser.scala | 12 +--
.../nlp/parsers/NCOpenNLPEntityParser.scala | 21 ++--
.../nlp/parsers/NCSemanticEntityParser.scala | 112 +++++++++------------
.../org/apache/nlpcraft/nlp/util/NCTestUtils.scala | 6 +-
10 files changed, 83 insertions(+), 96 deletions(-)
diff --git a/nlpcraft-examples/lightswitch-fr/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCFrSemanticEntityParser.scala b/nlpcraft-examples/lightswitch-fr/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCFrSemanticEntityParser.scala
index e119a08a..8252468d 100644
--- a/nlpcraft-examples/lightswitch-fr/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCFrSemanticEntityParser.scala
+++ b/nlpcraft-examples/lightswitch-fr/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCFrSemanticEntityParser.scala
@@ -33,5 +33,5 @@ class NCFrSemanticEntityParser(src: String) extends NCSemanticEntityParser(
         override def stem(word: String): String = stemmer.synchronized { stemmer.stem(word.toLowerCase).toString }
     ,
     new NCFrTokenParser(),
-    mdlResOpt = src.?
+    src
 )
diff --git a/nlpcraft-examples/lightswitch-ru/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCRuSemanticEntityParser.scala b/nlpcraft-examples/lightswitch-ru/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCRuSemanticEntityParser.scala
index cafeef39..775a5ccf 100644
--- a/nlpcraft-examples/lightswitch-ru/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCRuSemanticEntityParser.scala
+++ b/nlpcraft-examples/lightswitch-ru/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCRuSemanticEntityParser.scala
@@ -33,5 +33,5 @@ class NCRuSemanticEntityParser(src: String) extends NCSemanticEntityParser(
         override def stem(word: String): String = stemmer.synchronized { stemmer.stem(word.toLowerCase).toString }
     ,
     new NCRuTokenParser(),
-    mdlResOpt = src.?
+    src
 )
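For illustration, a minimal usage sketch of these example parsers after the change: the model resource is now passed straight through to the NCSemanticEntityParser constructor. The YAML file names below are assumed placeholders, not values from this diff.

    import org.apache.nlpcraft.examples.lightswitch.nlp.entity.parser.{NCFrSemanticEntityParser, NCRuSemanticEntityParser}

    // Hypothetical usage; the resource names are illustrative only.
    val ruParser = new NCRuSemanticEntityParser("lightswitch_model_ru.yaml")
    val frParser = new NCFrSemanticEntityParser("lightswitch_model_fr.yaml")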
diff --git a/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala b/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala
index 95464cc8..4735f476 100644
--- a/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala
+++ b/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala
@@ -21,13 +21,13 @@ object PizzeriaModelPipeline:
         new StanfordCoreNLP(props)
     val tokParser = new NCStanfordNLPTokenParser(stanford)
-    import PizzeriaOrderMapperDesc as D
+    import org.apache.nlpcraft.examples.pizzeria.components.PizzeriaOrderMapperDesc as D
     new NCPipelineBuilder().
         withTokenParser(tokParser).
         withTokenEnricher(new NCEnStopWordsTokenEnricher()).
         withEntityParser(new NCStanfordNLPEntityParser(stanford, Set("number"))).
-        withEntityParser(NCSemanticEntityParser(new NCEnStemmer, tokParser, "pizzeria_model.yaml")).
+        withEntityParser(new NCSemanticEntityParser(new NCEnStemmer, tokParser, "pizzeria_model.yaml")).
         withEntityMapper(PizzeriaOrderMapper(extra = D("ord:pizza:size", "ord:pizza:size:value"), dests = D("ord:pizza", "ord:pizza:size"))).
         withEntityMapper(PizzeriaOrderMapper(extra = D("stanford:number", "stanford:number:nne"), dests = D("ord:pizza", "ord:pizza:qty"), D("ord:drink", "ord:drink:qty"))).
         withEntityValidator(new PizzeriaOrderValidator()).
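For reference, a compressed sketch of the same wiring with the semantic parser now created via new, using the tokParser value defined above; only the parser-related calls are shown, and build() is assumed to finalize the pipeline as elsewhere in NLPCraft.

    val pipeline = new NCPipelineBuilder().
        withTokenParser(tokParser).
        withEntityParser(new NCSemanticEntityParser(new NCEnStemmer, tokParser, "pizzeria_model.yaml")).
        build()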
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityParser.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityParser.scala
index 6c111dd7..fd77ef68 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityParser.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityParser.scala
@@ -19,6 +19,20 @@ package org.apache.nlpcraft
/**
 * A pipeline component that converts list of tokens into the list of entities.
+ *
+ * A parser instance can produce [[NCEntity]] instances of different types.
+ * Each [[NCEntity]] instance contains a list of [[NCToken]] instances, and
+ * each [[NCToken]] instance can belong to multiple [[NCEntity]] instances.
+ * The order of the resulting entities list is not important.
+ *
+ * Example: for the [[NCToken tokens]] **San** and **Diego**, two [[NCEntity entities]] can be found:
+ *  - A **City** entity which contains the tokens **San** and **Diego**.
+ *  - A **Name** entity which contains the token **Diego**.
+ *
+ * **NOTE** that even if this parser instance produces [[NCEntity]] instances of only one type,
+ * the [[NCPipeline]] can contain multiple [[NCEntityParser]] instances, so the total result set of [[NCEntity]] instances can contain
+ * entities of different types. Based on this total result set of entities, the system prepares [[NCVariant]] instances.
+ *
 * See [[NCPipeline]] for documentation on the overall processing pipeline. Note that pipeline
 * must have at least one entity parser.
*
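A rough sketch of a parser matching the San/Diego example in the new documentation above; the method and member names (parse, getTokens, getRequestId, getId) are assumed from the surrounding API docs, and NCPropertyMapAdapter is used only as a convenient adapter. This is an illustrative sketch, not the project's own implementation.

    import org.apache.nlpcraft.*

    class CityNameEntityParser extends NCEntityParser:
        // Helper assumed for illustration: wraps a token list into an entity with the given ID.
        private def mkEntity(id: String, req: NCRequest, toks: List[NCToken]): NCEntity =
            new NCPropertyMapAdapter with NCEntity:
                override def getTokens: List[NCToken] = toks
                override def getRequestId: String = req.getRequestId
                override def getId: String = id

        override def parse(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): List[NCEntity] =
            // "San Diego" yields two overlapping entities, exactly as described above.
            (toks.find(_.getText == "San"), toks.find(_.getText == "Diego")) match
                case (Some(s), Some(d)) => List(mkEntity("city", req, List(s, d)), mkEntity("name", req, List(d)))
                case _ => List.empty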
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
index 08a3886e..61c673c3 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
@@ -260,7 +260,7 @@ class NCPipelineBuilder:
lang.toUpperCase match
case "EN" =>
setEnComponents()
- entParsers += NCSemanticEntityParser(new NCEnStemmer, mkEnOpenNLPTokenParser, macros, elms)
+ entParsers += new NCSemanticEntityParser(new NCEnStemmer, mkEnOpenNLPTokenParser, macros, elms)
case _ => require(false, s"Unsupported language: $lang")
this
@@ -326,7 +326,7 @@ class NCPipelineBuilder:
lang.toUpperCase match
case "EN" =>
setEnComponents()
- this.entParsers += NCSemanticEntityParser(new NCEnStemmer, mkEnOpenNLPTokenParser, mdlSrc)
+ this.entParsers += new NCSemanticEntityParser(new NCEnStemmer, mkEnOpenNLPTokenParser, mdlSrc)
case _ => require(false, s"Unsupported language: $lang")
this
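For context, these are the branches behind the builder's language-specific convenience methods; a minimal sketch of the typical call path follows. The withSemantic and build method names are assumed from the builder's public API, and the model file name is illustrative.

    val pipeline = new NCPipelineBuilder().
        withSemantic("en", "time_model.yaml").
        build()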
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
index 698c43f0..f2b5d28d 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
@@ -161,7 +161,7 @@ private object NCEnStopWordsTokenEnricher extends LazyLogging:
    private def tokenMix(toks: Seq[NCToken], maxLen: Int = Integer.MAX_VALUE): Seq[Seq[NCToken]] =
        (for (n <- toks.length until 0 by -1 if n <= maxLen) yield toks.sliding(n)).flatten
-import org.apache.nlpcraft.nlp.enrichers.NCEnStopWordsTokenEnricher.*
+import NCEnStopWordsTokenEnricher.*
/**
 * Stopword [[NCTokenEnricher token enricher]] for English (EN) language. Stopwords are the words
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala
index e5f62254..b184f779 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala
@@ -25,24 +25,24 @@ import java.util.stream.Collectors
/**
* [[NCNLPEntityParser]] helper.
*/
-object NCNLPEntityParser:
+private object NCNLPEntityParser:
private val id: String = "nlp:entity"
-import org.apache.nlpcraft.nlp.parsers.NCNLPEntityParser.*
+import NCNLPEntityParser.*
/**
* NLP data [[NCEntityParser entity parser]].
*
- * This parser converts list of input [[NCToken]] instances to list of [[NCEntity]] instances with ID **nlp:entity**.
+ * This parser converts the list of input [[NCToken]] instances one-to-one to a list of [[NCEntity]] instances with ID **nlp:entity**.
 * All [[NCEntity]] instances contain following mandatory [[NCPropertyMap metadata]] properties:
 * - nlp:entity:text
 * - nlp:entity:index
 * - nlp:entity:startCharIndex
 * - nlp:entity:endCharIndex
 *
- * Also created [[NCEntity]] instances receive all another [[NCPropertyMap metadata]] properties
- * which were added by configured in [[NCPipeline pipeline]] token [[org.apache.nlpcraft.NCTokenEnricher enrichers]].
- * These properties identifiers will be prefixed by **nlp:entity:**, for example **nlp:entity:prop**.
+ * Created [[NCEntity]] instances inherit all [[NCToken]] [[NCPropertyMap metadata]] properties,
+ * with the new names prefixed by **nlp:entity:**.
+ * For example, for a property **prop** the new name will be **nlp:entity:prop**.
 *
 * @param predicate Predicate which allows to filter list of converted [[NCToken]] instances.
 * By default all [[NCToken]] instances converted.
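A small sketch of the prefixing rule described above, assuming the get/getOpt accessors of NCPropertyMap; "stopword" stands in for any enricher-added token property and is not taken from this diff.

    import org.apache.nlpcraft.*

    def printNlpProps(e: NCEntity): Unit =
        // Mandatory property listed in the ScalaDoc above.
        println(e.get[String]("nlp:entity:text"))
        // A token property added by an enricher appears under the prefixed key.
        println(e.getOpt[Boolean]("nlp:entity:stopword"))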
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala
index b705a44c..8cb7d661 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala
@@ -32,20 +32,6 @@ import scala.concurrent.ExecutionContext
import scala.language.postfixOps
import scala.util.Using
-/**
- * [[NCOpenNLPEntityParser]] helper.
- */
-object NCOpenNLPEntityParser:
- /**
- * Creates [[NCOpenNLPEntityParser]] instance.
- *
- * @param mdl Relative path, absolute path, classpath resource or URL to [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/namefind/TokenNameFinderModel.html model]].
- * @return [[NCOpenNLPEntityParser]] instance.
- */
- def apply(mdl: String): NCOpenNLPEntityParser =
- require(mdl != null, "Model source cannot be null.")
- new NCOpenNLPEntityParser(List(mdl))
-
/**
 * [[https://opennlp.apache.org/ OpenNLP]] based language independent [[NCEntityParser entity parser]] configured by
* [[https://opennlp.apache.org/ OpenNLP]] **name finders** models.
@@ -64,6 +50,13 @@ object NCOpenNLPEntityParser:
class NCOpenNLPEntityParser(findersMdlsRes: List[String]) extends NCEntityParser with LazyLogging:
    require(findersMdlsRes != null && findersMdlsRes.nonEmpty, "Models resources cannot be null or empty.")
+ /**
+ * Creates [[NCOpenNLPEntityParser]] instance.
+ *
+ * @param mdl Relative path, absolute path, classpath resource or URL to [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/namefind/TokenNameFinderModel.html model]].
+ */
+ def this(mdl: String) = this(List[String](Objects.requireNonNull(mdl)))
+
private var finders: Seq[NameFinderME] = _
    private case class Holder(start: Int, end: Int, name: String, probability: Double)
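With the companion factory replaced by an auxiliary constructor, both forms are now plain new calls; a short sketch follows. The OpenNLP model paths are illustrative.

    import org.apache.nlpcraft.nlp.parsers.NCOpenNLPEntityParser

    val singleModel = new NCOpenNLPEntityParser("opennlp/en-ner-location.bin")
    val multiModel = new NCOpenNLPEntityParser(List("opennlp/en-ner-location.bin", "opennlp/en-ner-date.bin"))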
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
index 4ef25fc3..8004d3e9 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
@@ -35,62 +35,7 @@ import scala.collection.mutable
/**
* [[NCSemanticEntityParser]] helper.
*/
-object NCSemanticEntityParser:
- /**
- * Creates [[NCSemanticEntityParser]] instance.
- *
- * @param stemmer [[NCStemmer]] implementation for synonyms language.
- * @param parser [[NCTokenParser]] implementation.
- * @param macros Macros map. Empty by default.
- * @param elements [[NCSemanticElement]] list.
- */
- def apply(
- stemmer: NCStemmer,
- parser: NCTokenParser,
- macros: Map[String, String],
- elements: List[NCSemanticElement]
- ): NCSemanticEntityParser =
- require(stemmer != null, "Stemmer cannot be null.")
- require(parser != null, "Token parser cannot be null.")
- require(macros != null, "Macros cannot be null.")
- require(elements != null && elements.nonEmpty, "Elements cannot be null or empty.")
-
- new NCSemanticEntityParser(stemmer, parser, macros = macros, elements = elements)
-
- /**
- *
- * Creates [[NCSemanticEntityParser]] instance.
- *
- * @param stemmer [[NCStemmer]] implementation for synonyms language.
- * @param parser [[NCTokenParser]] implementation.
- * @param elements [[NCSemanticElement]] list.
- */
- def apply(
- stemmer: NCStemmer,
- parser: NCTokenParser,
- elements: List[NCSemanticElement]
- ): NCSemanticEntityParser =
- require(stemmer != null, "Stemmer cannot be null.")
- require(parser != null, "Token parser cannot be null.")
- require(elements != null && elements.nonEmpty, "Elements cannot be null or empty.")
-
- new NCSemanticEntityParser(stemmer, parser, macros = Map.empty, elements = elements)
-
- /**
- *
- * Creates [[NCSemanticEntityParser]] instance.
- *
- * @param stemmer [[NCStemmer]] implementation for synonyms language.
- * @param parser [[NCTokenParser]] implementation.
- * @param mdlRes Relative path, absolute path, classpath resource or URL to YAML or JSON semantic model definition.
- */
- def apply(stemmer: NCStemmer, parser: NCTokenParser, mdlRes: String): NCSemanticEntityParser =
- require(stemmer != null, "Stemmer cannot be null.")
- require(parser != null, "Token parser cannot be null.")
- require(mdlRes != null, "Model resource cannot be null.")
-
- new NCSemanticEntityParser(stemmer, parser, mdlResOpt = mdlRes.?)
-
+private object NCSemanticEntityParser:
/**
* @param baseTokens Tokens.
* @param variants Variants without stopwords.
@@ -175,7 +120,7 @@ object NCSemanticEntityParser:
else if i >= data1.size then tmp
 else combine(data1, data2, i + 1, tmp.map(_ :+ data1(i)) ++ tmp.map(_ :+ data2(i)))
-import org.apache.nlpcraft.nlp.parsers.NCSemanticEntityParser.*
+import NCSemanticEntityParser.*
/**
* **Semantic** [[NCEntityParser entity parser]] implementation.
@@ -186,25 +131,60 @@ import org.apache.nlpcraft.nlp.parsers.NCSemanticEntityParser.*
 * `stemmer` implementation language should be corresponded to other components of [[NCPipeline]], but
 * required `stemmer` implementation is independent from other components' stemmers.
*
+ * There are several constructors with different sets of parameters:
+ *  - **stemmer** [[NCStemmer]] implementation which is used for matching tokens against the given [[NCSemanticElement]] synonyms.
+ *  - **parser** [[NCTokenParser]] implementation which is used for tokenization of the given [[NCSemanticElement]] synonyms. It should be the same implementation as the one used in [[NCPipeline.getTokenParser]].
+ *  - **macros** Macros map which is used for expanding [[NCSemanticElement]] synonyms defined via **macros**. Empty by default. See [[https://nlpcraft.apache.org/built-in-entity-parser.html#parser-semantic Macros]] on the website for more details.
+ *  - **elements** Programmatically prepared [[NCSemanticElement]] instances.
+ *  - **mdlRes** Relative path, absolute path, classpath resource or URL to a YAML or JSON semantic model which contains [[NCSemanticElement]] definitions.
+ *
* @see [[NCSemanticElement]]
- * @param stemmer [[NCStemmer]] implementation for synonyms language.
- * @param parser [[NCTokenParser]] implementation.
- * @param macros Macros map. Empty by default.
- * @param elements [[NCSemanticElement]] list.
- * @param mdlResOpt Optional relative path, absolute path, classpath resource or URL to YAML or JSON semantic model definition.
*/
-class NCSemanticEntityParser(
+class NCSemanticEntityParser private (
stemmer: NCStemmer,
parser: NCTokenParser,
- macros: Map[String, String] = Map.empty,
- elements: List[NCSemanticElement] = List.empty,
- mdlResOpt: Option[String] = None
+ macros: Map[String, String],
+ elements: List[NCSemanticElement],
+ mdlResOpt: Option[String]
) extends NCEntityParser with LazyLogging:
require(stemmer != null, "Stemmer cannot be null.")
require(parser != null, "Token parser cannot be null.")
require(macros != null, "Macroses cannot be null.")
    require(elements != null && elements.nonEmpty || mdlResOpt.isDefined, "Elements cannot be null or empty or model resource cannot be empty.")
+ /**
+ * Creates [[NCSemanticEntityParser]] instance.
+ *
+ * @param stemmer [[NCStemmer]] implementation for synonyms language.
+ * @param parser [[NCTokenParser]] implementation.
+ * @param macros Macros map. Empty by default.
+ * @param elements [[NCSemanticElement]] list.
+ */
+ def this(stemmer: NCStemmer, parser: NCTokenParser, macros: Map[String, String], elements: List[NCSemanticElement]) =
+ this(stemmer, parser, macros, elements, None)
+
+ /**
+ *
+ * Creates [[NCSemanticEntityParser]] instance.
+ *
+ * @param stemmer [[NCStemmer]] implementation for synonyms language.
+ * @param parser [[NCTokenParser]] implementation.
+ * @param elements [[NCSemanticElement]] list.
+ */
+ def this(stemmer: NCStemmer, parser: NCTokenParser, elements: List[NCSemanticElement]) =
+ this(stemmer, parser, Map.empty, elements, None)
+
+ /**
+ *
+ * Creates [[NCSemanticEntityParser]] instance.
+ *
+ * @param stemmer [[NCStemmer]] implementation for synonyms language.
+ * @param parser [[NCTokenParser]] implementation.
+ * @param mdlRes Relative path, absolute path, classpath resource or URL to YAML or JSON semantic model definition.
+ */
+ def this(stemmer: NCStemmer, parser: NCTokenParser, mdlRes: String) =
+ this(stemmer, parser, Map.empty, List.empty, mdlRes.?)
+
private lazy val scrType =
require(mdlResOpt.isDefined)
NCSemanticSourceType.detect(mdlResOpt.get)
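To summarize the new public surface, a sketch of the three constructor shapes that replace the removed companion apply() overloads; the stemmer, token parser, elements and model file name are placeholders supplied by the caller.

    import org.apache.nlpcraft.*
    import org.apache.nlpcraft.nlp.parsers.*
    // NCStemmer import elided; its package is assumed to be available alongside the parsers.

    def mkSemanticParsers(
        stemmer: NCStemmer,
        tokParser: NCTokenParser,
        elems: List[NCSemanticElement],
        macros: Map[String, String]
    ): List[NCSemanticEntityParser] =
        List(
            new NCSemanticEntityParser(stemmer, tokParser, "semantic_model.yaml"), // from a YAML/JSON model resource
            new NCSemanticEntityParser(stemmer, tokParser, elems),                 // from programmatic elements
            new NCSemanticEntityParser(stemmer, tokParser, macros, elems)          // elements plus macros expansion
        )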
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
index 7d4395e3..8afc5cf8 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
@@ -125,18 +125,18 @@ object NCTestUtils:
* @param macros
*/
 def mkEnSemanticParser(elms: List[NCSemanticElement], macros: Map[String, String] = Map.empty): NCSemanticEntityParser =
- parsers.NCSemanticEntityParser(new NCEnStemmer, EN_TOK_PARSER, macros, elms)
+ new NCSemanticEntityParser(new NCEnStemmer, EN_TOK_PARSER, macros, elms)
/**
*
* @param elms
*/
def mkEnSemanticParser(elms: NCSemanticElement*): NCSemanticEntityParser =
- parsers.NCSemanticEntityParser(new NCEnStemmer, EN_TOK_PARSER, elms.toList)
+ new NCSemanticEntityParser(new NCEnStemmer, EN_TOK_PARSER, elms.toList)
/**
*
* @param mdlSrc
*/
def mkEnSemanticParser(mdlSrc: String): NCSemanticEntityParser =
- parsers.NCSemanticEntityParser(new NCEnStemmer, EN_TOK_PARSER, mdlSrc)
\ No newline at end of file
+ new NCSemanticEntityParser(new NCEnStemmer, EN_TOK_PARSER, mdlSrc)
\ No newline at end of file