This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-520 by this push:
new e172d5a5 WIP.
e172d5a5 is described below
commit e172d5a568716dceb63d51cb01fee840e7630842
Author: Sergey Kamov <[email protected]>
AuthorDate: Fri Dec 23 12:27:42 2022 +0400
WIP.
---
.../entity/parser/NCFrSemanticEntityParser.scala | 2 +-
.../entity/parser/NCRuSemanticEntityParser.scala | 2 +-
.../components/PizzeriaModelPipeline.scala | 4 +-
.../scala/org/apache/nlpcraft/NCEntityParser.scala | 14 +++
.../org/apache/nlpcraft/NCPipelineBuilder.scala | 4 +-
.../nlp/enrichers/NCEnStopWordsTokenEnricher.scala | 2 +-
.../nlpcraft/nlp/parsers/NCNLPEntityParser.scala | 12 +--
.../nlp/parsers/NCOpenNLPEntityParser.scala | 21 ++--
.../nlp/parsers/NCSemanticEntityParser.scala | 112 +++++++++------------
.../org/apache/nlpcraft/nlp/util/NCTestUtils.scala | 6 +-
10 files changed, 83 insertions(+), 96 deletions(-)
diff --git a/nlpcraft-examples/lightswitch-fr/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCFrSemanticEntityParser.scala b/nlpcraft-examples/lightswitch-fr/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCFrSemanticEntityParser.scala
index e119a08a..8252468d 100644
--- a/nlpcraft-examples/lightswitch-fr/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCFrSemanticEntityParser.scala
+++ b/nlpcraft-examples/lightswitch-fr/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCFrSemanticEntityParser.scala
@@ -33,5 +33,5 @@ class NCFrSemanticEntityParser(src: String) extends NCSemanticEntityParser(
         override def stem(word: String): String = stemmer.synchronized { stemmer.stem(word.toLowerCase).toString }
     ,
     new NCFrTokenParser(),
-    mdlResOpt = src.?
+    src
 )
diff --git a/nlpcraft-examples/lightswitch-ru/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCRuSemanticEntityParser.scala b/nlpcraft-examples/lightswitch-ru/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCRuSemanticEntityParser.scala
index cafeef39..775a5ccf 100644
--- a/nlpcraft-examples/lightswitch-ru/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCRuSemanticEntityParser.scala
+++ b/nlpcraft-examples/lightswitch-ru/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCRuSemanticEntityParser.scala
@@ -33,5 +33,5 @@ class NCRuSemanticEntityParser(src: String) extends NCSemanticEntityParser(
         override def stem(word: String): String = stemmer.synchronized { stemmer.stem(word.toLowerCase).toString }
     ,
     new NCRuTokenParser(),
-    mdlResOpt = src.?
+    src
 )
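For illustration, a minimal usage sketch of these example parsers after the change: the model resource is now passed straight through to the NCSemanticEntityParser constructor. The YAML file names below are assumed placeholders, not values from this diff.

    import org.apache.nlpcraft.examples.lightswitch.nlp.entity.parser.{NCFrSemanticEntityParser, NCRuSemanticEntityParser}

    // Hypothetical usage; the resource names are illustrative only.
    val ruParser = new NCRuSemanticEntityParser("lightswitch_model_ru.yaml")
    val frParser = new NCFrSemanticEntityParser("lightswitch_model_fr.yaml")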
diff --git a/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala b/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala
index 95464cc8..4735f476 100644
--- a/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala
+++ b/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala
@@ -21,13 +21,13 @@ object PizzeriaModelPipeline:
         new StanfordCoreNLP(props)
     val tokParser = new NCStanfordNLPTokenParser(stanford)
-    import PizzeriaOrderMapperDesc as D
+    import org.apache.nlpcraft.examples.pizzeria.components.PizzeriaOrderMapperDesc as D
     new NCPipelineBuilder().
         withTokenParser(tokParser).
         withTokenEnricher(new NCEnStopWordsTokenEnricher()).
         withEntityParser(new NCStanfordNLPEntityParser(stanford, Set("number"))).
-        withEntityParser(NCSemanticEntityParser(new NCEnStemmer, tokParser, "pizzeria_model.yaml")).
+        withEntityParser(new NCSemanticEntityParser(new NCEnStemmer, tokParser, "pizzeria_model.yaml")).
         withEntityMapper(PizzeriaOrderMapper(extra = D("ord:pizza:size", "ord:pizza:size:value"), dests = D("ord:pizza", "ord:pizza:size"))).
         withEntityMapper(PizzeriaOrderMapper(extra = D("stanford:number", "stanford:number:nne"), dests = D("ord:pizza", "ord:pizza:qty"), D("ord:drink", "ord:drink:qty"))).
         withEntityValidator(new PizzeriaOrderValidator()).
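For reference, a compressed sketch of the same wiring with the semantic parser now created via new, using the tokParser value defined above; only the parser-related calls are shown, and build() is assumed to finalize the pipeline as elsewhere in NLPCraft.

    val pipeline = new NCPipelineBuilder().
        withTokenParser(tokParser).
        withEntityParser(new NCSemanticEntityParser(new NCEnStemmer, tokParser, "pizzeria_model.yaml")).
        build()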
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityParser.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityParser.scala
index 6c111dd7..fd77ef68 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityParser.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityParser.scala
@@ -19,6 +19,20 @@ package org.apache.nlpcraft
/**
 * A pipeline component that converts list of tokens into the list of entities.
+ *
+ * A parser instance can produce [[NCEntity]] instances of different types.
+ * Each [[NCEntity]] instance contains a list of [[NCToken]] instances, and
+ * each [[NCToken]] instance can belong to multiple [[NCEntity]] instances.
+ * The order of the resulting entities list is not important.
+ *
+ * Example: for the [[NCToken tokens]] **San** and **Diego**, two [[NCEntity entities]] can be found:
+ *  - A **City** entity which contains the tokens **San** and **Diego**.
+ *  - A **Name** entity which contains the token **Diego**.
+ *
+ * **NOTE** that even if this parser instance produces [[NCEntity]] instances of only one type,
+ * the [[NCPipeline]] can contain multiple [[NCEntityParser]] instances, so the total result set of [[NCEntity]] instances can contain
+ * entities of different types. Based on this total result set of entities, the system prepares [[NCVariant]] instances.
+ *
 * See [[NCPipeline]] for documentation on the overall processing pipeline. Note that pipeline
 * must have at least one entity parser.
*
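A rough sketch of a parser matching the San/Diego example in the new documentation above; the method and member names (parse, getTokens, getRequestId, getId) are assumed from the surrounding API docs, and NCPropertyMapAdapter is used only as a convenient adapter. This is an illustrative sketch, not the project's own implementation.

    import org.apache.nlpcraft.*

    class CityNameEntityParser extends NCEntityParser:
        // Helper assumed for illustration: wraps a token list into an entity with the given ID.
        private def mkEntity(id: String, req: NCRequest, toks: List[NCToken]): NCEntity =
            new NCPropertyMapAdapter with NCEntity:
                override def getTokens: List[NCToken] = toks
                override def getRequestId: String = req.getRequestId
                override def getId: String = id

        override def parse(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): List[NCEntity] =
            // "San Diego" yields two overlapping entities, exactly as described above.
            (toks.find(_.getText == "San"), toks.find(_.getText == "Diego")) match
                case (Some(s), Some(d)) => List(mkEntity("city", req, List(s, d)), mkEntity("name", req, List(d)))
                case _ => List.empty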
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
index 08a3886e..61c673c3 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
@@ -260,7 +260,7 @@ class NCPipelineBuilder:
lang.toUpperCase match
case "EN" =>
setEnComponents()
- entParsers += NCSemanticEntityParser(new NCEnStemmer, mkEnOpenNLPTokenParser, macros, elms)
+ entParsers += new NCSemanticEntityParser(new NCEnStemmer, mkEnOpenNLPTokenParser, macros, elms)
case _ => require(false, s"Unsupported language: $lang")
this
@@ -326,7 +326,7 @@ class NCPipelineBuilder:
lang.toUpperCase match
case "EN" =>
setEnComponents()
- this.entParsers += NCSemanticEntityParser(new NCEnStemmer, mkEnOpenNLPTokenParser, mdlSrc)
+ this.entParsers += new NCSemanticEntityParser(new NCEnStemmer, mkEnOpenNLPTokenParser, mdlSrc)
case _ => require(false, s"Unsupported language: $lang")
this
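For context, these are the branches behind the builder's language-specific convenience methods; a minimal sketch of the typical call path follows. The withSemantic and build method names are assumed from the builder's public API, and the model file name is illustrative.

    val pipeline = new NCPipelineBuilder().
        withSemantic("en", "time_model.yaml").
        build()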
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
index 698c43f0..f2b5d28d 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
@@ -161,7 +161,7 @@ private object NCEnStopWordsTokenEnricher extends LazyLogging:
    private def tokenMix(toks: Seq[NCToken], maxLen: Int = Integer.MAX_VALUE): Seq[Seq[NCToken]] =
        (for (n <- toks.length until 0 by -1 if n <= maxLen) yield toks.sliding(n)).flatten
-import org.apache.nlpcraft.nlp.enrichers.NCEnStopWordsTokenEnricher.*
+import NCEnStopWordsTokenEnricher.*
/**
 * Stopword [[NCTokenEnricher token enricher]] for English (EN) language. Stopwords are the words
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala
index e5f62254..b184f779 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala
@@ -25,24 +25,24 @@ import java.util.stream.Collectors
/**
* [[NCNLPEntityParser]] helper.
*/
-object NCNLPEntityParser:
+private object NCNLPEntityParser:
private val id: String = "nlp:entity"
-import org.apache.nlpcraft.nlp.parsers.NCNLPEntityParser.*
+import NCNLPEntityParser.*
/**
* NLP data [[NCEntityParser entity parser]].
*
- * This parser converts list of input [[NCToken]] instances to list of [[NCEntity]] instances with ID **nlp:entity**.
+ * This parser converts the list of input [[NCToken]] instances one-to-one to a list of [[NCEntity]] instances with ID **nlp:entity**.
 * All [[NCEntity]] instances contain following mandatory [[NCPropertyMap metadata]] properties:
 * - nlp:entity:text
 * - nlp:entity:index
 * - nlp:entity:startCharIndex
 * - nlp:entity:endCharIndex
 *
- * Also created [[NCEntity]] instances receive all another [[NCPropertyMap metadata]] properties
- * which were added by configured in [[NCPipeline pipeline]] token [[org.apache.nlpcraft.NCTokenEnricher enrichers]].
- * These properties identifiers will be prefixed by **nlp:entity:**, for example **nlp:entity:prop**.
+ * Created [[NCEntity]] instances inherit all [[NCToken]] [[NCPropertyMap metadata]] properties,
+ * with the new names prefixed by **nlp:entity:**.
+ * For example, for a property **prop** the new name will be **nlp:entity:prop**.
 *
 * @param predicate Predicate which allows to filter list of converted [[NCToken]] instances.
 * By default all [[NCToken]] instances converted.
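A small sketch of the prefixing rule described above, assuming the get/getOpt accessors of NCPropertyMap; "stopword" stands in for any enricher-added token property and is not taken from this diff.

    import org.apache.nlpcraft.*

    def printNlpProps(e: NCEntity): Unit =
        // Mandatory property listed in the ScalaDoc above.
        println(e.get[String]("nlp:entity:text"))
        // A token property added by an enricher appears under the prefixed key.
        println(e.getOpt[Boolean]("nlp:entity:stopword"))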
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala
index b705a44c..8cb7d661 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala
@@ -32,20 +32,6 @@ import scala.concurrent.ExecutionContext
import scala.language.postfixOps
import scala.util.Using
-/**
- * [[NCOpenNLPEntityParser]] helper.
- */
-object NCOpenNLPEntityParser:
- /**
- * Creates [[NCOpenNLPEntityParser]] instance.
- *
- * @param mdl Relative path, absolute path, classpath resource or URL to [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/namefind/TokenNameFinderModel.html model]].
- * @return [[NCOpenNLPEntityParser]] instance.
- */
- def apply(mdl: String): NCOpenNLPEntityParser =
- require(mdl != null, "Model source cannot be null.")
- new NCOpenNLPEntityParser(List(mdl))
-
/**
 * [[https://opennlp.apache.org/ OpenNLP]] based language independent [[NCEntityParser entity parser]] configured by
* [[https://opennlp.apache.org/ OpenNLP]] **name finders** models.
@@ -64,6 +50,13 @@ object NCOpenNLPEntityParser:
class NCOpenNLPEntityParser(findersMdlsRes: List[String]) extends NCEntityParser with LazyLogging:
    require(findersMdlsRes != null && findersMdlsRes.nonEmpty, "Models resources cannot be null or empty.")
+ /**
+ * Creates [[NCOpenNLPEntityParser]] instance.
+ *
+ * @param mdl Relative path, absolute path, classpath resource or URL to [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/namefind/TokenNameFinderModel.html model]].
+ */
+ def this(mdl: String) = this(List[String](Objects.requireNonNull(mdl)))
+
private var finders: Seq[NameFinderME] = _
    private case class Holder(start: Int, end: Int, name: String, probability: Double)
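With the companion factory replaced by an auxiliary constructor, both forms are now plain new calls; a short sketch follows. The OpenNLP model paths are illustrative.

    import org.apache.nlpcraft.nlp.parsers.NCOpenNLPEntityParser

    val singleModel = new NCOpenNLPEntityParser("opennlp/en-ner-location.bin")
    val multiModel = new NCOpenNLPEntityParser(List("opennlp/en-ner-location.bin", "opennlp/en-ner-date.bin"))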
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
index 4ef25fc3..8004d3e9 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
@@ -35,62 +35,7 @@ import scala.collection.mutable
/**
* [[NCSemanticEntityParser]] helper.
*/
-object NCSemanticEntityParser:
- /**
- * Creates [[NCSemanticEntityParser]] instance.
- *
- * @param stemmer [[NCStemmer]] implementation for synonyms language.
- * @param parser [[NCTokenParser]] implementation.
- * @param macros Macros map. Empty by default.
- * @param elements [[NCSemanticElement]] list.
- */
- def apply(
- stemmer: NCStemmer,
- parser: NCTokenParser,
- macros: Map[String, String],
- elements: List[NCSemanticElement]
- ): NCSemanticEntityParser =
- require(stemmer != null, "Stemmer cannot be null.")
- require(parser != null, "Token parser cannot be null.")
- require(macros != null, "Macros cannot be null.")
- require(elements != null && elements.nonEmpty, "Elements cannot be null or empty.")
-
- new NCSemanticEntityParser(stemmer, parser, macros = macros, elements = elements)
-
- /**
- *
- * Creates [[NCSemanticEntityParser]] instance.
- *
- * @param stemmer [[NCStemmer]] implementation for synonyms language.
- * @param parser [[NCTokenParser]] implementation.
- * @param elements [[NCSemanticElement]] list.
- */
- def apply(
- stemmer: NCStemmer,
- parser: NCTokenParser,
- elements: List[NCSemanticElement]
- ): NCSemanticEntityParser =
- require(stemmer != null, "Stemmer cannot be null.")
- require(parser != null, "Token parser cannot be null.")
- require(elements != null && elements.nonEmpty, "Elements cannot be null or empty.")
-
- new NCSemanticEntityParser(stemmer, parser, macros = Map.empty, elements = elements)
-
- /**
- *
- * Creates [[NCSemanticEntityParser]] instance.
- *
- * @param stemmer [[NCStemmer]] implementation for synonyms language.
- * @param parser [[NCTokenParser]] implementation.
- * @param mdlRes Relative path, absolute path, classpath resource or URL to YAML or JSON semantic model definition.
- */
- def apply(stemmer: NCStemmer, parser: NCTokenParser, mdlRes: String): NCSemanticEntityParser =
- require(stemmer != null, "Stemmer cannot be null.")
- require(parser != null, "Token parser cannot be null.")
- require(mdlRes != null, "Model resource cannot be null.")
-
- new NCSemanticEntityParser(stemmer, parser, mdlResOpt = mdlRes.?)
-
+private object NCSemanticEntityParser:
/**
* @param baseTokens Tokens.
* @param variants Variants without stopwords.
@@ -175,7 +120,7 @@ object NCSemanticEntityParser:
else if i >= data1.size then tmp
 else combine(data1, data2, i + 1, tmp.map(_ :+ data1(i)) ++ tmp.map(_ :+ data2(i)))
-import org.apache.nlpcraft.nlp.parsers.NCSemanticEntityParser.*
+import NCSemanticEntityParser.*
/**
* **Semantic** [[NCEntityParser entity parser]] implementation.
@@ -186,25 +131,60 @@ import org.apache.nlpcraft.nlp.parsers.NCSemanticEntityParser.*
 * `stemmer` implementation language should be corresponded to other components of [[NCPipeline]], but
 * required `stemmer` implementation is independent from other components' stemmers.
*
+ * There are several constructors with different sets of parameters:
+ *  - **stemmer** [[NCStemmer]] implementation which is used for matching tokens against the given [[NCSemanticElement]] synonyms.
+ *  - **parser** [[NCTokenParser]] implementation which is used for tokenization of the given [[NCSemanticElement]] synonyms. It should be the same implementation as the one used in [[NCPipeline.getTokenParser]].
+ *  - **macros** Macros map which is used for expanding [[NCSemanticElement]] synonyms defined via **macros**. Empty by default. See [[https://nlpcraft.apache.org/built-in-entity-parser.html#parser-semantic Macros]] on the website for more details.
+ *  - **elements** Programmatically prepared [[NCSemanticElement]] instances.
+ *  - **mdlRes** Relative path, absolute path, classpath resource or URL to a YAML or JSON semantic model which contains [[NCSemanticElement]] definitions.
+ *
* @see [[NCSemanticElement]]
- * @param stemmer [[NCStemmer]] implementation for synonyms language.
- * @param parser [[NCTokenParser]] implementation.
- * @param macros Macros map. Empty by default.
- * @param elements [[NCSemanticElement]] list.
- * @param mdlResOpt Optional relative path, absolute path, classpath resource or URL to YAML or JSON semantic model definition.
*/
-class NCSemanticEntityParser(
+class NCSemanticEntityParser private (
stemmer: NCStemmer,
parser: NCTokenParser,
- macros: Map[String, String] = Map.empty,
- elements: List[NCSemanticElement] = List.empty,
- mdlResOpt: Option[String] = None
+ macros: Map[String, String],
+ elements: List[NCSemanticElement],
+ mdlResOpt: Option[String]
) extends NCEntityParser with LazyLogging:
require(stemmer != null, "Stemmer cannot be null.")
require(parser != null, "Token parser cannot be null.")
require(macros != null, "Macroses cannot be null.")
    require(elements != null && elements.nonEmpty || mdlResOpt.isDefined, "Elements cannot be null or empty or model resource cannot be empty.")
+ /**
+ * Creates [[NCSemanticEntityParser]] instance.
+ *
+ * @param stemmer [[NCStemmer]] implementation for synonyms language.
+ * @param parser [[NCTokenParser]] implementation.
+ * @param macros Macros map. Empty by default.
+ * @param elements [[NCSemanticElement]] list.
+ */
+ def this(stemmer: NCStemmer, parser: NCTokenParser, macros: Map[String, String], elements: List[NCSemanticElement]) =
+ this(stemmer, parser, macros, elements, None)
+
+ /**
+ *
+ * Creates [[NCSemanticEntityParser]] instance.
+ *
+ * @param stemmer [[NCStemmer]] implementation for synonyms language.
+ * @param parser [[NCTokenParser]] implementation.
+ * @param elements [[NCSemanticElement]] list.
+ */
+ def this(stemmer: NCStemmer, parser: NCTokenParser, elements: List[NCSemanticElement]) =
+ this(stemmer, parser, Map.empty, elements, None)
+
+ /**
+ *
+ * Creates [[NCSemanticEntityParser]] instance.
+ *
+ * @param stemmer [[NCStemmer]] implementation for synonyms language.
+ * @param parser [[NCTokenParser]] implementation.
+ * @param mdlRes Relative path, absolute path, classpath resource or URL to YAML or JSON semantic model definition.
+ */
+ def this(stemmer: NCStemmer, parser: NCTokenParser, mdlRes: String) =
+ this(stemmer, parser, Map.empty, List.empty, mdlRes.?)
+
private lazy val scrType =
require(mdlResOpt.isDefined)
NCSemanticSourceType.detect(mdlResOpt.get)
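To summarize the new public surface, a sketch of the three constructor shapes that replace the removed companion apply() overloads; the stemmer, token parser, elements and model file name are placeholders supplied by the caller.

    import org.apache.nlpcraft.*
    import org.apache.nlpcraft.nlp.parsers.*
    // NCStemmer import elided; its package is assumed to be available alongside the parsers.

    def mkSemanticParsers(
        stemmer: NCStemmer,
        tokParser: NCTokenParser,
        elems: List[NCSemanticElement],
        macros: Map[String, String]
    ): List[NCSemanticEntityParser] =
        List(
            new NCSemanticEntityParser(stemmer, tokParser, "semantic_model.yaml"), // from a YAML/JSON model resource
            new NCSemanticEntityParser(stemmer, tokParser, elems),                 // from programmatic elements
            new NCSemanticEntityParser(stemmer, tokParser, macros, elems)          // elements plus macros expansion
        )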
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
index 7d4395e3..8afc5cf8 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
@@ -125,18 +125,18 @@ object NCTestUtils:
* @param macros
*/
 def mkEnSemanticParser(elms: List[NCSemanticElement], macros: Map[String, String] = Map.empty): NCSemanticEntityParser =
- parsers.NCSemanticEntityParser(new NCEnStemmer, EN_TOK_PARSER, macros, elms)
+ new NCSemanticEntityParser(new NCEnStemmer, EN_TOK_PARSER, macros, elms)
/**
*
* @param elms
*/
def mkEnSemanticParser(elms: NCSemanticElement*): NCSemanticEntityParser =
- parsers.NCSemanticEntityParser(new NCEnStemmer, EN_TOK_PARSER, elms.toList)
+ new NCSemanticEntityParser(new NCEnStemmer, EN_TOK_PARSER, elms.toList)
/**
*
* @param mdlSrc
*/
def mkEnSemanticParser(mdlSrc: String): NCSemanticEntityParser =
- parsers.NCSemanticEntityParser(new NCEnStemmer, EN_TOK_PARSER, mdlSrc)
\ No newline at end of file
+ new NCSemanticEntityParser(new NCEnStemmer, EN_TOK_PARSER, mdlSrc)
\ No newline at end of file