[incubator-nlpcraft] branch NLPCRAFT-520 updated: WIP.

sergeykamov Sun, 08 Jan 2023 05:41:39 -0800

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git



The following commit(s) were added to refs/heads/NLPCRAFT-520 by this push:
     new 514e0976 WIP.
514e0976 is described below

commit 514e0976d14d7572cec62b0ac3119f0ee56d0ac3
Author: Sergey Kamov <[email protected]>
AuthorDate: Sun Jan 8 17:41:28 2023 +0400

    WIP.
---
 .../nlpcraft/nlp/parsers/NCSemanticElement.scala   | 13 ++++++-
 .../nlp/parsers/NCSemanticEntityParser.scala       |  7 ++--
 .../nlp/parsers/NCSemanticEntityParserSpec.scala   | 41 +++++++++++++---------
 3 files changed, 42 insertions(+), 19 deletions(-)

diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticElement.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticElement.scala
index fb4cf6b6..24372675 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticElement.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticElement.scala
@@ -34,7 +34,18 @@ import org.apache.nlpcraft.*
   * with stemmatized forms of user input which were lemmatized preliminarily.
   * This approach allows to provide more accurate matching and doesn't force 
users to prepare synonyms in initial words form.
   *
-  * Also semantic element can have an optional set of special synonyms called 
values or "proper nouns" for this element.
+  * Stemmatization.
+  * Via the single synonym **argue**, the words *argued*, *argues* and 
*arguing* are all matched
+  * by the same stem **argu**.
+  * Note that you can control the stemmatization aggressiveness level by choosing a 
suitable algorithm;
+  * see the following article 
[[https://www.baeldung.com/cs/porter-vs-lancaster-stemming-algorithms 
Differences Between Porter and Lancaster Stemming Algorithms]].
+  * Also please note that the stemmatization approach can be more or less useful 
for different languages.
+  *
+  * Lemmatization.
+  * If an element is defined via the synonym **go**, all of the following user input texts 
are matched:
+  * *go*, *gone*, *goes*, *went*. So, it is enough to define synonyms only in 
their initial word forms.
+  *
+  * Besides the synonyms described above, a semantic element can also have an 
optional set of special synonyms called values or "proper nouns" for this 
element.
   * Unlike basic synonyms, each value is a pair of a name and a set of 
standard synonyms by which that value,
   * and ultimately its element, can be recognized in the user input.
   * Note that the value name itself acts as an implicit synonym even when no 
additional synonyms added for that value.
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
index 49869a4f..ad334245 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
@@ -126,7 +126,7 @@ import NCSemanticEntityParser.*
   * **Semantic** [[NCEntityParser entity parser]] implementation.
   *
   * This synonyms based parser provides simple but very powerful way to find 
domain specific data in the input text.
-  * It is configured via [[NCSemanticElement]] list which are represent 
[[NCEntity name entities]] that
+  * It is configured via [[NCSemanticElement]] list which represents 
[[NCEntity name entities]] that
   * can be produced by this parser.
   *
   * [[NCSemanticElement Semantic elements]]  can be configured via YAML or 
JSON files in special format or
@@ -150,12 +150,15 @@ import NCSemanticEntityParser.*
   *       - "{&lt;CUR&gt;|_} &lt;TIME>"
   *       - "what &lt;TIME&gt; {is it now|now|is it|_}"
   * </pre>
+  * So the **x:time** element can be detected by a large number of synonyms like *day 
time*,
+  * *local day time*, *time of day*, *local time of day*, *what hour is it* 
etc.
+  * Note that these synonyms are configured very compactly, in a way that is easy to extend and 
maintain.
   *
   * See detailed description on the website 
[[https://nlpcraft.apache.org/built-in-entity-parser.html#parser-semantic 
Semantic Parser]].
   * Also look at the 
[[https://github.com/apache/incubator-nlpcraft/tree/master/nlpcraft-examples 
examples section]].
   *
   * **NOTE:** [[NCSemanticElement]] synonyms, **stemmer** and **parser** 
parameters must be configured for the same language.
-  * `stemmer` implementation language should be corresponded to other 
components of [[NCPipeline]], but
+  * `stemmer` implementation language should correspond to that of the other 
components of [[NCPipeline]], but
   * required `stemmer` implementation is independent from other components' 
stemmers.
   *
   * **NOTE:** that parser can produce different types of [[NCEntity]] 
instances and each input [[NCToken]] can be included into several output 
[[NCEntity]] instances.
diff --git 
a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParserSpec.scala
 
b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParserSpec.scala
index e813a45f..8bdabdc3 100644
--- 
a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParserSpec.scala
+++ 
b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParserSpec.scala
@@ -21,6 +21,7 @@ import org.apache.nlpcraft.*
 import annotations.*
 import nlp.parsers.*
 import nlp.util.*
+import org.apache.nlpcraft.nlp.stemmer.NCEnStemmer
 import org.scalatest.funsuite.AnyFunSuite
 
 import scala.collection.mutable
@@ -48,7 +49,9 @@ class NCSemanticEntityParserSpec extends AnyFunSuite:
                 // Regex.
                 NCSemanticTestElement("t7", synonyms = Set("x //[a-d]+//")),
                 // Empty synonyms.
-                NCSemanticTestElement("t8", synonyms = Set("{A|_} {B|_}"))
+                NCSemanticTestElement("t8", synonyms = Set("{A|_} {B|_}")),
+
+                NCSemanticTestElement("t9", synonyms = Set("go"))
             )
         )
 
@@ -107,19 +110,25 @@ class NCSemanticEntityParserSpec extends AnyFunSuite:
       *
       */
     test("test") {
-        check("t1", "t1")
-        check("the t1", "t1")
-        check("t2", "t2")
-        check("the t2", "t2")
-        check("t3 t3", "t3")
-        check("t3 the t3", "t3") // With stopword inside.
-        check("value4", "t4", value = "value4".?)
-        check("value the 5", "t5", value = "value5".?) // With stopword inside.
-        check("t6", "t6", elemData = Map("testKey" -> "testValue").?)
-        check("the x abc x abe", "t7") // `x abc` should be matched, `x abe` 
shouldn't.
-        check("A B", "t8")
-        check("A", "t8")
-        check("B", "t8")
-
-        checkMultiple("t1 the x abc the x the abc", "t1", "t7", "t7")
+//        check("t1", "t1")
+//        check("the t1", "t1")
+//        check("t2", "t2")
+//        check("the t2", "t2")
+//        check("t3 t3", "t3")
+//        check("t3 the t3", "t3") // With stopword inside.
+//        check("value4", "t4", value = "value4".?)
+//        check("value the 5", "t5", value = "value5".?) // With stopword 
inside.
+//        check("t6", "t6", elemData = Map("testKey" -> "testValue").?)
+//        check("the x abc x abe", "t7") // `x abc` should be matched, `x abe` 
shouldn't.
+//        check("A B", "t8")
+//        check("A", "t8")
+//        check("B", "t8")
+//
+//        checkMultiple("t1 the x abc the x the abc", "t1", "t7", "t7")
+        val s = new NCEnStemmer
+
+        println(s.stem("argue"))
+        println(s.stem("argued"))
+        println(s.stem("argues"))
+        println(s.stem("arguing"))
     }
\ No newline at end of file

[incubator-nlpcraft] branch NLPCRAFT-520 updated: WIP.

Reply via email to