This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-520 by this push:
new 514e0976 WIP.
514e0976 is described below
commit 514e0976d14d7572cec62b0ac3119f0ee56d0ac3
Author: Sergey Kamov <[email protected]>
AuthorDate: Sun Jan 8 17:41:28 2023 +0400
WIP.
---
.../nlpcraft/nlp/parsers/NCSemanticElement.scala | 13 ++++++-
.../nlp/parsers/NCSemanticEntityParser.scala | 7 ++--
.../nlp/parsers/NCSemanticEntityParserSpec.scala | 41 +++++++++++++---------
3 files changed, 42 insertions(+), 19 deletions(-)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticElement.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticElement.scala
index fb4cf6b6..24372675 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticElement.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticElement.scala
@@ -34,7 +34,18 @@ import org.apache.nlpcraft.*
* with stemmatized forms of user input which were lemmatized preliminarily.
* This approach allows to provide more accurate matching and doesn't force
users to prepare synonyms in initial words form.
*
- * Also semantic element can have an optional set of special synonyms called
values or "proper nouns" for this element.
+ * Stemmatization.
+ * Via the single synonym **argue**, all of the following words (*argued*, *argues* and
*arguing*) are matched
+ * by the same stem **argu**.
+ * Note that you can control the stemmatization aggressiveness level by choosing
a preferable algorithm;
+ * see the following article
[[https://www.baeldung.com/cs/porter-vs-lancaster-stemming-algorithms
Differences Between Porter and Lancaster Stemming Algorithms]].
+ * Also please note that the stemmatization approach can be more or less useful
for different languages.
+ *
+ * Lemmatization.
+ * If an element is defined via the synonym **go**, all of the following user input texts
are matched:
+ * *go*, *gone*, *goes*, *went*. So, it is enough to define synonyms using only the
initial (base) forms of their words.
+ *
+ * Besides the synonyms described above, a semantic element can also have an
optional set of special synonyms called values or "proper nouns" for this
element.
* Unlike basic synonyms, each value is a pair of a name and a set of
standard synonyms by which that value,
* and ultimately its element, can be recognized in the user input.
* Note that the value name itself acts as an implicit synonym even when no
additional synonyms added for that value.
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
index 49869a4f..ad334245 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
@@ -126,7 +126,7 @@ import NCSemanticEntityParser.*
* **Semantic** [[NCEntityParser entity parser]] implementation.
*
* This synonyms based parser provides simple but very powerful way to find
domain specific data in the input text.
- * It is configured via [[NCSemanticElement]] list which are represent
[[NCEntity name entities]] that
+ * It is configured via [[NCSemanticElement]] list which represents
[[NCEntity name entities]] that
* can be produced by this parser.
*
* [[NCSemanticElement Semantic elements]] can be configured via YAML or
JSON files in special format or
@@ -150,12 +150,15 @@ import NCSemanticEntityParser.*
* - "{<CUR>|_} <TIME>"
* - "what <TIME> {is it now|now|is it|_}"
* </pre>
+ * So the **x:time** element can be detected by a large number of synonyms such as *day
time*,
+ * *local day time*, *time of day*, *local time of day*, *what hour is it*
etc.
+ * Note that these synonyms are configured very compactly, in a way that is easy to extend
and maintain.
*
* See detailed description on the website
[[https://nlpcraft.apache.org/built-in-entity-parser.html#parser-semantic
Semantic Parser]].
* Also look at the
[[https://github.com/apache/incubator-nlpcraft/tree/master/nlpcraft-examples
examples section]].
*
* **NOTE:** [[NCSemanticElement]] synonyms, **stemmer** and **parser**
parameters must be configured for the same language.
- * `stemmer` implementation language should be corresponded to other
components of [[NCPipeline]], but
+ * `stemmer` implementation language should correspond to that of the other
components of [[NCPipeline]], but
* required `stemmer` implementation is independent from other components'
stemmers.
*
* **NOTE:** that parser can produce different types of [[NCEntity]]
instances and each input [[NCToken]] can be included into several output
[[NCEntity]] instances.
diff --git
a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParserSpec.scala
b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParserSpec.scala
index e813a45f..8bdabdc3 100644
---
a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParserSpec.scala
+++
b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParserSpec.scala
@@ -21,6 +21,7 @@ import org.apache.nlpcraft.*
import annotations.*
import nlp.parsers.*
import nlp.util.*
+import org.apache.nlpcraft.nlp.stemmer.NCEnStemmer
import org.scalatest.funsuite.AnyFunSuite
import scala.collection.mutable
@@ -48,7 +49,9 @@ class NCSemanticEntityParserSpec extends AnyFunSuite:
// Regex.
NCSemanticTestElement("t7", synonyms = Set("x //[a-d]+//")),
// Empty synonyms.
- NCSemanticTestElement("t8", synonyms = Set("{A|_} {B|_}"))
+ NCSemanticTestElement("t8", synonyms = Set("{A|_} {B|_}")),
+
+ NCSemanticTestElement("t9", synonyms = Set("go"))
)
)
@@ -107,19 +110,25 @@ class NCSemanticEntityParserSpec extends AnyFunSuite:
*
*/
test("test") {
- check("t1", "t1")
- check("the t1", "t1")
- check("t2", "t2")
- check("the t2", "t2")
- check("t3 t3", "t3")
- check("t3 the t3", "t3") // With stopword inside.
- check("value4", "t4", value = "value4".?)
- check("value the 5", "t5", value = "value5".?) // With stopword inside.
- check("t6", "t6", elemData = Map("testKey" -> "testValue").?)
- check("the x abc x abe", "t7") // `x abc` should be matched, `x abe`
shouldn't.
- check("A B", "t8")
- check("A", "t8")
- check("B", "t8")
-
- checkMultiple("t1 the x abc the x the abc", "t1", "t7", "t7")
+// check("t1", "t1")
+// check("the t1", "t1")
+// check("t2", "t2")
+// check("the t2", "t2")
+// check("t3 t3", "t3")
+// check("t3 the t3", "t3") // With stopword inside.
+// check("value4", "t4", value = "value4".?)
+// check("value the 5", "t5", value = "value5".?) // With stopword
inside.
+// check("t6", "t6", elemData = Map("testKey" -> "testValue").?)
+// check("the x abc x abe", "t7") // `x abc` should be matched, `x abe`
shouldn't.
+// check("A B", "t8")
+// check("A", "t8")
+// check("B", "t8")
+//
+// checkMultiple("t1 the x abc the x the abc", "t1", "t7", "t7")
+ val s = new NCEnStemmer
+
+ println(s.stem("argue"))
+ println(s.stem("argued"))
+ println(s.stem("argues"))
+ println(s.stem("arguing"))
}
\ No newline at end of file