This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-520 by this push:
new 2e1a72fd WIP.
2e1a72fd is described below
commit 2e1a72fdde3e45d51726f50cdb74807a72cbe7bf
Author: Sergey Kamov <[email protected]>
AuthorDate: Mon Dec 19 15:39:40 2022 +0400
WIP.
---
.../nlp/enrichers/NCBracketsTokenEnricher.scala | 54 ++++++++++++----------
.../nlp/enrichers/NCQuotesTokenEnricher.scala | 4 +-
.../enrichers/NCBracketsTokenEnricherSpec.scala | 18 ++++++--
3 files changed, 44 insertions(+), 32 deletions(-)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala
index c0e692a3..b4d8f563 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala
@@ -21,16 +21,26 @@ import com.typesafe.scalalogging.LazyLogging
import org.apache.nlpcraft.*
import java.io.*
-import scala.collection.mutable
+import scala.collection.{Map, mutable}
+
+/**
+ * Companion helper.
+ */
+object NCBracketsTokenEnricher:
+ private val BRACKETS = Map("(" -> ")", "{" -> "}", "[" -> "]", "<" -> ">")
+ private val BRACKETS_REVERSED = BRACKETS.map { case (key, value) => value
-> key }
+
+import NCBracketsTokenEnricher.*
/**
* Brackets [[NCTokenEnricher token enricher]].
*
* This enricher adds `brackets` boolean [[NCPropertyMap metadata]] property
to the [[NCToken token]]
- * instance if the word it represents is enclosed in brackets. Supported
brackets are: `()`, `{}`,
- * `[]` and `<>`.
+ * instance if the word it represents is enclosed in brackets.
+ *
+ * Supported brackets are: `()`, `{}`, `[]` and `<>`.
*
- * **NOTE:** invalid enclosed brackets are ignored.
+ * **NOTE:** invalid enclosed brackets are ignored, and property `brackets`
is set to `false` for all input tokens.
*/
//noinspection DuplicatedCode,ScalaWeakerAccess
class NCBracketsTokenEnricher extends NCTokenEnricher with LazyLogging:
@@ -41,26 +51,20 @@ class NCBracketsTokenEnricher extends NCTokenEnricher with
LazyLogging:
var ok = true
def check(expected: String): Unit = if stack.empty() || stack.pop() !=
expected then ok = false
- def mark(t: NCToken): Unit = map += t -> !stack.isEmpty
+ def add(t: NCToken): Unit = map += t -> !stack.isEmpty
- for (t <- toks if ok)
- t.getText match
- case "(" | "{" | "[" | "<" =>
- mark(t)
- stack.push(t.getText)
- case ")" =>
- check("(")
- mark(t)
- case "}" =>
- check("{")
- mark(t)
- case "]" =>
- check("[")
- mark(t)
- case ">" =>
- check("<")
- mark(t)
- case _ => mark(t)
+ for (t <- toks if ok; txt = t.getText)
+ if BRACKETS.contains(txt) then
+ add(t)
+ stack.push(txt)
+ else if BRACKETS_REVERSED.contains(txt) then
+ check(BRACKETS_REVERSED(txt))
+ add(t)
+ else
+ add(t)
- if ok && stack.isEmpty then map.foreach { (tok, b) =>
tok.put("brackets", b) }
- else logger.warn(s"Detected invalid brackets in: ${req.getText}")
\ No newline at end of file
+ if ok && stack.isEmpty then
+ map.foreach { (tok, b) => tok.put("brackets", b) }
+ else
+ toks.foreach(_.put("brackets",false))
+ logger.warn(s"Detected invalid brackets in: ${req.getText}")
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala
index 8912e178..f2abb1c8 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala
@@ -45,8 +45,8 @@ import NCQuotesTokenEnricher.*
* `false` value indicates otherwise.
*
* Supported quotes are: **«**, **»**, **"**, **'**, **`**.
- * For any invalid cases, like invalid quotes order otr count,
- * property `quoted` assigned as `false` for all input tokens.
+ *
+ * **NOTE:** invalid enclosed quotes are ignored, and property `quoted`
is set to `false` for all input tokens.
*/
//noinspection ScalaWeakerAccess
class NCQuotesTokenEnricher extends NCTokenEnricher with LazyLogging:
diff --git
a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricherSpec.scala
b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricherSpec.scala
index 6739a703..82bcff24 100644
---
a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricherSpec.scala
+++
b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricherSpec.scala
@@ -33,16 +33,24 @@ class NCBracketsTokenEnricherSpec extends AnyFunSuite:
* @param txt
* @param brackets
*/
- private def check(txt: String, brackets: Set[Integer]): Unit =
+ private def check(txt: String, brackets: Integer*): Unit =
val toks = EN_TOK_PARSER.tokenize(txt)
bracketsEnricher.enrich(NCTestRequest(txt), CFG, toks)
NCTestUtils.printTokens(toks)
- toks.foreach (tok => require(!(tok[Boolean]("brackets") ^
brackets.contains(tok.getIndex))))
+ if brackets.isEmpty then require(toks.forall(p =>
!p[Boolean]("brackets")))
+ else toks.foreach (tok => require(!(tok[Boolean]("brackets") ^
brackets.contains(tok.getIndex))))
test("test") {
- check("A [ B C ] D", Set(2, 3))
- check("A [ B { C } ] D", Set(2, 3, 4, 5))
- check("A [ B { C } ] [ [ D ] ] [ E ]", Set(2, 3, 4, 5, 8, 9, 10, 13))
+ check("A [ B C ] D", 2, 3)
+ check("A [ B { C } ] D", 2, 3, 4, 5)
+ check("A [ B { C } ] [ [ D ] ] [ E ]", 2, 3, 4, 5, 8, 9, 10, 13)
+
+ // Invalid.
+ check("[[a]")
+ check("[a[")
+ check("{[a[}")
+ check("[")
+ check("}")
}