This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-520 by this push:
new 7205452a WIP.
new f5dcdd81 Merge remote-tracking branch 'origin/NLPCRAFT-520' into NLPCRAFT-520
7205452a is described below
commit 7205452aa6784125075424bcbaf24fac169be328
Author: Sergey Kamov <[email protected]>
AuthorDate: Fri Dec 16 16:09:00 2022 +0400
WIP.
---
.../nlp/enrichers/NCQuotesTokenEnricher.scala | 66 ++++++++++++++++------
.../nlp/enrichers/NCQuotesTokenEnricherSpec.scala | 35 +++++++++---
2 files changed, 76 insertions(+), 25 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala
index 695c27c7..7433c7b6 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala
@@ -19,6 +19,25 @@ package org.apache.nlpcraft.nlp.enrichers
import com.typesafe.scalalogging.LazyLogging
import org.apache.nlpcraft.*
+import scala.collection.*
+
+/**
+ * Companion helper.
+ */
+object NCQuotesTokenEnricher:
+ private val PROP = "quoted"
+
+ private case class Range(from: Int, to: Int):
+ def in(idx: Int): Boolean = idx >= from && idx <= to
+
+ private val QUOTES = Map("«" -> "»", "\"" -> "\"", "`" -> "`", "'" -> "'")
+
+ private val QUOTES_REVERSED = QUOTES.map { case (key, value) => value ->
key }
+ private val QUOTES_SYMBOLS = QUOTES.flatMap { case (key, value) =>
Set(key, value) }.toSet
+
+ private def isQuote(t: NCToken): Boolean =
QUOTES_SYMBOLS.contains(t.getText)
+
+import NCQuotesTokenEnricher.*
/**
* Quotes [[NCTokenEnricher token enricher]].
@@ -27,30 +46,41 @@ import org.apache.nlpcraft.*
* instance if word it represents is in quotes. The value `true` of the
metadata property indicates that this word is in quotes,
* `false` value indicates otherwise.
*
- * **NOTE:** this implementation requires `lemma` string [[NCPropertyMap
metadata]] property that contains
- * token's lemma. You can configure [[NCOpenNLPTokenEnricher]] for required
language that provides this metadata property before
- * this enricher in your [[NCPipeline pipeline]].
+ * Supported quotes are: **«**, **»**, **"**, **'**, **`**.
+ * In any invalid situation, such as an odd number of quotes or quotes that cannot be properly paired,
+ * the `quoted` property is set to `false` for all tokens.
*/
//noinspection ScalaWeakerAccess
class NCQuotesTokenEnricher extends NCTokenEnricher with LazyLogging:
- private final val Q_POS: Set[String] = Set("``", "''")
- private def getPos(t: NCToken): String = t.get("pos").getOrElse(throw new
NCException("POS not found in token."))
- private def isQuote(t: NCToken): Boolean = Q_POS.contains(getPos(t))
-
//noinspection DuplicatedCode
/** @inheritdoc */
override def enrich(req: NCRequest, cfg: NCModelConfig, toks:
List[NCToken]): Unit =
+ def markAllNot(invalidState: Boolean): Unit =
+ if invalidState then logger.warn(s"Detected invalid quotes in:
${req.getText}")
+ toks.foreach(_.put(PROP, false))
+
val quotes = toks.filter(isQuote)
- // Start and end quote can be different ("a` processed as valid)
- if quotes.nonEmpty then
- if quotes.size % 2 == 0 then
- val m = toks.zipWithIndex.toMap
- val pairs = quotes.zipWithIndex.drop(1).flatMap { (t, idx) =>
- Option.when(idx % 2 != 0)(m(t) -> m(quotes(idx - 1)))
- }
- toks.zipWithIndex.foreach { (tok, idx) =>
- tok.put("quoted", pairs.exists { (from, to) => from > idx
&& to < idx })
- }
+ if quotes.isEmpty then
+ markAllNot(false)
+ else if quotes.length % 2 != 0 then
+ markAllNot(true)
+ else
+ val quotedRanges = mutable.HashSet.empty[Range]
+ val stack = mutable.Stack.empty[NCToken]
+
+ for (quote <- quotes)
+ if stack.nonEmpty then
+ val top = stack.top
+ if top.getText == QUOTES_REVERSED.getOrElse(quote.getText,
null) then
+ quotedRanges += Range(top.getIndex + 1, quote.getIndex
- 1)
+ stack.pop()
+ else
+ stack.push(quote)
+ else
+ stack.push(quote)
+
+ if stack.isEmpty then
+ toks.foreach(t => t.put(PROP,
quotedRanges.exists(_.in(t.getIndex))))
else
- logger.warn(s"Detected invalid quotes in: ${req.getText}")
\ No newline at end of file
+ markAllNot(true)
\ No newline at end of file
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricherSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricherSpec.scala
index ee3ad403..ccec2447 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricherSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricherSpec.scala
@@ -18,10 +18,9 @@
package org.apache.nlpcraft.nlp.enrichers
import org.apache.nlpcraft.*
-import annotations.*
-import nlp.enrichers.*
-import nlp.util.*
-import internal.util.NCResourceReader
+import org.apache.nlpcraft.annotations.*
+import org.apache.nlpcraft.nlp.enrichers.*
+import org.apache.nlpcraft.nlp.util.*
import org.scalatest.funsuite.AnyFunSuite
/**
@@ -35,7 +34,7 @@ class NCQuotesTokenEnricherSpec extends AnyFunSuite:
* @param txt
* @param quotes
*/
- private def check(txt: String, quotes: Set[Integer]): Unit =
+ private def check(txt: String, quotes: Integer*): Unit =
val toks = EN_TOK_PARSER.tokenize(txt)
val req = NCTestRequest(txt)
@@ -46,6 +45,28 @@ class NCQuotesTokenEnricherSpec extends AnyFunSuite:
toks.foreach (tok => require(!(tok[Boolean]("quoted") ^
quotes.contains(tok.getIndex))))
test("test") {
- check("It called ' test data '", Set(3, 4))
- check("It called ' test data ' , ' test data '", Set(3, 4, 8, 9))
+ check("It called ' test data '", 3, 4)
+ check("It called \" test data \"", 3, 4)
+ check("It called « test data »", 3, 4)
+ check("It called ' test data ' , ' test data '", 3, 4, 8, 9)
+
+ // Invalid.
+ check("It called ' test data ' '")
+ check("It called ' test data `")
+ check("It called ' test data ' `")
+ check("It called « test data '")
+ check("It called « test data ' »")
+ check("It called « test data «")
+ check("It called » test data »")
+ check("'")
+ check("\"a\"\"")
+
+ // Empty.
+ check("It called ' ' test data ' '")
+
+ // Nested.
+ check("It called \" ' test data ' \"", 3, 4, 5, 6)
+ check("It called ' \" test data \" '", 3, 4, 5, 6)
+ check("It called « \" test data \" »", 3, 4, 5, 6)
+ check("It called « \" ' test data ' \" »", 3, 4, 5, 6, 7, 8)
}
\ No newline at end of file