This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-469
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-469 by this push:
new cb37759 WIP.
cb37759 is described below
commit cb37759055ac7875315834e96dc7299d0cf5d39b
Author: Sergey Kamov <[email protected]>
AuthorDate: Fri Dec 24 16:18:16 2021 +0300
WIP.
---
.../apache/nlpcraft/NCParameterizedAdapter.java | 5 +-
...nricher.java => NCEnBracketsTokenEnricher.java} | 24 +++++++--
.../token/enricher/NCEnQuotesTokenEnricher.java | 22 ++++++--
.../nlp/token/enricher/impl/NCEnBracketsImpl.scala | 47 +++++++++++++++++
.../nlp/token/enricher/impl/NCEnQuotesImpl.scala | 27 +++++++++-
...c.scala => NCEnBracketsTokenEnricherSpec.scala} | 35 +++++++------
.../enricher/NCEnDictionaryTokenEnricherSpec.scala | 2 +
.../enricher/NCEnQuotesTokenEnricherSpec.scala | 61 ++++++++++++++++++++++
.../enricher/NCEnSwearWordsTokenEnricherSpec.scala | 2 +
.../enricher/NCEnWordsTokenEnricherSpec.scala | 2 +
.../opennlp/NCEnOpenNlpTokenParserSpec.scala | 2 +-
.../nlpcraft/internal/nlp/util/NCTestUtils.scala | 48 ++++++++++++-----
12 files changed, 232 insertions(+), 45 deletions(-)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCParameterizedAdapter.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCParameterizedAdapter.java
index 81d9854..686393e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCParameterizedAdapter.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCParameterizedAdapter.java
@@ -17,14 +17,13 @@
package org.apache.nlpcraft;
-import java.util.HashMap;
-import java.util.Optional;
+import java.util.*;
/**
*
*/
public class NCParameterizedAdapter implements NCParameterized {
- private HashMap<String, Object> map = new HashMap<>();
+ private final Map<String, Object> map = new HashMap<>();
@Override
public <T> T get(String key) {
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricher.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricher.java
similarity index 69%
copy from
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricher.java
copy to
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricher.java
index d22fe6f..ec5310a 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricher.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricher.java
@@ -17,15 +17,29 @@
package org.apache.nlpcraft.internal.nlp.token.enricher;
-import org.apache.nlpcraft.*;
+import org.apache.nlpcraft.NCModelConfig;
+import org.apache.nlpcraft.NCRequest;
+import org.apache.nlpcraft.NCToken;
+import org.apache.nlpcraft.NCTokenEnricher;
+import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCEnBracketsImpl;
+
import java.util.List;
-import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCEnQuotesImpl;
/**
- *
+ * TODO: enriches with <code>brackets:en</code> property.
*/
-public class NCEnQuotesTokenEnricher implements NCTokenEnricher {
- private NCEnQuotesImpl impl;
+public class NCEnBracketsTokenEnricher implements NCTokenEnricher {
+ private final NCEnBracketsImpl impl = new NCEnBracketsImpl();
+
+ @Override
+ public void start() {
+ impl.start();
+ }
+
+ @Override
+ public void stop() {
+ impl.stop();
+ }
@Override
public void enrich(NCRequest req, NCModelConfig cfg, List<NCToken> toks) {
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricher.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricher.java
index d22fe6f..ec2931a 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricher.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricher.java
@@ -17,15 +17,29 @@
package org.apache.nlpcraft.internal.nlp.token.enricher;
-import org.apache.nlpcraft.*;
-import java.util.List;
+import org.apache.nlpcraft.NCModelConfig;
+import org.apache.nlpcraft.NCRequest;
+import org.apache.nlpcraft.NCToken;
+import org.apache.nlpcraft.NCTokenEnricher;
import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCEnQuotesImpl;
+import java.util.List;
+
/**
- *
+ * TODO: enriches with <code>quotes:en</code> property.
*/
public class NCEnQuotesTokenEnricher implements NCTokenEnricher {
- private NCEnQuotesImpl impl;
+ private final NCEnQuotesImpl impl = new NCEnQuotesImpl();
+
+ @Override
+ public void start() {
+ impl.start();
+ }
+
+ @Override
+ public void stop() {
+ impl.stop();
+ }
@Override
public void enrich(NCRequest req, NCModelConfig cfg, List<NCToken> toks) {
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnBracketsImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnBracketsImpl.scala
new file mode 100644
index 0000000..51609b7
--- /dev/null
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnBracketsImpl.scala
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.internal.nlp.token.enricher.impl
+
+import org.apache.nlpcraft.*
+
+import java.io.*
+import scala.collection.mutable
+import scala.jdk.CollectionConverters.CollectionHasAsScala
+/**
+ *
+ */
+class NCEnBracketsImpl extends NCTokenEnricher:
+ override def enrich(req: NCRequest, cfg: NCModelConfig, toks:
java.util.List[NCToken]): Unit =
+ val s = new java.util.Stack[String]()
+
+ // TODO: text
+ def mkError: NCException = new NCException(s"Invalid brackets in text:
${req.getOriginalText}")
+ def check(expected: String): Unit = if s.empty() || s.pop() !=
expected then throw mkError
+ def mark(t: NCToken): Unit = t.put("brackets:en", !s.isEmpty)
+
+ toks.forEach(t =>
+ t.getOriginalText match {
+ case "(" | "{" | "[" => mark(t); s.push(t.getOriginalText)
+ case ")" => check("("); mark(t)
+ case "}" => check("{"); mark(t)
+ case "]" => check("["); mark(t)
+ case _ => mark(t)
+ }
+ )
+
+ if !s.isEmpty then throw mkError
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnQuotesImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnQuotesImpl.scala
index 64a9ee9..7adc02c 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnQuotesImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnQuotesImpl.scala
@@ -19,14 +19,37 @@ package org.apache.nlpcraft.internal.nlp.token.enricher.impl
import org.apache.nlpcraft.*
+import scala.jdk.CollectionConverters.*
+
+object NCEnQuotesImpl:
+ private final val Q_POS: Set[String] = Set("``", "''")
+ private def isQuote(t: NCToken): Boolean = Q_POS.contains(t.getPos)
+
+import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCEnQuotesImpl.*
/**
*
*/
-class NCEnQuotesImpl:
+class NCEnQuotesImpl extends NCTokenEnricher:
+
/**
*
* @param req
* @param cfg
* @param toks
*/
- def enrich(req: NCRequest, cfg: NCModelConfig, toks:
java.util.List[NCToken]): Unit = ???
+ def enrich(req: NCRequest, cfg: NCModelConfig, toks:
java.util.List[NCToken]): Unit =
+ val toksScala = toks.asScala
+ val quotes = toksScala.filter(isQuote)
+
+ // TODO:
+ if quotes.size % 2 != 0 then throw new NCException(s"Invalid quotes in
text: ${req.getOriginalText}")
+
+ val m = toksScala.zipWithIndex.toMap
+ val pairs = quotes.zipWithIndex.drop(1).flatMap {
+ case (t, idx) => if idx % 2 != 0 then Some(m(t) -> m(quotes(idx -
1))) else None
+ }
+
+ // Do not rewrite this lambda with a `case` pattern: it only works this way, with tuple accessors
(possible Scala 3.1 issue?)
+ toksScala.zipWithIndex.foreach(p =>
+ p._1.put("quoted:en", pairs.exists { case (from, to) => from >
p._2 && to < p._2 })
+ )
\ No newline at end of file
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnWordsTokenEnricherSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala
similarity index 58%
copy from
nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnWordsTokenEnricherSpec.scala
copy to
nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala
index fcb4afa..12044c9 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnWordsTokenEnricherSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala
@@ -17,32 +17,35 @@
package org.apache.nlpcraft.internal.nlp.token.enricher
+import org.apache.nlpcraft.NCToken
import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.NCEnOpenNlpTokenParser
-import org.apache.nlpcraft.internal.nlp.util.{NCTestToken, NCTestUtils}
+import org.apache.nlpcraft.internal.nlp.util.{NCTestRequest, NCTestToken,
NCTestUtils}
import org.junit.jupiter.api.{BeforeEach, Test}
import scala.jdk.CollectionConverters.SeqHasAsJava
-
/**
*
*/
-class NCEnWordsTokenEnricherSpec:
- private var enricher: NCEnWordsTokenEnricher = _
+
+class NCEnBracketsTokenEnricherSpec:
+ private var enricher: NCEnBracketsTokenEnricher = _
@BeforeEach
- def start(): Unit = enricher = NCTestUtils.makeAndStart(new
NCEnWordsTokenEnricher)
+ def start(): Unit = enricher = NCTestUtils.makeAndStart(new
NCEnBracketsTokenEnricher())
- @Test
- def test(): Unit =
- val toks = Seq(
- NCTestToken(txt = "english", stem = "english"), // English word.
- NCTestToken(txt = "русский", stem = "русский") // Not english word.
- )
+ private def check(txt: String, brackets: Set[Integer]): Unit =
+ val toks = txt.split("
").toIndexedSeq.map(_.strip).filter(_.nonEmpty).map(NCTestToken(_))
- require(toks.head.getOpt[Boolean]("word:en").isEmpty)
- require(toks.last.getOpt[Boolean]("word:en").isEmpty)
+ enricher.enrich(NCTestRequest(txt), null, toks.asJava)
- enricher.enrich(null, null, toks.asJava)
+ NCTestUtils.printTokens(toks, "brackets:en")
- require(toks.head.get[Boolean]("word:en"))
- require(!toks.last.get[Boolean]("word:en"))
\ No newline at end of file
+ toks.zipWithIndex.foreach { case (tok, idx) =>
+ require(!(tok.get[Boolean]("brackets:en") ^
brackets.contains(idx)))
+ }
+
+ @Test
+ def test(): Unit =
+ check("A [ B C ] D", Set(2, 3))
+ check("A [ B { C } ] D", Set(2, 3, 4, 5))
+ check("A [ B { C } ] [ [ D ] ] [ E ]", Set(2, 3, 4, 5, 8, 9, 10, 13))
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricherSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricherSpec.scala
index 27e9982..e4d7335 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricherSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricherSpec.scala
@@ -44,5 +44,7 @@ class NCEnDictionaryTokenEnricherSpec:
enricher.enrich(null, null, toks.asJava)
+ NCTestUtils.printTokens(toks, "dict:en")
+
require(toks.head.get[Boolean]("dict:en"))
require(!toks.last.get[Boolean]("dict:en"))
\ No newline at end of file
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala
new file mode 100644
index 0000000..7e2e113
--- /dev/null
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.internal.nlp.token.enricher
+
+import org.apache.nlpcraft.NCToken
+import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.NCEnOpenNlpTokenParser
+import org.apache.nlpcraft.internal.nlp.util.{NCTestRequest, NCTestToken,
NCTestUtils}
+import org.junit.jupiter.api.{BeforeEach, Test}
+
+import scala.jdk.CollectionConverters.*
+
+/**
+ *
+ */
+class NCEnQuotesTokenEnricherSpec:
+ private var parser: NCEnOpenNlpTokenParser = _
+ private var enricher: NCEnQuotesTokenEnricher = _
+
+ @BeforeEach
+ def start(): Unit =
+ parser = NCTestUtils.makeAndStart(
+ new NCEnOpenNlpTokenParser(
+ "opennlp/en-token.bin",
+ "opennlp/en-pos-maxent.bin",
+ "opennlp/en-lemmatizer.dict"
+ )
+ )
+ enricher = NCTestUtils.makeAndStart(new NCEnQuotesTokenEnricher)
+
+ private def check(txt: String, quotes: Set[Integer]): Unit =
+ val toks = parser.parse(NCTestRequest(txt))
+
+ val toksSeq = toks.asScala.toSeq
+
+ enricher.enrich(NCTestRequest(txt), null, toks)
+
+ NCTestUtils.printTokens(toksSeq, "quoted:en")
+
+ toksSeq.zipWithIndex.foreach { case (tok, idx) =>
+ require(!(tok.get[Boolean]("quoted:en") ^ quotes.contains(idx)))
+ }
+
+ @Test
+ def test(): Unit =
+ check("It called ' test data '", Set(3, 4))
+ check("It called ' test data ' , ' test data '", Set(3, 4, 8, 9))
\ No newline at end of file
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricherSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricherSpec.scala
index f42b2c5..a913070 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricherSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricherSpec.scala
@@ -44,5 +44,7 @@ class NCEnSwearWordsTokenEnricherSpec:
enricher.enrich(null, null, toks.asJava)
+ NCTestUtils.printTokens(toks, "swear:en")
+
require(!toks.head.get[Boolean]("swear:en"))
require(toks.last.get[Boolean]("swear:en"))
\ No newline at end of file
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnWordsTokenEnricherSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnWordsTokenEnricherSpec.scala
index fcb4afa..5a28898 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnWordsTokenEnricherSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnWordsTokenEnricherSpec.scala
@@ -44,5 +44,7 @@ class NCEnWordsTokenEnricherSpec:
enricher.enrich(null, null, toks.asJava)
+ NCTestUtils.printTokens(toks, "word:en")
+
require(toks.head.get[Boolean]("word:en"))
require(!toks.last.get[Boolean]("word:en"))
\ No newline at end of file
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnOpenNlpTokenParserSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnOpenNlpTokenParserSpec.scala
index 7820cf2..88c807c 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnOpenNlpTokenParserSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnOpenNlpTokenParserSpec.scala
@@ -45,7 +45,7 @@ class NCEnOpenNlpTokenParserSpec:
private def test(txt: String, validate: Seq[NCToken] => _): Unit =
val toks = parser.parse(nlp.util.NCTestRequest(txt)).asScala.toSeq
assert(toks.nonEmpty)
- NCTestUtils.printTokens(txt, toks)
+ NCTestUtils.printTokens(toks)
validate(toks)
@Test
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestUtils.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestUtils.scala
index 4d62c6c..2b6b270 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestUtils.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestUtils.scala
@@ -25,27 +25,47 @@ import org.apache.nlpcraft.*
*/
object NCTestUtils:
/**
+ *
* @param req
* @param toks
+ * @param props
*/
- def printTokens(req: String, toks: Seq[NCToken]): Unit =
+ def printTokens(toks: Seq[NCToken], props: String*): Unit =
val tbl = new NCAsciiTable()
- tbl #= ("Text", "Normalized", "POS", "Stem", "Lemma", "Start", "End",
"Length", "Stopword")
+
+ if props.isEmpty
+ then tbl #= ("Text", "Normalized", "POS", "Stem", "Lemma",
"Start", "End", "Length", "Stopword")
+ else tbl #= ("Text", "Normalized", "POS", "Stem", "Lemma",
"Start", "End", "Length", "Stopword", "Properties")
+
toks.foreach(t =>
- tbl += (
- t.getOriginalText,
- t.getNormalizedText,
- t.getPos,
- t.getStem,
- t.getLemma,
- t.getStartCharIndex,
- t.getEndCharIndex,
- t.getLength,
- t.isStopWord
- )
+ if props.isEmpty then
+ tbl += (
+ t.getOriginalText,
+ t.getNormalizedText,
+ t.getPos,
+ t.getStem,
+ t.getLemma,
+ t.getStartCharIndex,
+ t.getEndCharIndex,
+ t.getLength,
+ t.isStopWord
+ )
+ else
+ tbl += (
+ t.getOriginalText,
+ t.getNormalizedText,
+ t.getPos,
+ t.getStem,
+ t.getLemma,
+ t.getStartCharIndex,
+ t.getEndCharIndex,
+ t.getLength,
+ t.isStopWord,
+ props.map(p => s"$p=${t.get[Any](p)}").mkString("{", ", ",
"}")
+ )
)
- println(s"Request: $req")
+ println(s"Request: ${toks.map(_.getOriginalText).mkString(" ")}")
println(tbl.toString)
/**