This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-471
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-471 by this push:
new e2d9f44 WIP.
e2d9f44 is described below
commit e2d9f445cdab093ae2e603545da75d7536b28162
Author: Sergey Kamov <[email protected]>
AuthorDate: Tue Dec 28 12:47:08 2021 +0300
WIP.
---
.../scala/org/apache/nlpcraft/NCPropertyMap.java | 8 +-
.../org/apache/nlpcraft/NCPropertyMapAdapter.java | 5 ++
.../enricher/impl/NCOpenNlpTokenEnricherImpl.scala | 81 -------------------
.../opennlp/NCOpenNlpEntityParser.java} | 39 +++++-----
.../opennlp/impl/NCOpenNlpEntityParserImpl.scala | 90 ++++++++++++++++++++++
.../apache/nlpcraft/internal/util/NCUtils.scala | 8 +-
.../enricher/NCEnBracketsTokenEnricherSpec.scala | 2 +-
.../enricher/NCEnDictionaryTokenEnricherSpec.scala | 2 +-
.../enricher/NCEnLanguageTokenEnricherSpec.scala | 2 +-
.../enricher/NCEnQuotesTokenEnricherSpec.scala | 2 +-
.../enricher/NCEnSwearWordsTokenEnricherSpec.scala | 2 +-
...rSpec.scala => NCOpenNlpEntityParserSpec.scala} | 48 ++++++------
.../nlpcraft/internal/nlp/util/NCTestUtils.scala | 72 ++++++++---------
13 files changed, 193 insertions(+), 168 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPropertyMap.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPropertyMap.java
index 13a8119..2cb97dc 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPropertyMap.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPropertyMap.java
@@ -17,7 +17,7 @@
package org.apache.nlpcraft;
-import java.util.Optional;
+import java.util.*;
/**
*
@@ -76,4 +76,10 @@ public interface NCPropertyMap {
* @return
*/
boolean remove(String key, Object obj);
+
+ /**
+ * Gets the keys of all properties currently stored in this map.
+ * @return Set of property keys.
+ */
+ Set<String> keysSet();
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPropertyMapAdapter.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPropertyMapAdapter.java
index a82689d..c2ca3d1 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPropertyMapAdapter.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPropertyMapAdapter.java
@@ -59,4 +59,9 @@ public class NCPropertyMapAdapter implements NCPropertyMap {
public boolean remove(String key, Object obj) {
return map.remove(key, obj);
}
+
+ @Override
+ public Set<String> keysSet() {
+ return map.keySet();
+ }
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCOpenNlpTokenEnricherImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCOpenNlpTokenEnricherImpl.scala
deleted file mode 100644
index 9ac7600..0000000
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCOpenNlpTokenEnricherImpl.scala
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nlpcraft.internal.nlp.token.enricher.impl
-
-import com.typesafe.scalalogging.LazyLogging
-import opennlp.tools.namefind.*
-import org.apache.nlpcraft.*
-import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCEnQuotesImpl.*
-import org.apache.nlpcraft.internal.util.NCUtils
-
-import java.io.*
-import java.util.{List as JList, Map as JMap}
-import scala.concurrent.ExecutionContext
-import scala.jdk.CollectionConverters.*
-import scala.util.Using
-import scala.util.control.Exception.catching
-
-object NCOpenNlpTokenEnricherImpl {
- def apply(res: String): NCOpenNlpTokenEnricherImpl = new
NCOpenNlpTokenEnricherImpl(NCUtils.getStream(res), res)
- def apply(f: File): NCOpenNlpTokenEnricherImpl = new
NCOpenNlpTokenEnricherImpl(new FileInputStream(f), f.getAbsolutePath)
-}
-/**
- *
- */
-class NCOpenNlpTokenEnricherImpl(is: InputStream, res: String) extends
NCTokenEnricher with LazyLogging:
- @volatile private var finder: NameFinderME = _
-
- override def start(): Unit =
- finder = new NameFinderME(new
TokenNameFinderModel(NCUtils.getStream(res)))
- logger.trace(s"Loaded resource: $res")
-
- override def stop(): Unit = finder = null
-
- override def enrich(req: NCRequest, cfg: NCModelConfig, toks:
JList[NCToken]): Unit =
- val toksSeq = toks.asScala
- val words = toksSeq.toArray.map(_.getOriginalText)
-
- case class Holder(start: Int, end: Int, name: String, probability:
Double)
-
- val hs = this.synchronized {
- val hs = finder.find(words).map(p => Holder(p.getStart, p.getEnd -
1, p.getType, p.getProb) ).toSeq
-
- finder.clearAdaptiveData()
-
- hs
- }
-
- val toksSeqIdxs = toks.asScala.zipWithIndex
-
- for ((h, hIdx) <- hs.zipWithIndex)
- def calcIndex(getHolderIndex: Holder => Int) =
- toksSeqIdxs.find { case (_, idx) => idx == getHolderIndex(h) }
match
- case Some((_, idx)) => idx
- case None => -1
-
- val i1 = calcIndex(_.start)
- lazy val i2 = calcIndex(_.end)
-
- if i1 != -1 && i2 != -1 then
- for ((tok, idx) <- toksSeqIdxs if idx >= i1 && idx <= i2)
- tok.put(s"opennlp:${h.name}", tok.getOriginalText)
- tok.put(s"opennlp:${h.name}:probability", h.probability)
- tok.put(s"opennlp:${h.name}:id", hIdx + 1)
-
- // To avoid scala unexpected NPE from previous operation.
- ()
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricher.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpEntityParser.java
similarity index 56%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricher.java
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpEntityParser.java
index eb6b931..d25679e 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricher.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpEntityParser.java
@@ -15,49 +15,51 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.internal.nlp.token.enricher;
+package org.apache.nlpcraft.internal.nlp.token.parser.opennlp;
-import org.apache.nlpcraft.*;
-import
org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCOpenNlpTokenEnricherImpl;
+import org.apache.nlpcraft.NCEntity;
+import org.apache.nlpcraft.NCEntityParser;
+import org.apache.nlpcraft.NCModelConfig;
+import org.apache.nlpcraft.NCRequest;
+import org.apache.nlpcraft.NCToken;
+import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl.NCOpenNlpEntityParserImpl;
import java.io.File;
-import java.util.*;
+import java.util.List;
+import java.util.Objects;
/**
- * TODO: 3 properties
- * - opennlp:name - token text
- * - opennlp:name:id, integer startig from 1 (for grouping multiple words
tokens)
- * - opennlp:name:probability, 0..1 probability
- * where 'name' is element model name (from trained file or resource).
+ * Generates entities with
+ * - ID `opennlp:name`, where 'name' is the element model name (from a trained file
or resource), and
+ * - one property, `opennlp:name:probability`, whose value is a double between 0
and 1.
*
* <p>
* Models can be download here: http://opennlp.sourceforge.net/models-1.5/ or
trained.
* <p>
* Component is language independent.
* <p>
- * TODO: which constructors should we keep?
*/
-public class NCOpenNlpTokenEnricher implements NCTokenEnricher {
- private final NCOpenNlpTokenEnricherImpl impl;
+public class NCOpenNlpEntityParser implements NCEntityParser {
+ private final NCOpenNlpEntityParserImpl impl;
/**
* @param name
* @param modelSrc
*/
- public NCOpenNlpTokenEnricher(String modelSrc) {
+ public NCOpenNlpEntityParser(String modelSrc) {
Objects.requireNonNull(modelSrc, "Model source cannot be null.");
- this.impl = NCOpenNlpTokenEnricherImpl.apply(modelSrc);
+ this.impl = NCOpenNlpEntityParserImpl.apply(modelSrc);
}
/**
* @param name
* @param modelFile
*/
- public NCOpenNlpTokenEnricher(File modelFile) {
+ public NCOpenNlpEntityParser(File modelFile) {
Objects.requireNonNull(modelFile, "Model file cannot be null.");
- this.impl = NCOpenNlpTokenEnricherImpl.apply(modelFile);
+ this.impl = NCOpenNlpEntityParserImpl.apply(modelFile);
}
@Override
@@ -71,8 +73,7 @@ public class NCOpenNlpTokenEnricher implements
NCTokenEnricher {
}
@Override
- public void enrich(NCRequest req, NCModelConfig cfg, List<NCToken> toks) {
- assert impl != null;
- impl.enrich(req, cfg, toks);
+ public List<NCEntity> parse(NCRequest req, NCModelConfig cfg,
List<NCToken> toks) {
+ return impl.parse(req, cfg, toks);
}
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala
new file mode 100644
index 0000000..4283637
--- /dev/null
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl
+
+import com.typesafe.scalalogging.LazyLogging
+import opennlp.tools.namefind.*
+import org.apache.nlpcraft.*
+import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCEnQuotesImpl.*
+import org.apache.nlpcraft.internal.util.NCUtils
+
+import java.io.*
+import java.util
+import java.util.{Optional, List as JList, Map as JMap}
+import scala.concurrent.ExecutionContext
+import scala.jdk.CollectionConverters.*
+import scala.util.Using
+import scala.util.control.Exception.catching
+
+object NCOpenNlpEntityParserImpl {
+ def apply(res: String): NCOpenNlpEntityParserImpl = new
NCOpenNlpEntityParserImpl(NCUtils.getStream(res), res)
+ def apply(f: File): NCOpenNlpEntityParserImpl = new
NCOpenNlpEntityParserImpl(new FileInputStream(f), f.getAbsolutePath)
+}
+
+/**
+ *
+ */
+class NCOpenNlpEntityParserImpl(is: InputStream, res: String) extends
NCEntityParser with LazyLogging:
+ @volatile private var finder: NameFinderME = _
+
+ override def start(): Unit =
+ finder = new NameFinderME(new
TokenNameFinderModel(NCUtils.getStream(res)))
+ logger.trace(s"Loaded resource: $res")
+
+ override def stop(): Unit = finder = null
+
+ override def parse(req: NCRequest, cfg: NCModelConfig, toks:
JList[NCToken]): JList[NCEntity] =
+ val toksSeq = toks.asScala
+ val words = toksSeq.toArray.map(_.getOriginalText)
+
+ case class Holder(start: Int, end: Int, name: String, probability:
Double)
+
+ val hs = this.synchronized {
+ try
+ finder.find(words).map(p => Holder(p.getStart, p.getEnd - 1,
p.getType, p.getProb) ).toSeq
+ finally
+ finder.clearAdaptiveData()
+ }
+
+ val ents = new util.ArrayList[NCEntity]()
+
+ if hs.nonEmpty then
+ val toksIdxs = toks.asScala.zipWithIndex
+
+ for ((h, hIdx) <- hs.zipWithIndex)
+ def calcIndex(getHolderIndex: Holder => Int) =
+ toksIdxs.find { case (_, idx) => idx == getHolderIndex(h)
} match
+ case Some((_, idx)) => idx
+ case None => -1
+
+ val i1 = calcIndex(_.start)
+ lazy val i2 = calcIndex(_.end)
+
+ if i1 != -1 && i2 != -1 then
+ val ent = new NCPropertyMapAdapter with NCEntity {
+ override def getTokens: JList[NCToken] =
+ toksIdxs.flatMap { case (t, idx) => if idx >= i1
&& idx <= i2 then Some(t) else None }.asJava
+ override def getRequestId: String = req.getRequestId
+ override def getId: String = s"opennlp:${h.name}"
+ override def getIndex: Int = 0 // TODO:
+ override def getGuid: String =
NCUtils.genUUID().toString
+ }
+ ent.put(s"opennlp:${h.name}:probability", h.probability)
+ ents.add(ent);
+
+ util.Collections.unmodifiableList(ents) // TODO: should we wrap?
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
index 239e29f..77bb9d7 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
@@ -24,7 +24,7 @@ import org.apache.nlpcraft.internal.ansi.NCAnsi.*
import java.io.*
import java.net.*
-import java.util.Random
+import java.util.{Random, UUID}
import java.util.regex.Pattern
import java.util.zip.{GZIPInputStream, GZIPOutputStream}
import scala.annotation.tailrec
@@ -929,6 +929,12 @@ object NCUtils extends LazyLogging:
bodies.map(body => Future { body() } (ec)).foreach(Await.result(_,
Duration.Inf))
/**
+ * Generates a new random UUID.
+ * @return Newly generated random UUID.
+ */
+ def genUUID(): UUID = UUID.randomUUID()
+
+ /**
* Gets all sequential permutations of tokens in this NLP sentence.
*
* For example, if NLP sentence contains "a, b, c, d" tokens, then
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala
index 3201e06..42c3887 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala
@@ -45,7 +45,7 @@ class NCEnBracketsTokenEnricherSpec:
val toks = parser.parse(NCTestRequest(txt), null)
enricher.enrich(NCTestRequest(txt), null, toks)
val seq = toks.asScala.toSeq
- NCTestUtils.printTokens(seq, "brackets:en")
+ NCTestUtils.printTokens(seq)
seq.zipWithIndex.foreach { case (tok, idx) =>
require(!(tok.get[Boolean]("brackets:en") ^
brackets.contains(idx)))
}
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricherSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricherSpec.scala
index e4d7335..da11baa 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricherSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricherSpec.scala
@@ -44,7 +44,7 @@ class NCEnDictionaryTokenEnricherSpec:
enricher.enrich(null, null, toks.asJava)
- NCTestUtils.printTokens(toks, "dict:en")
+ NCTestUtils.printTokens(toks)
require(toks.head.get[Boolean]("dict:en"))
require(!toks.last.get[Boolean]("dict:en"))
\ No newline at end of file
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnLanguageTokenEnricherSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnLanguageTokenEnricherSpec.scala
index 52e3156..6003eb6 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnLanguageTokenEnricherSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnLanguageTokenEnricherSpec.scala
@@ -44,7 +44,7 @@ class NCEnLanguageTokenEnricherSpec:
enricher.enrich(null, null, toks.asJava)
- NCTestUtils.printTokens(toks, "lang:en")
+ NCTestUtils.printTokens(toks)
require(toks.head.get[Boolean]("lang:en"))
require(!toks.last.get[Boolean]("lang:en"))
\ No newline at end of file
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala
index 182e4a4..48fe24d 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala
@@ -45,7 +45,7 @@ class NCEnQuotesTokenEnricherSpec:
val toks = parser.parse(NCTestRequest(txt), null)
val toksSeq = toks.asScala.toSeq
enricher.enrich(NCTestRequest(txt), null, toks)
- NCTestUtils.printTokens(toksSeq, "quoted:en")
+ NCTestUtils.printTokens(toksSeq)
toksSeq.zipWithIndex.foreach { case (tok, idx) =>
require(!(tok.get[Boolean]("quoted:en") ^ quotes.contains(idx)))
}
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricherSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricherSpec.scala
index a913070..45ab328 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricherSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricherSpec.scala
@@ -44,7 +44,7 @@ class NCEnSwearWordsTokenEnricherSpec:
enricher.enrich(null, null, toks.asJava)
- NCTestUtils.printTokens(toks, "swear:en")
+ NCTestUtils.printTokens(toks)
require(!toks.head.get[Boolean]("swear:en"))
require(toks.last.get[Boolean]("swear:en"))
\ No newline at end of file
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricherSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpEntityParserSpec.scala
similarity index 54%
rename from
nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricherSpec.scala
rename to
nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpEntityParserSpec.scala
index d3ed968..c3e118b 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricherSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpEntityParserSpec.scala
@@ -17,13 +17,14 @@
package org.apache.nlpcraft.internal.nlp.token.enricher
-import org.apache.nlpcraft.NCLifecycle
-import
org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCOpenNlpTokenEnricherImpl
-import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.NCEnOpenNlpTokenParser
+import org.apache.nlpcraft.{NCEntity, NCLifecycle}
+import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.{NCEnOpenNlpTokenParser,
NCOpenNlpEntityParser}
import org.apache.nlpcraft.internal.nlp.util.*
import org.apache.nlpcraft.internal.util.NCUtils
import org.junit.jupiter.api.*
+import java.util
+import scala.collection.mutable
import scala.concurrent.ExecutionContext
import scala.jdk.CollectionConverters.*
import scala.jdk.OptionConverters.RichOptional
@@ -31,16 +32,16 @@ import scala.jdk.OptionConverters.RichOptional
/**
*
*/
-class NCOpenNlpTokenEnricherSpec:
- private val enrichers =
scala.collection.mutable.ArrayBuffer.empty[NCOpenNlpTokenEnricher]
- private var parser: NCEnOpenNlpTokenParser = _
+class NCOpenNlpEntityParserSpec:
+ private val eParsers =
scala.collection.mutable.ArrayBuffer.empty[NCOpenNlpEntityParser]
+ private var tParser: NCEnOpenNlpTokenParser = _
@BeforeEach
def start(): Unit =
- parser = NCTestUtils.makeAndStart(NCTestUtils.mkEnParser)
+ tParser = NCTestUtils.makeAndStart(NCTestUtils.mkEnParser)
def add(res: String): Unit =
- enrichers += NCTestUtils.makeAndStart(new
NCOpenNlpTokenEnricher(s"opennlp/$res"))
+ eParsers += NCTestUtils.makeAndStart(new
NCOpenNlpEntityParser(s"opennlp/$res"))
NCUtils.execPar(
// en-ner-time.bin is skipped. I can't find any working example.
@@ -52,26 +53,23 @@ class NCOpenNlpTokenEnricherSpec:
() => add("en-ner-percentage.bin")
)(ExecutionContext.Implicits.global)
- private def check(txt: String, expected: String): Unit =
+ private def checkSingleEntity(txt: String, expected: String): Unit =
val req = NCTestRequest(txt)
- val toks = parser.parse(req, null)
- enrichers.foreach(_.enrich(req, null, toks))
- val toksSeq = toks.asScala.toSeq
+ val toks = tParser.parse(req, null)
+ val resSeq = eParsers.map(_.parse(req, null,
toks).asScala.toSeq).filter(_.size == 1)
- val propName = s"opennlp:$expected"
- val propProb = s"opennlp:${expected}:probability"
- val propId = s"opennlp:${expected}:id"
- NCTestUtils.printTokens(toksSeq, propName, propProb, propId)
+ require(resSeq.size == 1)
- require(toksSeq.exists(_.getOpt(propName).isPresent))
- require(toksSeq.exists(_.getOpt(propProb).isPresent))
- require(toksSeq.exists(_.getOpt(propId).isPresent))
+ val res = resSeq.head
+
+ NCTestUtils.printEntities(txt, res)
+
require(res.exists(_.getOpt(s"opennlp:${expected}:probability").isPresent))
@Test
def test(): Unit =
- check("today", "date")
- check("Moscow", "location")
- check("10 is 5 % from 200", "percentage")
- check("Tim Cook", "person")
- check("Microsoft", "organization")
- check("Current price is higher for 20 USA dollars", "money")
\ No newline at end of file
+ checkSingleEntity("today", "date")
+ checkSingleEntity("Moscow", "location")
+ checkSingleEntity("10 is 5 % from 200", "percentage")
+ checkSingleEntity("Tim Cook", "person")
+ checkSingleEntity("Microsoft", "organization")
+ checkSingleEntity("Current price is higher for 20 USA dollars",
"money")
\ No newline at end of file
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestUtils.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestUtils.scala
index b53bb51..25bb543 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestUtils.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestUtils.scala
@@ -20,57 +20,57 @@ package org.apache.nlpcraft.internal.nlp.util
import org.apache.nlpcraft.internal.ascii.NCAsciiTable
import org.apache.nlpcraft.*
import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.NCEnOpenNlpTokenParser
+import scala.jdk.CollectionConverters.*
/**
*
*/
object NCTestUtils:
/**
- *
- * @param req
* @param toks
- * @param props
*/
- def printTokens(toks: Seq[NCToken], props: String*): Unit =
+ def printTokens(toks: Seq[NCToken]): Unit =
val tbl = new NCAsciiTable()
- if props.isEmpty
- then tbl #= ("Text", "Normalized", "POS", "Stem", "Lemma",
"Start", "End", "Length", "Stopword")
- else tbl #= ("Text", "Normalized", "POS", "Stem", "Lemma",
"Start", "End", "Length", "Stopword", "Properties")
-
- toks.foreach(t =>
- if props.isEmpty then
- tbl += (
- t.getOriginalText,
- t.getNormalizedText,
- t.getPos,
- t.getStem,
- t.getLemma,
- t.getStartCharIndex,
- t.getEndCharIndex,
- t.getLength,
- t.isStopWord
- )
- else
- tbl += (
- t.getOriginalText,
- t.getNormalizedText,
- t.getPos,
- t.getStem,
- t.getLemma,
- t.getStartCharIndex,
- t.getEndCharIndex,
- t.getLength,
- t.isStopWord,
- props.map(p => s"$p=${t.get[Any](p)}").mkString("{", ", ",
"}")
- )
- )
+ tbl #= ("Origin", "Normalized", "POS", "Stem", "Lemma", "Start",
"End", "Length", "Stopword", "Properties")
+ for (t <- toks)
+ tbl += (
+ t.getOriginalText,
+ t.getNormalizedText,
+ t.getPos,
+ t.getStem,
+ t.getLemma,
+ t.getStartCharIndex,
+ t.getEndCharIndex,
+ t.getLength,
+ t.isStopWord,
+ t.keysSet().asScala.map(p =>
s"$p=${t.get[Any](p)}").mkString("[", ", ", "]")
+ )
println(s"Request: ${toks.map(_.getOriginalText).mkString(" ")}")
println(tbl.toString)
/**
*
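+ * @param req Request text, printed on the "Request:" line after the table is built.
+ * @param ents Entities to print, one table row (ID, tokens, properties) per entity.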
+ */
+ def printEntities(req: String, ents: Seq[NCEntity]): Unit =
+ val tbl = new NCAsciiTable()
+
+ tbl #= ("EntityId", "Tokens", "Properties")
+ for (e <- ents)
+ tbl += (
+ e.getId,
+ e.getTokens.asScala.map(_.getOriginalText).mkString("|"),
+ e.keysSet().asScala.map(p =>
s"$p=${e.get[Any](p)}").mkString("{", ", ", "}")
+ )
+
+ println(s"Request: $req")
+ println(tbl.toString)
+
+ /**
+ *
* @param make
* @tparam T
* @return
@@ -94,4 +94,4 @@ object NCTestUtils:
"opennlp/en-token.bin",
"opennlp/en-pos-maxent.bin",
"opennlp/en-lemmatizer.dict"
- )
+ )
\ No newline at end of file