[incubator-nlpcraft] branch NLPCRAFT-471 updated: WIP.

sergeykamov Tue, 28 Dec 2021 01:47:22 -0800

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-471
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git



The following commit(s) were added to refs/heads/NLPCRAFT-471 by this push:
     new e2d9f44  WIP.
e2d9f44 is described below

commit e2d9f445cdab093ae2e603545da75d7536b28162
Author: Sergey Kamov <[email protected]>
AuthorDate: Tue Dec 28 12:47:08 2021 +0300

    WIP.
---
 .../scala/org/apache/nlpcraft/NCPropertyMap.java   |  8 +-
 .../org/apache/nlpcraft/NCPropertyMapAdapter.java  |  5 ++
 .../enricher/impl/NCOpenNlpTokenEnricherImpl.scala | 81 -------------------
 .../opennlp/NCOpenNlpEntityParser.java}            | 39 +++++-----
 .../opennlp/impl/NCOpenNlpEntityParserImpl.scala   | 90 ++++++++++++++++++++++
 .../apache/nlpcraft/internal/util/NCUtils.scala    |  8 +-
 .../enricher/NCEnBracketsTokenEnricherSpec.scala   |  2 +-
 .../enricher/NCEnDictionaryTokenEnricherSpec.scala |  2 +-
 .../enricher/NCEnLanguageTokenEnricherSpec.scala   |  2 +-
 .../enricher/NCEnQuotesTokenEnricherSpec.scala     |  2 +-
 .../enricher/NCEnSwearWordsTokenEnricherSpec.scala |  2 +-
 ...rSpec.scala => NCOpenNlpEntityParserSpec.scala} | 48 ++++++------
 .../nlpcraft/internal/nlp/util/NCTestUtils.scala   | 72 ++++++++---------
 13 files changed, 193 insertions(+), 168 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPropertyMap.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPropertyMap.java
index 13a8119..2cb97dc 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPropertyMap.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPropertyMap.java
@@ -17,7 +17,7 @@
 
 package org.apache.nlpcraft;
 
-import java.util.Optional;
+import java.util.*;
 
 /**
  *
@@ -76,4 +76,10 @@ public interface NCPropertyMap {
      * @return
      */
     boolean remove(String key, Object obj);
+
+    /**
+     *
+     * @return
+     */
+    Set<String> keysSet();
 }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPropertyMapAdapter.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPropertyMapAdapter.java
index a82689d..c2ca3d1 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPropertyMapAdapter.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPropertyMapAdapter.java
@@ -59,4 +59,9 @@ public class NCPropertyMapAdapter implements NCPropertyMap {
     public boolean remove(String key, Object obj) {
         return map.remove(key, obj);
     }
+
+    @Override
+    public Set<String> keysSet() {
+        return map.keySet();
+    }
 }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCOpenNlpTokenEnricherImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCOpenNlpTokenEnricherImpl.scala
deleted file mode 100644
index 9ac7600..0000000
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCOpenNlpTokenEnricherImpl.scala
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *      https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nlpcraft.internal.nlp.token.enricher.impl
-
-import com.typesafe.scalalogging.LazyLogging
-import opennlp.tools.namefind.*
-import org.apache.nlpcraft.*
-import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCEnQuotesImpl.*
-import org.apache.nlpcraft.internal.util.NCUtils
-
-import java.io.*
-import java.util.{List as JList, Map as JMap}
-import scala.concurrent.ExecutionContext
-import scala.jdk.CollectionConverters.*
-import scala.util.Using
-import scala.util.control.Exception.catching
-
-object NCOpenNlpTokenEnricherImpl {
-    def apply(res: String): NCOpenNlpTokenEnricherImpl = new NCOpenNlpTokenEnricherImpl(NCUtils.getStream(res), res)
-    def apply(f: File): NCOpenNlpTokenEnricherImpl = new NCOpenNlpTokenEnricherImpl(new FileInputStream(f), f.getAbsolutePath)
-}
-/**
-  *
-  */
-class NCOpenNlpTokenEnricherImpl(is: InputStream, res: String) extends NCTokenEnricher with LazyLogging:
-    @volatile private var finder: NameFinderME = _
-
-    override def start(): Unit =
-        finder = new NameFinderME(new TokenNameFinderModel(NCUtils.getStream(res)))
-        logger.trace(s"Loaded resource: $res")
-
-    override def stop(): Unit = finder = null
-
-    override def enrich(req: NCRequest, cfg: NCModelConfig, toks: JList[NCToken]): Unit =
-        val toksSeq = toks.asScala
-        val words = toksSeq.toArray.map(_.getOriginalText)
-
-        case class Holder(start: Int, end: Int, name: String, probability: Double)
-
-        val hs = this.synchronized {
-            val hs = finder.find(words).map(p => Holder(p.getStart, p.getEnd - 1, p.getType, p.getProb) ).toSeq
-
-            finder.clearAdaptiveData()
-
-            hs
-        }
-
-        val toksSeqIdxs = toks.asScala.zipWithIndex
-
-        for ((h, hIdx) <- hs.zipWithIndex)
-            def calcIndex(getHolderIndex: Holder => Int) =
-                toksSeqIdxs.find { case (_, idx) => idx == getHolderIndex(h) } match
-                    case Some((_, idx)) => idx
-                    case None => -1
-
-            val i1 = calcIndex(_.start)
-            lazy val i2 = calcIndex(_.end)
-
-            if i1 != -1 && i2 != -1 then
-                for ((tok, idx) <- toksSeqIdxs if idx >= i1 && idx <= i2)
-                    tok.put(s"opennlp:${h.name}", tok.getOriginalText)
-                    tok.put(s"opennlp:${h.name}:probability", h.probability)
-                    tok.put(s"opennlp:${h.name}:id", hIdx + 1)
-
-                    // To avoid scala unexpected NPE from previous operation.
-                    ()
\ No newline at end of file
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricher.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpEntityParser.java
similarity index 56%
rename from nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricher.java
rename to nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpEntityParser.java
index eb6b931..d25679e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricher.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpEntityParser.java
@@ -15,49 +15,51 @@
  * limitations under the License.
  */
 
-package org.apache.nlpcraft.internal.nlp.token.enricher;
+package org.apache.nlpcraft.internal.nlp.token.parser.opennlp;
 
-import org.apache.nlpcraft.*;
-import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCOpenNlpTokenEnricherImpl;
+import org.apache.nlpcraft.NCEntity;
+import org.apache.nlpcraft.NCEntityParser;
+import org.apache.nlpcraft.NCModelConfig;
+import org.apache.nlpcraft.NCRequest;
+import org.apache.nlpcraft.NCToken;
+import org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl.NCOpenNlpEntityParserImpl;
 
 import java.io.File;
-import java.util.*;
+import java.util.List;
+import java.util.Objects;
 
 /**
- * TODO: 3 properties
- *  - opennlp:name - token text
- *  - opennlp:name:id, integer startig from 1 (for grouping multiple words tokens)
- *  - opennlp:name:probability, 0..1 probability
- *  where 'name' is element model name (from trained file or resource).
+ * Generates entity with
+ *  - ID `opennlp:name` where 'name' is element model name (from trained file or resource) and
+ *  - one property - opennlp:name:probability, where value is double between 0 and 1.
  *
  * <p>
 * Models can be download here: http://opennlp.sourceforge.net/models-1.5/ or trained.
  * <p>
  * Component is language independent.
  * <p>
- *  TODO: which constructors should we keep?
  */
-public class NCOpenNlpTokenEnricher implements NCTokenEnricher {
-    private final NCOpenNlpTokenEnricherImpl impl;
+public class NCOpenNlpEntityParser implements NCEntityParser {
+    private final NCOpenNlpEntityParserImpl impl;
 
     /**
      * @param name
      * @param modelSrc
      */
-    public NCOpenNlpTokenEnricher(String modelSrc) {
+    public NCOpenNlpEntityParser(String modelSrc) {
         Objects.requireNonNull(modelSrc, "Model source cannot be null.");
 
-        this.impl = NCOpenNlpTokenEnricherImpl.apply(modelSrc);
+        this.impl = NCOpenNlpEntityParserImpl.apply(modelSrc);
     }
 
     /**
      * @param name
      * @param modelFile
      */
-    public NCOpenNlpTokenEnricher(File modelFile) {
+    public NCOpenNlpEntityParser(File modelFile) {
         Objects.requireNonNull(modelFile, "Model file cannot be null.");
 
-        this.impl = NCOpenNlpTokenEnricherImpl.apply(modelFile);
+        this.impl = NCOpenNlpEntityParserImpl.apply(modelFile);
     }
 
     @Override
@@ -71,8 +73,7 @@ public class NCOpenNlpTokenEnricher implements NCTokenEnricher {
     }
 
     @Override
-    public void enrich(NCRequest req, NCModelConfig cfg, List<NCToken> toks) {
-        assert impl != null;
-        impl.enrich(req, cfg, toks);
+    public List<NCEntity> parse(NCRequest req, NCModelConfig cfg, List<NCToken> toks) {
+        return impl.parse(req, cfg, toks);
     }
 }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala
new file mode 100644
index 0000000..4283637
--- /dev/null
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl
+
+import com.typesafe.scalalogging.LazyLogging
+import opennlp.tools.namefind.*
+import org.apache.nlpcraft.*
+import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCEnQuotesImpl.*
+import org.apache.nlpcraft.internal.util.NCUtils
+
+import java.io.*
+import java.util
+import java.util.{Optional, List as JList, Map as JMap}
+import scala.concurrent.ExecutionContext
+import scala.jdk.CollectionConverters.*
+import scala.util.Using
+import scala.util.control.Exception.catching
+
+object NCOpenNlpEntityParserImpl {
+    def apply(res: String): NCOpenNlpEntityParserImpl = new NCOpenNlpEntityParserImpl(NCUtils.getStream(res), res)
+    def apply(f: File): NCOpenNlpEntityParserImpl = new NCOpenNlpEntityParserImpl(new FileInputStream(f), f.getAbsolutePath)
+}
+
+/**
+  *
+  */
+class NCOpenNlpEntityParserImpl(is: InputStream, res: String) extends NCEntityParser with LazyLogging:
+    @volatile private var finder: NameFinderME = _
+
+    override def start(): Unit =
+        finder = new NameFinderME(new TokenNameFinderModel(NCUtils.getStream(res)))
+        logger.trace(s"Loaded resource: $res")
+
+    override def stop(): Unit = finder = null
+
+    override def parse(req: NCRequest, cfg: NCModelConfig, toks: JList[NCToken]): JList[NCEntity]  =
+        val toksSeq = toks.asScala
+        val words = toksSeq.toArray.map(_.getOriginalText)
+
+        case class Holder(start: Int, end: Int, name: String, probability: Double)
+
+        val hs = this.synchronized {
+            try
+                finder.find(words).map(p => Holder(p.getStart, p.getEnd - 1, p.getType, p.getProb) ).toSeq
+            finally
+                finder.clearAdaptiveData()
+        }
+
+        val ents = new util.ArrayList[NCEntity]()
+
+        if hs.nonEmpty then
+            val toksIdxs = toks.asScala.zipWithIndex
+
+            for ((h, hIdx) <- hs.zipWithIndex)
+                def calcIndex(getHolderIndex: Holder => Int) =
+                    toksIdxs.find { case (_, idx) => idx == getHolderIndex(h) } match
+                        case Some((_, idx)) => idx
+                        case None => -1
+
+                val i1 = calcIndex(_.start)
+                lazy val i2 = calcIndex(_.end)
+
+                if i1 != -1 && i2 != -1 then
+                    val ent = new NCPropertyMapAdapter with NCEntity {
+                        override def getTokens: JList[NCToken] =
+                            toksIdxs.flatMap { case (t, idx) => if idx >= i1 && idx <= i2 then Some(t) else None }.asJava
+                        override def getRequestId: String = req.getRequestId
+                        override def getId: String = s"opennlp:${h.name}"
+                        override def getIndex: Int = 0 // TODO:
+                        override def getGuid: String = NCUtils.genUUID().toString
+                    }
+                    ent.put(s"opennlp:${h.name}:probability", h.probability)
+                    ents.add(ent);
+
+        util.Collections.unmodifiableList(ents) // TODO: should we wrap?
\ No newline at end of file
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
index 239e29f..77bb9d7 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
@@ -24,7 +24,7 @@ import org.apache.nlpcraft.internal.ansi.NCAnsi.*
 
 import java.io.*
 import java.net.*
-import java.util.Random
+import java.util.{Random, UUID}
 import java.util.regex.Pattern
 import java.util.zip.{GZIPInputStream, GZIPOutputStream}
 import scala.annotation.tailrec
@@ -929,6 +929,12 @@ object NCUtils extends LazyLogging:
         bodies.map(body => Future { body() } (ec)).foreach(Await.result(_, Duration.Inf))
 
     /**
+      *
+      * @return
+      */
+    def genUUID(): UUID = UUID.randomUUID()
+
+    /**
       * Gets all sequential permutations of tokens in this NLP sentence.
       *
       * For example, if NLP sentence contains "a, b, c, d" tokens, then
diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala
index 3201e06..42c3887 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala
@@ -45,7 +45,7 @@ class NCEnBracketsTokenEnricherSpec:
         val toks = parser.parse(NCTestRequest(txt), null)
         enricher.enrich(NCTestRequest(txt), null, toks)
         val seq = toks.asScala.toSeq
-        NCTestUtils.printTokens(seq, "brackets:en")
+        NCTestUtils.printTokens(seq)
         seq.zipWithIndex.foreach { case (tok, idx) =>
             require(!(tok.get[Boolean]("brackets:en") ^ brackets.contains(idx)))
         }
diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricherSpec.scala b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricherSpec.scala
index e4d7335..da11baa 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricherSpec.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricherSpec.scala
@@ -44,7 +44,7 @@ class NCEnDictionaryTokenEnricherSpec:
 
         enricher.enrich(null, null, toks.asJava)
 
-        NCTestUtils.printTokens(toks, "dict:en")
+        NCTestUtils.printTokens(toks)
 
         require(toks.head.get[Boolean]("dict:en"))
         require(!toks.last.get[Boolean]("dict:en"))
\ No newline at end of file
diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnLanguageTokenEnricherSpec.scala b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnLanguageTokenEnricherSpec.scala
index 52e3156..6003eb6 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnLanguageTokenEnricherSpec.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnLanguageTokenEnricherSpec.scala
@@ -44,7 +44,7 @@ class NCEnLanguageTokenEnricherSpec:
 
         enricher.enrich(null, null, toks.asJava)
 
-        NCTestUtils.printTokens(toks, "lang:en")
+        NCTestUtils.printTokens(toks)
 
         require(toks.head.get[Boolean]("lang:en"))
         require(!toks.last.get[Boolean]("lang:en"))
\ No newline at end of file
diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala
index 182e4a4..48fe24d 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala
@@ -45,7 +45,7 @@ class NCEnQuotesTokenEnricherSpec:
         val toks = parser.parse(NCTestRequest(txt), null)
         val toksSeq = toks.asScala.toSeq
         enricher.enrich(NCTestRequest(txt), null, toks)
-        NCTestUtils.printTokens(toksSeq, "quoted:en")
+        NCTestUtils.printTokens(toksSeq)
         toksSeq.zipWithIndex.foreach { case (tok, idx) =>
             require(!(tok.get[Boolean]("quoted:en") ^ quotes.contains(idx)))
         }
diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricherSpec.scala b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricherSpec.scala
index a913070..45ab328 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricherSpec.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricherSpec.scala
@@ -44,7 +44,7 @@ class NCEnSwearWordsTokenEnricherSpec:
 
         enricher.enrich(null, null, toks.asJava)
 
-        NCTestUtils.printTokens(toks, "swear:en")
+        NCTestUtils.printTokens(toks)
 
         require(!toks.head.get[Boolean]("swear:en"))
         require(toks.last.get[Boolean]("swear:en"))
\ No newline at end of file
diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricherSpec.scala b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpEntityParserSpec.scala
similarity index 54%
rename from nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricherSpec.scala
rename to nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpEntityParserSpec.scala
index d3ed968..c3e118b 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricherSpec.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpEntityParserSpec.scala
@@ -17,13 +17,14 @@
 
 package org.apache.nlpcraft.internal.nlp.token.enricher
 
-import org.apache.nlpcraft.NCLifecycle
-import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCOpenNlpTokenEnricherImpl
-import org.apache.nlpcraft.internal.nlp.token.parser.opennlp.NCEnOpenNlpTokenParser
+import org.apache.nlpcraft.{NCEntity, NCLifecycle}
+import org.apache.nlpcraft.internal.nlp.token.parser.opennlp.{NCEnOpenNlpTokenParser, NCOpenNlpEntityParser}
 import org.apache.nlpcraft.internal.nlp.util.*
 import org.apache.nlpcraft.internal.util.NCUtils
 import org.junit.jupiter.api.*
 
+import java.util
+import scala.collection.mutable
 import scala.concurrent.ExecutionContext
 import scala.jdk.CollectionConverters.*
 import scala.jdk.OptionConverters.RichOptional
@@ -31,16 +32,16 @@ import scala.jdk.OptionConverters.RichOptional
 /**
   *
   */
-class NCOpenNlpTokenEnricherSpec:
-    private val enrichers = scala.collection.mutable.ArrayBuffer.empty[NCOpenNlpTokenEnricher]
-    private var parser: NCEnOpenNlpTokenParser = _
+class NCOpenNlpEntityParserSpec:
+    private val eParsers = scala.collection.mutable.ArrayBuffer.empty[NCOpenNlpEntityParser]
+    private var tParser: NCEnOpenNlpTokenParser = _
 
     @BeforeEach
     def start(): Unit =
-        parser = NCTestUtils.makeAndStart(NCTestUtils.mkEnParser)
+        tParser = NCTestUtils.makeAndStart(NCTestUtils.mkEnParser)
 
         def add(res: String): Unit =
-            enrichers += NCTestUtils.makeAndStart(new NCOpenNlpTokenEnricher(s"opennlp/$res"))
+            eParsers += NCTestUtils.makeAndStart(new NCOpenNlpEntityParser(s"opennlp/$res"))
 
         NCUtils.execPar(
             // en-ner-time.bin is skipped. I can't find any working example.
@@ -52,26 +53,23 @@ class NCOpenNlpTokenEnricherSpec:
             () => add("en-ner-percentage.bin")
         )(ExecutionContext.Implicits.global)
 
-    private def check(txt: String, expected: String): Unit =
+    private def checkSingleEntity(txt: String, expected: String): Unit =
         val req = NCTestRequest(txt)
-        val toks = parser.parse(req, null)
-        enrichers.foreach(_.enrich(req, null, toks))
-        val toksSeq = toks.asScala.toSeq
+        val toks = tParser.parse(req, null)
+        val resSeq = eParsers.map(_.parse(req, null, toks).asScala.toSeq).filter(_.size == 1)
 
-        val propName = s"opennlp:$expected"
-        val propProb = s"opennlp:${expected}:probability"
-        val propId = s"opennlp:${expected}:id"
-        NCTestUtils.printTokens(toksSeq, propName, propProb, propId)
+        require(resSeq.size == 1)
 
-        require(toksSeq.exists(_.getOpt(propName).isPresent))
-        require(toksSeq.exists(_.getOpt(propProb).isPresent))
-        require(toksSeq.exists(_.getOpt(propId).isPresent))
+        val res = resSeq.head
+
+        NCTestUtils.printEntities(txt, res)
+        require(res.exists(_.getOpt(s"opennlp:${expected}:probability").isPresent))
 
     @Test
     def test(): Unit =
-        check("today", "date")
-        check("Moscow", "location")
-        check("10 is 5 % from 200", "percentage")
-        check("Tim Cook", "person")
-        check("Microsoft", "organization")
-        check("Current price is higher for 20 USA dollars", "money")
\ No newline at end of file
+        checkSingleEntity("today", "date")
+        checkSingleEntity("Moscow", "location")
+        checkSingleEntity("10 is 5 % from 200", "percentage")
+        checkSingleEntity("Tim Cook", "person")
+        checkSingleEntity("Microsoft", "organization")
+        checkSingleEntity("Current price is higher for 20 USA dollars", "money")
\ No newline at end of file
diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestUtils.scala b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestUtils.scala
index b53bb51..25bb543 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestUtils.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestUtils.scala
@@ -20,57 +20,57 @@ package org.apache.nlpcraft.internal.nlp.util
 import org.apache.nlpcraft.internal.ascii.NCAsciiTable
 import org.apache.nlpcraft.*
 import org.apache.nlpcraft.internal.nlp.token.parser.opennlp.NCEnOpenNlpTokenParser
+import scala.jdk.CollectionConverters.*
 
 /**
   *
   */
 object NCTestUtils:
     /**
-      *
-      * @param req
       * @param toks
-      * @param props
       */
-    def printTokens(toks: Seq[NCToken], props: String*): Unit =
+    def printTokens(toks: Seq[NCToken]): Unit =
         val tbl = new NCAsciiTable()
 
-        if props.isEmpty
-            then tbl #= ("Text", "Normalized", "POS", "Stem", "Lemma", "Start", "End", "Length", "Stopword")
-            else tbl #= ("Text", "Normalized", "POS", "Stem", "Lemma", "Start", "End", "Length", "Stopword", "Properties")
-
-        toks.foreach(t =>
-            if props.isEmpty then
-                tbl += (
-                    t.getOriginalText,
-                    t.getNormalizedText,
-                    t.getPos,
-                    t.getStem,
-                    t.getLemma,
-                    t.getStartCharIndex,
-                    t.getEndCharIndex,
-                    t.getLength,
-                    t.isStopWord
-                )
-            else
-                tbl += (
-                    t.getOriginalText,
-                    t.getNormalizedText,
-                    t.getPos,
-                    t.getStem,
-                    t.getLemma,
-                    t.getStartCharIndex,
-                    t.getEndCharIndex,
-                    t.getLength,
-                    t.isStopWord,
-                    props.map(p => s"$p=${t.get[Any](p)}").mkString("{", ", ", "}")
-                )
-        )
+        tbl #= ("Origin", "Normalized", "POS", "Stem", "Lemma", "Start", "End", "Length", "Stopword", "Properties")
+        for (t <- toks)
+            tbl += (
+                t.getOriginalText,
+                t.getNormalizedText,
+                t.getPos,
+                t.getStem,
+                t.getLemma,
+                t.getStartCharIndex,
+                t.getEndCharIndex,
+                t.getLength,
+                t.isStopWord,
+                t.keysSet().asScala.map(p => s"$p=${t.get[Any](p)}").mkString("[", ", ", "]")
+            )
 
         println(s"Request: ${toks.map(_.getOriginalText).mkString(" ")}")
         println(tbl.toString)
 
     /**
       *
+      * @param req
+      * @param ents
+      */
+    def printEntities(req: String, ents: Seq[NCEntity]): Unit =
+        val tbl = new NCAsciiTable()
+
+        tbl #= ("EntityId", "Tokens", "Properties")
+        for (e <- ents)
+            tbl += (
+                e.getId,
+                e.getTokens.asScala.map(_.getOriginalText).mkString("|"),
+                e.keysSet().asScala.map(p => s"$p=${e.get[Any](p)}").mkString("{", ", ", "}")
+            )
+
+        println(s"Request: $req")
+        println(tbl.toString)
+
+    /**
+      *
       * @param make
       * @tparam T
       * @return
@@ -94,4 +94,4 @@ object NCTestUtils:
         "opennlp/en-token.bin",
         "opennlp/en-pos-maxent.bin",
         "opennlp/en-lemmatizer.dict"
-    )    
+    )
\ No newline at end of file

[incubator-nlpcraft] branch NLPCRAFT-471 updated: WIP.

Reply via email to