This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-471
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-471 by this push:
new ed891d3 WIP.
ed891d3 is described below
commit ed891d30c87b89dd32aa9267f9b44a0edbf477dd
Author: Sergey Kamov <[email protected]>
AuthorDate: Mon Dec 27 19:54:40 2021 +0300
WIP.
---
.../nlp/token/enricher/NCOpenNlpTokenEnricher.java | 37 ++++----------
.../enricher/impl/NCOpenNlpTokenEnricherImpl.scala | 59 +++++++++++-----------
.../enricher/NCOpenNlpTokenEnricherSpec.scala | 33 ++++++------
3 files changed, 56 insertions(+), 73 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricher.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricher.java
index 67f5771..da7742f 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricher.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricher.java
@@ -17,45 +17,27 @@
package org.apache.nlpcraft.internal.nlp.token.enricher;
-import org.apache.nlpcraft.NCModelConfig;
-import org.apache.nlpcraft.NCRequest;
-import org.apache.nlpcraft.NCToken;
-import org.apache.nlpcraft.NCTokenEnricher;
+import org.apache.nlpcraft.*;
import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCOpenNlpTokenEnricherImpl;
import java.io.File;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Objects;
+import java.util.*;
/**
* TODO: enriches with properties:
* - opennlp:name, values - look at constructor keys)
* - opennlp:probability, 0..1 probability
- *
- * Models can be downloaded here: http://opennlp.sourceforge.net/models-1.5/ or trained.
- *
- * Component is language independent.
- *
+ * <p>
+ * Models can be downloaded here: http://opennlp.sourceforge.net/models-1.5/ or trained.
+ * <p>
+ * Component is language independent.
+ * <p>
* TODO: which constructors should we keep?
*/
public class NCOpenNlpTokenEnricher implements NCTokenEnricher {
private final NCOpenNlpTokenEnricherImpl impl;
/**
- * Map key is property name, value is model definition via path, resource or URL.
- *
- * @param models
- */
- public NCOpenNlpTokenEnricher(Map<String, String> models) {
- Objects.requireNonNull(models, "Models cannot be null.");
-
- this.impl = new NCOpenNlpTokenEnricherImpl(models);
- }
-
- /**
- *
* @param name
* @param modelSrc
*/
@@ -63,11 +45,10 @@ public class NCOpenNlpTokenEnricher implements NCTokenEnricher {
Objects.requireNonNull(name, "Name cannot be null.");
Objects.requireNonNull(modelSrc, "Model source cannot be null.");
- this.impl = new NCOpenNlpTokenEnricherImpl(new HashMap<>() {{ put(name, modelSrc); }});
+ this.impl = NCOpenNlpTokenEnricherImpl.apply(name, modelSrc);
}
/**
- *
* @param name
* @param modelFile
*/
@@ -75,7 +56,7 @@ public class NCOpenNlpTokenEnricher implements NCTokenEnricher {
Objects.requireNonNull(name, "Name cannot be null.");
Objects.requireNonNull(modelFile, "Model file cannot be null.");
- this.impl = new NCOpenNlpTokenEnricherImpl(new HashMap<>() {{ put(name, modelFile.getAbsolutePath()); }});
+ this.impl = NCOpenNlpTokenEnricherImpl.apply(name, modelFile);
}
@Override
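For context, a minimal usage sketch of the single-model constructor kept by this change (one enricher instance per OpenNLP model, as in the updated test spec below). The object name and model path are illustrative assumptions, not part of this commit:

    import org.apache.nlpcraft.internal.nlp.token.enricher.NCOpenNlpTokenEnricher

    object NCOpenNlpEnricherSketch:
        def main(args: Array[String]): Unit =
            // Hypothetical classpath resource; any en-ner-*.bin model from
            // http://opennlp.sourceforge.net/models-1.5/ placed under opennlp/ would work.
            val location = new NCOpenNlpTokenEnricher("location", "opennlp/en-ner-location.bin")
            // Once started (see NCTestUtils.makeAndStart in the spec below), enrich(req, cfg, toks)
            // puts 'opennlp:name' and 'opennlp:probability' on the tokens matched by this model.
            println(s"Constructed enricher: $location")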
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCOpenNlpTokenEnricherImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCOpenNlpTokenEnricherImpl.scala
index b8a9e00..63f1f50 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCOpenNlpTokenEnricherImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCOpenNlpTokenEnricherImpl.scala
@@ -30,21 +30,21 @@ import scala.jdk.CollectionConverters.*
import scala.util.Using
import scala.util.control.Exception.catching
+object NCOpenNlpTokenEnricherImpl {
+    def apply(name: String, res: String) = new NCOpenNlpTokenEnricherImpl(name, NCUtils.getStream(res), res)
+    def apply(name: String, f: File) = new NCOpenNlpTokenEnricherImpl(name, new FileInputStream(f), f.getAbsolutePath)
+}
/**
*
*/
-class NCOpenNlpTokenEnricherImpl(models: JMap[String, String]) extends NCTokenEnricher with LazyLogging:
- @volatile private var nerFinders: Map[String, NameFinderME] = _
+class NCOpenNlpTokenEnricherImpl(name: String, is: InputStream, res: String) extends NCTokenEnricher with LazyLogging:
+ @volatile private var finder: NameFinderME = _
- override def start(): Unit = nerFinders =
- models.asScala.map {
- case (name, res) =>
- val mdl = new NameFinderME(new TokenNameFinderModel(NCUtils.getStream(res)))
- logger.trace(s"Loaded resource: $res")
- name -> mdl
- }.toMap
+ override def start(): Unit =
+ finder = new NameFinderME(new TokenNameFinderModel(NCUtils.getStream(res)))
+ logger.trace(s"Loaded resource: $res")
- override def stop(): Unit = nerFinders = null
+ override def stop(): Unit = finder = null
override def enrich(req: NCRequest, cfg: NCModelConfig, toks: JList[NCToken]): Unit =
val toksSeq = toks.asScala
@@ -53,29 +53,28 @@ class NCOpenNlpTokenEnricherImpl(models: JMap[String, String]) extends NCTokenEn
        case class Holder(start: Int, end: Int, name: String, probability: Double)
val hs = this.synchronized {
- val hs = nerFinders.
- flatMap { case (name, finder) =>
- finder.find(words).map(p => Holder(p.getStart, p.getEnd - 1, name, p.getProb)).toSeq
- }
- nerFinders.values.foreach(_.clearAdaptiveData())
+ val hs = finder.find(words).map(p => Holder(p.getStart, p.getEnd - 1, name, p.getProb)).toSeq
+
+ finder.clearAdaptiveData()
+
hs
}
- if hs.nonEmpty then
- val toksSeqIdxs = toks.asScala.zipWithIndex
- for (h <- hs)
- def calcIndex(getHolderIndex: Holder => Int) =
- toksSeqIdxs.find { case (_, idx) => idx == getHolderIndex(h) } match
- case Some((_, idx)) => idx
- case None => -1
+ val toksSeqIdxs = toks.asScala.zipWithIndex
+
+ for (h <- hs)
+ def calcIndex(getHolderIndex: Holder => Int) =
+ toksSeqIdxs.find { case (_, idx) => idx == getHolderIndex(h) } match
+ case Some((_, idx)) => idx
+ case None => -1
- val i1 = calcIndex(_.start)
- lazy val i2 = calcIndex(_.end)
+ val i1 = calcIndex(_.start)
+ lazy val i2 = calcIndex(_.end)
- if i1 != -1 && i2 != -1 then
- for ((tok, idx) <- toksSeqIdxs if idx >= i1 && idx <= i2)
- tok.put(s"opennlp:name", h.name)
- tok.put(s"opennlp:probability", h.probability)
+ if i1 != -1 && i2 != -1 then
+ for ((tok, idx) <- toksSeqIdxs if idx >= i1 && idx <= i2)
+ tok.put(s"opennlp:name", h.name)
+ tok.put(s"opennlp:probability", h.probability)
- // To avoid scala unexpected NPE from previous operation.
- ()
\ No newline at end of file
+ // To avoid scala unexpected NPE from previous operation.
+ ()
\ No newline at end of file
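The refactoring above replaces the multi-model map with one NameFinderME per enricher instance; a short sketch of the two new factory overloads in the companion object follows (the file path is a placeholder, not from the commit):

    import java.io.File
    import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCOpenNlpTokenEnricherImpl

    object NCOpenNlpImplFactorySketch:
        def main(args: Array[String]): Unit =
            // Classpath-resource variant: the stream is resolved via NCUtils.getStream(res).
            val byResource = NCOpenNlpTokenEnricherImpl("person", "opennlp/en-ner-person.bin")
            // File variant: a FileInputStream is opened over the given model file (placeholder path).
            val byFile = NCOpenNlpTokenEnricherImpl("person", new File("/tmp/en-ner-person.bin"))
            // Each instance loads its own model in start() and drops it in stop().
            byResource.start()
            byResource.stop()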
diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricherSpec.scala b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricherSpec.scala
index aeb3d66..2da2c91 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricherSpec.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricherSpec.scala
@@ -17,11 +17,14 @@
package org.apache.nlpcraft.internal.nlp.token.enricher
+import org.apache.nlpcraft.NCLifecycle
import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCOpenNlpTokenEnricherImpl
import org.apache.nlpcraft.internal.nlp.token.parser.opennlp.NCEnOpenNlpTokenParser
import org.apache.nlpcraft.internal.nlp.util.*
+import org.apache.nlpcraft.internal.util.NCUtils
import org.junit.jupiter.api.*
+import scala.concurrent.ExecutionContext
import scala.jdk.CollectionConverters.*
import scala.jdk.OptionConverters.RichOptional
@@ -29,30 +32,30 @@ import scala.jdk.OptionConverters.RichOptional
*
*/
class NCOpenNlpTokenEnricherSpec:
+ private val enrichers = scala.collection.mutable.ArrayBuffer.empty[NCOpenNlpTokenEnricher]
private var parser: NCEnOpenNlpTokenParser = _
- private var enricher: NCOpenNlpTokenEnricher = _
+
@BeforeEach
def start(): Unit =
parser = NCTestUtils.makeAndStart(NCTestUtils.mkEnParser)
- enricher = NCTestUtils.makeAndStart(
- // en-ner-time.bin is skipped. I can't find any working example.
- new NCOpenNlpTokenEnricher(
- Map(
- "location" -> "opennlp/en-ner-location.bin",
- "money" -> "opennlp/en-ner-money.bin",
- "person" -> "opennlp/en-ner-person.bin",
- "organization" -> "opennlp/en-ner-organization.bin",
- "date" -> "opennlp/en-ner-date.bin",
- "percentage" -> "opennlp/en-ner-percentage.bin"
- ).asJava
- )
- )
+
+ def add(name: String, res: String): Unit =
+ enrichers += NCTestUtils.makeAndStart(new NCOpenNlpTokenEnricher(name, s"opennlp/$res"))
+
+ NCUtils.executeParallel(
+ () => add("location", "en-ner-location.bin"),
+ () => add("money", "en-ner-money.bin"),
+ () => add("person", "en-ner-person.bin"),
+ () => add("organization", "en-ner-organization.bin"),
+ () => add("date", "en-ner-date.bin"),
+ () => add("percentage", "en-ner-percentage.bin")
+ )(ExecutionContext.Implicits.global)
private def check(txt: String, expected: String*): Unit =
val req = NCTestRequest(txt)
val toks = parser.parse(req)
- enricher.enrich(req, null, toks)
+ enrichers.foreach(_.enrich(req, null, toks))
val toksSeq = toks.asScala.toSeq
NCTestUtils.printTokens(toksSeq, "opennlp:name", "opennlp:probability")