This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-471
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-471 by this push:
new df1d1c6 WIP.
df1d1c6 is described below
commit df1d1c676689e9e5c3fda832f28327113b88133f
Author: Sergey Kamov <[email protected]>
AuthorDate: Tue Dec 28 00:31:44 2021 +0300
WIP.
---
.../nlp/token/enricher/NCOpenNlpTokenEnricher.java | 19 ++++++------
.../enricher/impl/NCOpenNlpTokenEnricherImpl.scala | 15 +++++-----
.../enricher/NCOpenNlpTokenEnricherSpec.scala | 35 ++++++++++------------
3 files changed, 34 insertions(+), 35 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricher.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricher.java
index da7742f..242c750 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricher.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricher.java
@@ -24,9 +24,12 @@ import java.io.File;
import java.util.*;
/**
- * TODO: enriches with properties:
- * - opennlp:name, values - look at constructor keys)
- * - opennlp:probability, 0..1 probability
+ * TODO: 3 properties
+ * - opennlp:name - token text
+ * - opennlp:name:id, integer starting from 1 (for grouping multi-word tokens)
+ * - opennlp:name:probability, 0..1 probability
+ * where 'name' is the model's element name.
+ *
* <p>
* Models can be downloaded here: http://opennlp.sourceforge.net/models-1.5/ or trained.
* <p>
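For reference, a minimal sketch of how these properties could be read from an enriched token, assuming an element model name of "location" and the getOpt accessor used in the spec below; the NCToken type name and the helper itself are illustrative, not part of this commit:

    import scala.jdk.OptionConverters.*

    // Prints the OpenNLP data attached to a single token, if any.
    def printOpenNlpData(tok: NCToken): Unit =
        tok.getOpt[String]("opennlp:location").toScala match
            case Some(txt) =>
                val prob = tok.getOpt[Double]("opennlp:location:probability").toScala
                val id = tok.getOpt[Int]("opennlp:location:id").toScala
                // Tokens belonging to one multi-word entity share the same id.
                println(s"text=$txt, id=$id, probability=$prob")
            case None => () // Token was not recognized by the 'location' model.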
@@ -41,22 +44,20 @@ public class NCOpenNlpTokenEnricher implements NCTokenEnricher {
* @param name
* @param modelSrc
*/
- public NCOpenNlpTokenEnricher(String name, String modelSrc) {
- Objects.requireNonNull(name, "Name cannot be null.");
+ public NCOpenNlpTokenEnricher(String modelSrc) {
Objects.requireNonNull(modelSrc, "Model source cannot be null.");
- this.impl = NCOpenNlpTokenEnricherImpl.apply(name, modelSrc);
+ this.impl = NCOpenNlpTokenEnricherImpl.apply(modelSrc);
}
/**
* @param name
* @param modelFile
*/
- public NCOpenNlpTokenEnricher(String name, File modelFile) {
- Objects.requireNonNull(name, "Name cannot be null.");
+ public NCOpenNlpTokenEnricher(File modelFile) {
Objects.requireNonNull(modelFile, "Model file cannot be null.");
- this.impl = NCOpenNlpTokenEnricherImpl.apply(name, modelFile);
+ this.impl = NCOpenNlpTokenEnricherImpl.apply(modelFile);
}
@Override
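As a usage note, a minimal sketch of constructing the enricher with the simplified constructors (the model locations below are illustrative):

    import java.io.File

    // Model loaded from a resource path.
    val byResource = new NCOpenNlpTokenEnricher("opennlp/en-ner-location.bin")

    // Model loaded from a local file.
    val byFile = new NCOpenNlpTokenEnricher(new File("/path/to/en-ner-location.bin"))

The element name is no longer passed in by the caller; it is taken from the model itself via Span.getType, as seen in the Scala implementation below.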
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCOpenNlpTokenEnricherImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCOpenNlpTokenEnricherImpl.scala
index 63f1f50..9ac7600 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCOpenNlpTokenEnricherImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCOpenNlpTokenEnricherImpl.scala
@@ -31,13 +31,13 @@ import scala.util.Using
import scala.util.control.Exception.catching
object NCOpenNlpTokenEnricherImpl {
- def apply(name: String, res: String) = new NCOpenNlpTokenEnricherImpl(name, NCUtils.getStream(res), res)
- def apply(name: String, f: File) = new NCOpenNlpTokenEnricherImpl(name, new FileInputStream(f), f.getAbsolutePath)
+ def apply(res: String): NCOpenNlpTokenEnricherImpl = new NCOpenNlpTokenEnricherImpl(NCUtils.getStream(res), res)
+ def apply(f: File): NCOpenNlpTokenEnricherImpl = new NCOpenNlpTokenEnricherImpl(new FileInputStream(f), f.getAbsolutePath)
}
/**
*
*/
-class NCOpenNlpTokenEnricherImpl(name: String, is: InputStream, res: String) extends NCTokenEnricher with LazyLogging:
+class NCOpenNlpTokenEnricherImpl(is: InputStream, res: String) extends NCTokenEnricher with LazyLogging:
@volatile private var finder: NameFinderME = _
override def start(): Unit =
@@ -53,7 +53,7 @@ class NCOpenNlpTokenEnricherImpl(name: String, is: InputStream, res: String) ext
case class Holder(start: Int, end: Int, name: String, probability: Double)
val hs = this.synchronized {
- val hs = finder.find(words).map(p => Holder(p.getStart, p.getEnd - 1, name, p.getProb)).toSeq
+ val hs = finder.find(words).map(p => Holder(p.getStart, p.getEnd - 1, p.getType, p.getProb) ).toSeq
finder.clearAdaptiveData()
@@ -62,7 +62,7 @@ class NCOpenNlpTokenEnricherImpl(name: String, is: InputStream, res: String) ext
val toksSeqIdxs = toks.asScala.zipWithIndex
- for (h <- hs)
+ for ((h, hIdx) <- hs.zipWithIndex)
def calcIndex(getHolderIndex: Holder => Int) =
toksSeqIdxs.find { case (_, idx) => idx == getHolderIndex(h) } match
case Some((_, idx)) => idx
@@ -73,8 +73,9 @@ class NCOpenNlpTokenEnricherImpl(name: String, is: InputStream, res: String) ext
if i1 != -1 && i2 != -1 then
for ((tok, idx) <- toksSeqIdxs if idx >= i1 && idx <= i2)
- tok.put(s"opennlp:name", h.name)
- tok.put(s"opennlp:probability", h.probability)
+ tok.put(s"opennlp:${h.name}", tok.getOriginalText)
+ tok.put(s"opennlp:${h.name}:probability", h.probability)
+ tok.put(s"opennlp:${h.name}:id", hIdx + 1)
// To avoid scala unexpected NPE from previous operation.
()
\ No newline at end of file
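For context, a standalone sketch of the OpenNLP calls this implementation wraps, illustrating why the explicit 'name' argument became redundant: Span.getType already carries the element name baked into the model. The model file and sample sentence are assumptions for illustration:

    import java.io.FileInputStream
    import opennlp.tools.namefind.{NameFinderME, TokenNameFinderModel}
    import scala.util.Using

    Using.resource(new FileInputStream("en-ner-location.bin")) { is =>
        val finder = new NameFinderME(new TokenNameFinderModel(is))
        val words = Array("A", "trip", "from", "Moscow", "to", "Berlin")
        for (span <- finder.find(words))
            // span.getEnd is exclusive, hence the 'getEnd - 1' adjustment in the enricher above.
            println(s"${span.getType}: ${words.slice(span.getStart, span.getEnd).mkString(" ")} (${span.getProb})")
        // Reset per-document adaptive data, as the enricher does after each request.
        finder.clearAdaptiveData()
    }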
diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricherSpec.scala b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricherSpec.scala
index 104ec2f..2b826b4 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricherSpec.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricherSpec.scala
@@ -39,36 +39,33 @@ class NCOpenNlpTokenEnricherSpec:
def start(): Unit =
parser = NCTestUtils.makeAndStart(NCTestUtils.mkEnParser)
- def add(name: String, res: String): Unit =
- enrichers += NCTestUtils.makeAndStart(new NCOpenNlpTokenEnricher(name, s"opennlp/$res"))
+ def add(res: String): Unit =
+ enrichers += NCTestUtils.makeAndStart(new NCOpenNlpTokenEnricher(s"opennlp/$res"))
NCUtils.executeParallel(
// en-ner-time.bin is skipped. I can't find any working example.
- () => add("location", "en-ner-location.bin"),
- () => add("money", "en-ner-money.bin"),
- () => add("person", "en-ner-person.bin"),
- () => add("organization", "en-ner-organization.bin"),
- () => add("date", "en-ner-date.bin"),
- () => add("percentage", "en-ner-percentage.bin")
+ () => add("en-ner-location.bin"),
+ () => add("en-ner-money.bin"),
+ () => add("en-ner-person.bin"),
+ () => add("en-ner-organization.bin"),
+ () => add("en-ner-date.bin"),
+ () => add("en-ner-percentage.bin")
)(ExecutionContext.Implicits.global)
- private def check(txt: String, expected: String*): Unit =
+ private def check(txt: String, expected: String): Unit =
val req = NCTestRequest(txt)
val toks = parser.parse(req)
enrichers.foreach(_.enrich(req, null, toks))
val toksSeq = toks.asScala.toSeq
- NCTestUtils.printTokens(toksSeq, "opennlp:name", "opennlp:probability")
+ val propName = s"opennlp:$expected"
+ val propProb = s"opennlp:${expected}:probability"
+ val propId = s"opennlp:${expected}:id"
+ NCTestUtils.printTokens(toksSeq, propName, propProb, propId)
- require(toksSeq.exists(_.getOpt("opennlp:name").isPresent))
- require(toksSeq.exists(_.getOpt("opennlp:probability").isPresent))
-
- for (exp <- expected)
- require(toksSeq.exists(t =>
- t.getOpt[String]("opennlp:name").toScala match
- case Some(v) => v == exp
- case None => false
- ))
+ require(toksSeq.exists(_.getOpt(propName).isPresent))
+ require(toksSeq.exists(_.getOpt(propProb).isPresent))
+ require(toksSeq.exists(_.getOpt(propId).isPresent))
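For illustration, the reworked check helper takes a single expected element name per call; the sample sentences and element names below are assumptions rather than content of this commit:

    // Each call asserts that at least one token carries the three
    // opennlp:<name> properties derived from the given element name.
    check("A trip from Moscow to Berlin.", "location")
    check("It costs 100 dollars.", "money")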
@Test
def test(): Unit =