This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-471
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit eb07d388e24ecfbf39277fdd2418427821714598
Author: Sergey Kamov <[email protected]>
AuthorDate: Mon Dec 27 16:00:05 2021 +0300

    WIP.
---
 .../nlp/token/enricher/NCOpenNlpTokenEnricher.java |  66 ++++++++++++++++
 .../enricher/impl/NCOpenNlpTokenEnricherImpl.scala |  77 +++++++++++++++++++
 .../enricher/NCOpenNlpTokenEnricherSpec.scala      |  84 +++++++++++++++++++++
 .../nlpcraft/internal/nlp/util/NCTestRequest.scala |   2 +-
 .../src/test/resources/opennlp/en-ner-date.bin     | Bin 0 -> 5030307 bytes
 .../src/test/resources/opennlp/en-ner-location.bin | Bin 0 -> 5110658 bytes
 .../src/test/resources/opennlp/en-ner-money.bin    | Bin 0 -> 4806234 bytes
 .../test/resources/opennlp/en-ner-organization.bin | Bin 0 -> 5297172 bytes
 .../test/resources/opennlp/en-ner-percentage.bin   | Bin 0 -> 4728645 bytes
 .../src/test/resources/opennlp/en-ner-person.bin   | Bin 0 -> 5207953 bytes
 10 files changed, 228 insertions(+), 1 deletion(-)

diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricher.java
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricher.java
new file mode 100644
index 0000000..5db8704
--- /dev/null
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricher.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.internal.nlp.token.enricher;
+
+import org.apache.nlpcraft.NCModelConfig;
+import org.apache.nlpcraft.NCRequest;
+import org.apache.nlpcraft.NCToken;
+import org.apache.nlpcraft.NCTokenEnricher;
+import 
org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCOpenNlpTokenEnricherImpl;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+
+/**
+ * TODO: enriches with properties:
+ *  - opennlp:name (for possible values, see the constructor map keys)
+ *  - opennlp:probability, 0..1 probability
+ *
+ *  Models can be downloaded here: http://opennlp.sourceforge.net/models-1.5/
+ */
+public class NCOpenNlpTokenEnricher implements NCTokenEnricher {
+    private final NCOpenNlpTokenEnricherImpl impl;
+
+    /**
+     * Map key is property name, value is model definition via path, resource 
or URL.
+     *
+     * @param models Map of names to model definitions (path, resource or URL); must not be null.
+     */
+    public NCOpenNlpTokenEnricher(Map<String, String> models) {
+        Objects.requireNonNull(models, "Models cannot be null.");
+
+        this.impl = new NCOpenNlpTokenEnricherImpl(models);
+    }
+
+    @Override
+    public void start() {
+        impl.start();
+    }
+
+    @Override
+    public void stop() {
+        impl.stop();
+    }
+
+    @Override
+    public void enrich(NCRequest req, NCModelConfig cfg, List<NCToken> toks) {
+        assert impl != null;
+        impl.enrich(req, cfg, toks);
+    }
+}
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCOpenNlpTokenEnricherImpl.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCOpenNlpTokenEnricherImpl.scala
new file mode 100644
index 0000000..81b7b37
--- /dev/null
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCOpenNlpTokenEnricherImpl.scala
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.internal.nlp.token.enricher.impl
+
+import com.typesafe.scalalogging.LazyLogging
+import opennlp.tools.namefind.*
+import org.apache.nlpcraft.*
+import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCEnQuotesImpl.*
+import org.apache.nlpcraft.internal.util.NCUtils
+
+import java.io.*
+import java.util.{List as JList, Map as JMap}
+import scala.concurrent.ExecutionContext
+import scala.jdk.CollectionConverters.*
+import scala.util.Using
+import scala.util.control.Exception.catching
+
+/**
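+  * Token enricher implementation that tags tokens with entities found by the configured OpenNLP name finder models.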
+  */
+class NCOpenNlpTokenEnricherImpl(models: JMap[String, String]) extends 
NCTokenEnricher with LazyLogging:
+    @volatile private var nerFinders: Map[String, NameFinderME] = _
+
+    override def start(): Unit = nerFinders =
+        models.asScala.map { case (name, path) => name -> new NameFinderME(new 
TokenNameFinderModel(NCUtils.getStream(path))) }.toMap
+
+    override def stop(): Unit = nerFinders = null
+
+    override def enrich(req: NCRequest, cfg: NCModelConfig, toks: 
JList[NCToken]): Unit =
+        val toksSeq = toks.asScala
+        val words = toksSeq.toArray.map(_.getOriginalText)
+
+        case class Holder(start: Int, end: Int, name: String, probability: 
Double)
+
+        val hs =
+            this.
+                synchronized {
+                    val res = nerFinders.
+                        flatMap {
+                            case (name, finder) =>
+                                finder.find(words).map(p => Holder(p.getStart, 
p.getEnd - 1, name, p.getProb)).toSeq
+                        }
+                    nerFinders.values.foreach(_.clearAdaptiveData())
+                    res
+                }
+
+        if hs.nonEmpty then
+            val toksSeqIdxs = toks.asScala.zipWithIndex
+            for (h <- hs)
+                val t1 = toksSeqIdxs.find { case (_, idx) => idx == h.start }
+                val t2 = toksSeqIdxs.find { case (_, idx) => idx == h.end }
+
+                if t1.nonEmpty && t2.nonEmpty then
+                    val i1 = t1.get._2
+                    val i2 = t2.get._2
+
+                    for ((tok, idx) <- toksSeqIdxs if idx >= i1 && idx <= i2)
+                        tok.put(s"opennlp:name", h.name)
+                        tok.put(s"opennlp:probability", h.probability)
+
+                        // Avoids an unexpected Scala NPE from the previous 
operation.
+                        ()
diff --git 
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricherSpec.scala
 
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricherSpec.scala
new file mode 100644
index 0000000..3a3ea55
--- /dev/null
+++ 
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricherSpec.scala
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.internal.nlp.token.enricher
+
+import 
org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCOpenNlpTokenEnricherImpl
+import 
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.NCEnOpenNlpTokenParser
+import org.apache.nlpcraft.internal.nlp.util.*
+import org.junit.jupiter.api.*
+
+import scala.jdk.CollectionConverters.*
+import scala.jdk.OptionConverters.RichOptional
+
+/**
+  * Tests for NCOpenNlpTokenEnricher.
+  */
+class NCOpenNlpTokenEnricherSpec:
+    private var parser: NCEnOpenNlpTokenParser = _
+    private var enricher: NCOpenNlpTokenEnricher = _
+
+    @BeforeEach
+    def start(): Unit =
+        parser = NCTestUtils.makeAndStart(
+            new NCEnOpenNlpTokenParser(
+                "opennlp/en-token.bin",
+                "opennlp/en-pos-maxent.bin",
+                "opennlp/en-lemmatizer.dict"
+            )
+        )
+        enricher = NCTestUtils.makeAndStart(
+            // en-ner-time.bin is skipped: no working example could be found.
+            new NCOpenNlpTokenEnricher(
+                Map(
+                    "location" -> "opennlp/en-ner-location.bin",
+                    "money" -> "opennlp/en-ner-money.bin",
+                    "person" -> "opennlp/en-ner-person.bin",
+                    "organization" -> "opennlp/en-ner-organization.bin",
+                    "date" -> "opennlp/en-ner-date.bin",
+                    "percentage" -> "opennlp/en-ner-percentage.bin"
+                ).asJava
+            )
+        )
+
+    private def check(txt: String, expected: String*): Unit =
+        val req = NCTestRequest(txt)
+        val toks = parser.parse(req)
+        val toksSeq = toks.asScala.toSeq
+
+        enricher.enrich(req, null, toks)
+
+        NCTestUtils.printTokens(toksSeq, "opennlp:name", "opennlp:probability")
+
+        require(toksSeq.exists(_.getOpt("opennlp:name").isPresent))
+        require(toksSeq.exists(_.getOpt("opennlp:probability").isPresent))
+
+        for (exp <- expected)
+            require(toksSeq.exists(t =>
+                t.getOpt[String]("opennlp:name").toScala match
+                    case Some(v) => v == exp
+                    case None => false
+            ))
+
+    @Test
+    def test(): Unit =
+        check("today", "date")
+        check("Moscow", "location")
+        check("10 is 5 % from 200", "percentage")
+        check("Tim Cook", "person")
+        check("Microsoft", "organization")
+        check("Current price is higher for 20 USA dollars", "money")
\ No newline at end of file
diff --git 
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestRequest.scala
 
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestRequest.scala
index 190bb77..30670a0 100644
--- 
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestRequest.scala
+++ 
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestRequest.scala
@@ -41,7 +41,7 @@ case class NCTestRequest(
 ) extends NCRequest:
     override def getUserId: String = userId
     override def getRequestId: String = reqId
-    override def getNormalizedText: String = txt.toLowerCase
+    override def getNormalizedText: String = txt.split(" 
").map(_.strip).filter(_.nonEmpty).mkString(" ")
     override def getOriginalText: String = txt
     override def getReceiveTimestamp: Long = ts
     override def getUserAgent: String = userAgent
diff --git a/nlpcraft/src/test/resources/opennlp/en-ner-date.bin 
b/nlpcraft/src/test/resources/opennlp/en-ner-date.bin
new file mode 100644
index 0000000..a69923a
Binary files /dev/null and 
b/nlpcraft/src/test/resources/opennlp/en-ner-date.bin differ
diff --git a/nlpcraft/src/test/resources/opennlp/en-ner-location.bin 
b/nlpcraft/src/test/resources/opennlp/en-ner-location.bin
new file mode 100644
index 0000000..f3788bc
Binary files /dev/null and 
b/nlpcraft/src/test/resources/opennlp/en-ner-location.bin differ
diff --git a/nlpcraft/src/test/resources/opennlp/en-ner-money.bin 
b/nlpcraft/src/test/resources/opennlp/en-ner-money.bin
new file mode 100644
index 0000000..2431e0f
Binary files /dev/null and 
b/nlpcraft/src/test/resources/opennlp/en-ner-money.bin differ
diff --git a/nlpcraft/src/test/resources/opennlp/en-ner-organization.bin 
b/nlpcraft/src/test/resources/opennlp/en-ner-organization.bin
new file mode 100644
index 0000000..1fb6d9f
Binary files /dev/null and 
b/nlpcraft/src/test/resources/opennlp/en-ner-organization.bin differ
diff --git a/nlpcraft/src/test/resources/opennlp/en-ner-percentage.bin 
b/nlpcraft/src/test/resources/opennlp/en-ner-percentage.bin
new file mode 100644
index 0000000..98cee1a
Binary files /dev/null and 
b/nlpcraft/src/test/resources/opennlp/en-ner-percentage.bin differ
diff --git a/nlpcraft/src/test/resources/opennlp/en-ner-person.bin 
b/nlpcraft/src/test/resources/opennlp/en-ner-person.bin
new file mode 100644
index 0000000..2f68318
Binary files /dev/null and 
b/nlpcraft/src/test/resources/opennlp/en-ner-person.bin differ

Reply via email to