This is an automated email from the ASF dual-hosted git repository. sergeykamov pushed a commit to branch NLPCRAFT-471 in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit eb07d388e24ecfbf39277fdd2418427821714598 Author: Sergey Kamov <[email protected]> AuthorDate: Mon Dec 27 16:00:05 2021 +0300 WIP. --- .../nlp/token/enricher/NCOpenNlpTokenEnricher.java | 66 ++++++++++++++++ .../enricher/impl/NCOpenNlpTokenEnricherImpl.scala | 77 +++++++++++++++++++ .../enricher/NCOpenNlpTokenEnricherSpec.scala | 84 +++++++++++++++++++++ .../nlpcraft/internal/nlp/util/NCTestRequest.scala | 2 +- .../src/test/resources/opennlp/en-ner-date.bin | Bin 0 -> 5030307 bytes .../src/test/resources/opennlp/en-ner-location.bin | Bin 0 -> 5110658 bytes .../src/test/resources/opennlp/en-ner-money.bin | Bin 0 -> 4806234 bytes .../test/resources/opennlp/en-ner-organization.bin | Bin 0 -> 5297172 bytes .../test/resources/opennlp/en-ner-percentage.bin | Bin 0 -> 4728645 bytes .../src/test/resources/opennlp/en-ner-person.bin | Bin 0 -> 5207953 bytes 10 files changed, 228 insertions(+), 1 deletion(-) diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricher.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricher.java new file mode 100644 index 0000000..5db8704 --- /dev/null +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricher.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nlpcraft.internal.nlp.token.enricher; + +import org.apache.nlpcraft.NCModelConfig; +import org.apache.nlpcraft.NCRequest; +import org.apache.nlpcraft.NCToken; +import org.apache.nlpcraft.NCTokenEnricher; +import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCOpenNlpTokenEnricherImpl; + +import java.util.List; +import java.util.Map; +import java.util.Objects; + +/** + * TODO: enriches tokens with the following properties: + * - opennlp:name, one of the keys of the map passed to the constructor + * - opennlp:probability, probability of the match, 0..1 + * + * Models can be downloaded here: http://opennlp.sourceforge.net/models-1.5/ + */ +public class NCOpenNlpTokenEnricher implements NCTokenEnricher { + private final NCOpenNlpTokenEnricherImpl impl; + + /** + * Map key is the property name; map value is the model location given as a path, resource or URL. 
+ * + * @param models + */ + public NCOpenNlpTokenEnricher(Map<String, String> models) { + Objects.requireNonNull(models, "Models cannot be null."); + + this.impl = new NCOpenNlpTokenEnricherImpl(models); + } + + @Override + public void start() { + impl.start(); + } + + @Override + public void stop() { + impl.stop(); + } + + @Override + public void enrich(NCRequest req, NCModelConfig cfg, List<NCToken> toks) { + assert impl != null; + impl.enrich(req, cfg, toks); + } +} diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCOpenNlpTokenEnricherImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCOpenNlpTokenEnricherImpl.scala new file mode 100644 index 0000000..81b7b37 --- /dev/null +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCOpenNlpTokenEnricherImpl.scala @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nlpcraft.internal.nlp.token.enricher.impl + +import com.typesafe.scalalogging.LazyLogging +import opennlp.tools.namefind.* +import org.apache.nlpcraft.* +import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCEnQuotesImpl.* +import org.apache.nlpcraft.internal.util.NCUtils + +import java.io.* +import java.util.{List as JList, Map as JMap} +import scala.concurrent.ExecutionContext +import scala.jdk.CollectionConverters.* +import scala.util.Using +import scala.util.control.Exception.catching + +/** + * Runs every configured OpenNLP name finder over the tokens' original text and writes `opennlp:name` and `opennlp:probability` into each token covered by a found span. + */ +class NCOpenNlpTokenEnricherImpl(models: JMap[String, String]) extends NCTokenEnricher with LazyLogging: + @volatile private var nerFinders: Map[String, NameFinderME] = _ + + override def start(): Unit = nerFinders = + models.asScala.map { case (name, path) => name -> new NameFinderME(new TokenNameFinderModel(NCUtils.getStream(path))) }.toMap + + override def stop(): Unit = nerFinders = null + + override def enrich(req: NCRequest, cfg: NCModelConfig, toks: JList[NCToken]): Unit = + val toksSeq = toks.asScala + val words = toksSeq.toArray.map(_.getOriginalText) + + case class Holder(start: Int, end: Int, name: String, probability: Double) + + val hs = + this. + synchronized { + val res = nerFinders. + flatMap { + case (name, finder) => + finder.find(words).map(p => Holder(p.getStart, p.getEnd - 1, name, p.getProb)).toSeq + } + nerFinders.values.foreach(_.clearAdaptiveData()) + res + } + + if hs.nonEmpty then + val toksSeqIdxs = toks.asScala.zipWithIndex + for (h <- hs) + val t1 = toksSeqIdxs.find { case (_, idx) => idx == h.start } + val t2 = toksSeqIdxs.find { case (_, idx) => idx == h.end } + + if t1.nonEmpty && t2.nonEmpty then + val i1 = t1.get._2 + val i2 = t2.get._2 + + for ((tok, idx) <- toksSeqIdxs if idx >= i1 && idx <= i2) + tok.put(s"opennlp:name", h.name) + tok.put(s"opennlp:probability", h.probability) + + // Explicit unit value to avoid an unexpected Scala NPE from the previous operation. 
+ () diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricherSpec.scala b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricherSpec.scala new file mode 100644 index 0000000..3a3ea55 --- /dev/null +++ b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCOpenNlpTokenEnricherSpec.scala @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nlpcraft.internal.nlp.token.enricher + +import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCOpenNlpTokenEnricherImpl +import org.apache.nlpcraft.internal.nlp.token.parser.opennlp.NCEnOpenNlpTokenParser +import org.apache.nlpcraft.internal.nlp.util.* +import org.junit.jupiter.api.* + +import scala.jdk.CollectionConverters.* +import scala.jdk.OptionConverters.RichOptional + +/** + * Verifies that NCOpenNlpTokenEnricher sets `opennlp:name` and `opennlp:probability` on parsed tokens for sample inputs. + */ +class NCOpenNlpTokenEnricherSpec: + private var parser: NCEnOpenNlpTokenParser = _ + private var enricher: NCOpenNlpTokenEnricher = _ + + @BeforeEach + def start(): Unit = + parser = NCTestUtils.makeAndStart( + new NCEnOpenNlpTokenParser( + "opennlp/en-token.bin", + "opennlp/en-pos-maxent.bin", + "opennlp/en-lemmatizer.dict" + ) + ) + enricher = NCTestUtils.makeAndStart( + // en-ner-time.bin is skipped: no working example was found for it. + new NCOpenNlpTokenEnricher( + Map( + "location" -> "opennlp/en-ner-location.bin", + "money" -> "opennlp/en-ner-money.bin", + "person" -> "opennlp/en-ner-person.bin", + "organization" -> "opennlp/en-ner-organization.bin", + "date" -> "opennlp/en-ner-date.bin", + "percentage" -> "opennlp/en-ner-percentage.bin" + ).asJava + ) + ) + + private def check(txt: String, expected: String*): Unit = + val req = NCTestRequest(txt) + val toks = parser.parse(req) + val toksSeq = toks.asScala.toSeq + + enricher.enrich(req, null, toks) + + NCTestUtils.printTokens(toksSeq, "opennlp:name", "opennlp:probability") + + require(toksSeq.exists(_.getOpt("opennlp:name").isPresent)) + require(toksSeq.exists(_.getOpt("opennlp:probability").isPresent)) + + for (exp <- expected) + require(toksSeq.exists(t => + t.getOpt[String]("opennlp:name").toScala match + case Some(v) => v == exp + case None => false + )) + + @Test + def test(): Unit = + check("today", "date") + check("Moscow", "location") + check("10 is 5 % from 200", "percentage") + check("Tim Cook", "person") + check("Microsoft", "organization") + check("Current price is higher for 20 
USA dollars", "money") \ No newline at end of file diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestRequest.scala b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestRequest.scala index 190bb77..30670a0 100644 --- a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestRequest.scala +++ b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestRequest.scala @@ -41,7 +41,7 @@ case class NCTestRequest( ) extends NCRequest: override def getUserId: String = userId override def getRequestId: String = reqId - override def getNormalizedText: String = txt.toLowerCase + override def getNormalizedText: String = txt.split(" ").map(_.strip).filter(_.nonEmpty).mkString(" ") override def getOriginalText: String = txt override def getReceiveTimestamp: Long = ts override def getUserAgent: String = userAgent diff --git a/nlpcraft/src/test/resources/opennlp/en-ner-date.bin b/nlpcraft/src/test/resources/opennlp/en-ner-date.bin new file mode 100644 index 0000000..a69923a Binary files /dev/null and b/nlpcraft/src/test/resources/opennlp/en-ner-date.bin differ diff --git a/nlpcraft/src/test/resources/opennlp/en-ner-location.bin b/nlpcraft/src/test/resources/opennlp/en-ner-location.bin new file mode 100644 index 0000000..f3788bc Binary files /dev/null and b/nlpcraft/src/test/resources/opennlp/en-ner-location.bin differ diff --git a/nlpcraft/src/test/resources/opennlp/en-ner-money.bin b/nlpcraft/src/test/resources/opennlp/en-ner-money.bin new file mode 100644 index 0000000..2431e0f Binary files /dev/null and b/nlpcraft/src/test/resources/opennlp/en-ner-money.bin differ diff --git a/nlpcraft/src/test/resources/opennlp/en-ner-organization.bin b/nlpcraft/src/test/resources/opennlp/en-ner-organization.bin new file mode 100644 index 0000000..1fb6d9f Binary files /dev/null and b/nlpcraft/src/test/resources/opennlp/en-ner-organization.bin differ diff --git a/nlpcraft/src/test/resources/opennlp/en-ner-percentage.bin 
b/nlpcraft/src/test/resources/opennlp/en-ner-percentage.bin new file mode 100644 index 0000000..98cee1a Binary files /dev/null and b/nlpcraft/src/test/resources/opennlp/en-ner-percentage.bin differ diff --git a/nlpcraft/src/test/resources/opennlp/en-ner-person.bin b/nlpcraft/src/test/resources/opennlp/en-ner-person.bin new file mode 100644 index 0000000..2f68318 Binary files /dev/null and b/nlpcraft/src/test/resources/opennlp/en-ner-person.bin differ
