This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-472
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-472 by this push:
new dd246f7 WIP.
dd246f7 is described below
commit dd246f77087c8cefa104ba288c6db69af578a150
Author: Sergey Kamov <[email protected]>
AuthorDate: Wed Jan 5 13:04:40 2022 +0300
WIP.
---
nlpcraft-stanford/pom.xml | 24 +++++
.../parser/stanford/NCStanfordEntityParser.java | 26 ++++-
.../stanford/impl/NCStanfordEntityParserImpl.scala | 119 +++++++++++----------
.../parser/stanford/NCStanfordTokenParser.java | 18 ++--
.../parser/stanford/impl/NCStanfordNlpImpl.scala | 54 +++-------
.../stanford/NCStanfordEntityParserSpec.scala | 51 +++++++++
.../stanford/NCStanfordTokenParserSpec.scala | 61 +++++++++++
.../nlpcraft/nlp/utils/NCStanfordTestConfig.scala} | 35 ++----
8 files changed, 257 insertions(+), 131 deletions(-)
diff --git a/nlpcraft-stanford/pom.xml b/nlpcraft-stanford/pom.xml
index 2034364..c6300a2 100644
--- a/nlpcraft-stanford/pom.xml
+++ b/nlpcraft-stanford/pom.xml
@@ -46,6 +46,30 @@
<artifactId>stanford-corenlp</artifactId>
<classifier>models</classifier>
</dependency>
+
+ <!--
+ JUnit & ScalaTest dependencies.
+ ===============================
+ -->
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>nlpcraft</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.junit.jupiter</groupId>
+ <artifactId>junit-jupiter-api</artifactId>
+ <version>${junit.ver}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.scalatest</groupId>
+ <artifactId>scalatest_${scala.major.ver}</artifactId>
+ <version>${scalatest.ver}</version>
+ <scope>test</scope>
+ </dependency>
</dependencies>
<build>
diff --git a/nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/entity/parser/stanford/NCStanfordEntityParser.java b/nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/entity/parser/stanford/NCStanfordEntityParser.java
index d29ebc3..d5c5e4a 100644
--- a/nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/entity/parser/stanford/NCStanfordEntityParser.java
+++ b/nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/entity/parser/stanford/NCStanfordEntityParser.java
@@ -17,18 +17,38 @@
package org.apache.nlpcraft.nlp.entity.parser.stanford;
+import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import org.apache.nlpcraft.*;
import org.apache.nlpcraft.nlp.entity.parser.stanford.impl.*;
import java.util.List;
+import java.util.Objects;
+import java.util.Set;
/**
- *
+ * Generates entities with:
+ * - ID `stanford:{name}`, where 'name' is the element name from the configured StanfordCoreNLP instance, taken from the supported set.
+ * - property `stanford:{name}:confidence`, where confidence is a double value between 0 and 1. Optional.
+ * - property `stanford:{name}:nne`, where nne is the normalized value. Optional.
*/
public class NCStanfordEntityParser implements NCEntityParser {
private final NCStanfordEntityParserImpl impl;
- public NCStanfordEntityParser(NCStanfordEntityParserImpl impl) {
- this.impl = impl;
+ /**
+ * Requires a configured StanfordCoreNLP instance.
+ * Example:
+ * Properties props = new Properties()
+ * props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner")
+ * StanfordCoreNLP stanford = new StanfordCoreNLP(props)
+ * See https://stanfordnlp.github.io/CoreNLP/ner.html#java-api-example for more details.
+ *
+ * @param stanford Configured StanfordCoreNLP instance.
+ * @param supported Set of supported Stanford NER entity type names.
+ */
+ public NCStanfordEntityParser(StanfordCoreNLP stanford, Set<String> supported) {
+ Objects.requireNonNull(stanford, "Stanford instance cannot be null.");
+ Objects.requireNonNull(supported, "Supported elements set cannot be null.");
+
+ this.impl = new NCStanfordEntityParserImpl(stanford, supported);
}
@Override
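
A minimal wiring sketch for the new constructor, based on the javadoc above; the annotator list and the supported-elements set are illustrative (they mirror NCStanfordEntityParserSpec below):

    import edu.stanford.nlp.pipeline.StanfordCoreNLP
    import org.apache.nlpcraft.nlp.entity.parser.stanford.NCStanfordEntityParser
    import java.util.Properties
    import scala.jdk.CollectionConverters.*

    val props = new Properties()
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner")
    val stanford = new StanfordCoreNLP(props)

    // Only entities of these Stanford NER types are surfaced; each gets
    // ID `stanford:{name}`, e.g. `stanford:city`.
    val parser = new NCStanfordEntityParser(stanford, Set("city", "date", "number", "email").asJava)
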
diff --git a/nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/entity/parser/stanford/impl/NCStanfordEntityParserImpl.scala b/nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/entity/parser/stanford/impl/NCStanfordEntityParserImpl.scala
index 7d7b388..a7d02a8 100644
--- a/nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/entity/parser/stanford/impl/NCStanfordEntityParserImpl.scala
+++ b/nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/entity/parser/stanford/impl/NCStanfordEntityParserImpl.scala
@@ -17,67 +17,72 @@
package org.apache.nlpcraft.nlp.entity.parser.stanford.impl
+import edu.stanford.nlp.ling.CoreAnnotations.NormalizedNamedEntityTagAnnotation
import edu.stanford.nlp.pipeline.{CoreDocument, StanfordCoreNLP}
import org.apache.nlpcraft.*
import java.util
+import java.util.List as JList
+import java.util.Set as JSet
import java.util.Properties
+import java.util.stream.Collectors
import scala.jdk.CollectionConverters.*
-class NCStanfordEntityParserImpl extends NCEntityParser:
- @volatile private var stanford: StanfordCoreNLP = _
-
- // https://stanfordnlp.github.io/CoreNLP/ner.html#java-api-example
- override def start(cfg: NCModelConfig): Unit =
- val p = new Properties()
-
- p.setProperty("annotators", "nctokenize, ssplit, pos, lemma, ner")
-
- // Created with hardcoded properties just for minimize configuration issues.
- stanford = new StanfordCoreNLP(p)
-
- override def stop(): Unit = stanford = null
- override def parse(req: NCRequest, cfg: NCModelConfig, toks: util.List[NCToken]): util.List[NCEntity] =
- null
-// val doc = new CoreDocument("a")
-//
-// stanford.annotate(req.getText)
-//
-// doc.entityMentions().asScala.
-// filter(e => ebiTokens.contains(e.entityType().toLowerCase)).
-// foreach(e => {
-// val offsets = e.charOffsets()
-//
-// val t1 = toks.find(_.startCharIndex == offsets.first)
-// val t2 = toks.find(_.endCharIndex == offsets.second)
-//
-// if (t1.nonEmpty && t2.nonEmpty) {
-// val buf = collection.mutable.ArrayBuffer.empty[(String, Any)]
-//
-// val nne = e.coreMap().get(classOf[NormalizedNamedEntityTagAnnotation])
-//
-// if (nne != null)
-// buf += "nne" -> nne
-//
-// val conf = e.entityTypeConfidences()
-//
-// // Key ignored because it can be category with higher level (`location` for type `country`)
-// if (conf.size() == 1)
-// buf += "confidence" -> conf.asScala.head._2
-//
-// val typ = e.entityType().toLowerCase
-//
-// val i1 = t1.get.startCharIndex
-// val i2 = t2.get.endCharIndex
-// val toks = ns.filter(t => t.startCharIndex >= i1 && t.endCharIndex <= i2)
-//
-// val note = NCNlpSentenceNote(
-// toks.map(_.index),
-// s"stanford:$typ",
-// buf.toSeq: _*
-// )
-//
-// toks.foreach(_.add(note))
-// }
-// })
-//
+/**
+ *
+ * @param stanford Configured StanfordCoreNLP instance.
+ * @param supported Set of supported Stanford NER entity type names.
+ */
+class NCStanfordEntityParserImpl(stanford: StanfordCoreNLP, supported: JSet[String]) extends NCEntityParser:
+ override def parse(req: NCRequest, cfg: NCModelConfig, toksList: JList[NCToken]): JList[NCEntity] =
+ val toks = toksList.asScala.toSeq
+
+ // Important: don't use the request text, it can contain extra spaces.
+ val doc = new CoreDocument(toks.map(_.getText).mkString(" "))
+
+ stanford.annotate(doc)
+
+ extension (t: NCToken)
+ def startCharIndex = toks.take(t.getIndex).map(_.getText.length).sum + t.getIndex
+ def endCharIndex = toks.take(t.getIndex + 1).map(_.getText.length).sum + t.getIndex
+
+ val res = new util.ArrayList[NCEntity]()
+
+ for (e <- doc.entityMentions().asScala)
+ val typ = e.entityType().toLowerCase
+
+ if supported.contains(typ) then
+ val offsets = e.charOffsets()
+
+ val t1 = toks.find(_.startCharIndex == offsets.first)
+ val t2 = toks.find(_.endCharIndex == offsets.second)
+
+ if t1.nonEmpty && t2.nonEmpty then
+ val props = collection.mutable.ArrayBuffer.empty[(String, Any)]
+
+ val nne = e.coreMap().get(classOf[NormalizedNamedEntityTagAnnotation])
+
+ if nne != null then props += "nne" -> nne
+
+ val conf = e.entityTypeConfidences()
+
+ // Key is ignored because it can be a higher-level category (`location` for type `country`).
+ if conf.size() == 1 then props += "confidence" -> conf.asScala.head._2
+
+ val i1 = t1.get.startCharIndex
+ val i2 = t2.get.endCharIndex
+ val entToks = toks.filter(t => t.startCharIndex >= i1 && t.endCharIndex <= i2)
+
+ if entToks.nonEmpty then
+ res.add(
+ new NCPropertyMapAdapter with NCEntity:
+ props.foreach { (k, v) => put(s"stanford:$typ:$k", v) }
+
+ override def getTokens: JList[NCToken] = entToks.asJava
+ override def getRequestId: String = req.getRequestId
+ override def getId: String = s"stanford:$typ"
+ )
+
+ res
\ No newline at end of file
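
A quick check of the offset arithmetic in the extension methods above: token texts are joined with single spaces, so the token at index i starts after the lengths of all preceding tokens plus i separator characters. A small illustration (the token texts are hypothetical stand-ins for toks.map(_.getText)):

    val texts = Seq("Los", "Angeles", "today")
    def start(i: Int) = texts.take(i).map(_.length).sum + i     // "Angeles" -> 3 + 1 = 4
    def end(i: Int) = texts.take(i + 1).map(_.length).sum + i   // "Angeles" -> 10 + 1 = 11
    // CoreNLP charOffsets() are [begin, end), matching "Los Angeles today".
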
diff --git a/nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/token/parser/stanford/NCStanfordTokenParser.java b/nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/token/parser/stanford/NCStanfordTokenParser.java
index 7ec5386..8dfd6c8 100644
--- a/nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/token/parser/stanford/NCStanfordTokenParser.java
+++ b/nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/token/parser/stanford/NCStanfordTokenParser.java
@@ -17,6 +17,7 @@
package org.apache.nlpcraft.nlp.token.parser.stanford;
+import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import org.apache.nlpcraft.*;
import org.apache.nlpcraft.nlp.token.parser.stanford.impl.*;
import java.util.List;
+import java.util.Objects;
@@ -39,15 +40,20 @@ public class NCStanfordTokenParser implements NCTokenParser {
}
/**
+ * Requires a configured StanfordCoreNLP instance.
+ * Example:
+ * Properties props = new Properties()
+ * props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner")
+ * StanfordCoreNLP stanford = new StanfordCoreNLP(props)
+ * See https://stanfordnlp.github.io/CoreNLP/ner.html#java-api-example for more details.
*
- * @param tokMdlSrc Local filesystem path, resources file path or URL for OpenNLP tokenizer model.
- * @param posMdlSrc Local filesystem path, resources file path or URL for OpenNLP tagger model.
- * @param lemmaDicSrc Local filesystem path, resources file path or URL for OpenNLP lemmatizer dictionary.
- * @throws NCException
+ * @param stanford Configured StanfordCoreNLP instance.
*/
- public NCStanfordTokenParser() {
+ public NCStanfordTokenParser(StanfordCoreNLP stanford) {
+ Objects.requireNonNull(stanford, "Stanford instance cannot be null.");
+
try {
- impl = new NCStanfordNlpImpl();
+ impl = new NCStanfordNlpImpl(stanford);
}
catch (Exception e) {
throw new NCException("Failed to create OpenNLP token parser.", e);
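
Like the entity parser, the token parser now takes a pre-configured StanfordCoreNLP instance, so a single instance can be shared between both parsers (as NCStanfordTestConfig below does). A minimal usage sketch, assuming the annotator set from the javadoc; the input text is illustrative:

    import edu.stanford.nlp.pipeline.StanfordCoreNLP
    import org.apache.nlpcraft.nlp.token.parser.stanford.NCStanfordTokenParser
    import java.util.Properties

    val props = new Properties()
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner")

    val tokParser = new NCStanfordTokenParser(new StanfordCoreNLP(props))
    tokParser.start(null)                          // Lifecycle start, as in the spec below.

    val toks = tokParser.tokenize("Los Angeles today")
    val poses = tokParser.getPoses(toks)           // One POS tag per token.
    val lemmas = tokParser.getLemmas(toks, poses)  // One lemma per token.
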
diff --git a/nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/token/parser/stanford/impl/NCStanfordNlpImpl.scala b/nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/token/parser/stanford/impl/NCStanfordNlpImpl.scala
index 95d3b2d..e427b8d 100644
--- a/nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/token/parser/stanford/impl/NCStanfordNlpImpl.scala
+++ b/nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/token/parser/stanford/impl/NCStanfordNlpImpl.scala
@@ -17,66 +17,40 @@
package org.apache.nlpcraft.nlp.token.parser.stanford.impl
+import edu.stanford.nlp.ling.*
+import edu.stanford.nlp.ling.CoreAnnotations.*
+import edu.stanford.nlp.pipeline.*
+import edu.stanford.nlp.process.PTBTokenizer
+import edu.stanford.nlp.util.*
import org.apache.nlpcraft.*
import java.io.StringReader
import java.util
+import java.util.stream.Collectors
import java.util.{Properties, List as JList}
-import edu.stanford.nlp.ling.CoreAnnotations.*
-import edu.stanford.nlp.ling.*
-import edu.stanford.nlp.util.*
-import edu.stanford.nlp.pipeline.{CoreDocument, StanfordCoreNLP}
-import edu.stanford.nlp.process.PTBTokenizer
import scala.jdk.CollectionConverters.*
-import java.util.stream.Collectors
-
-class NCStanfordNlpImpl extends NCTokenParser:
- @volatile private var stanford: StanfordCoreNLP = _
-
- override def start(cfg: NCModelConfig): Unit =
- val p = new Properties()
-
- p.setProperty("annotators", "nctokenize, ssplit, pos, lemma, ner")
-
- // Created with hardcoded properties just for minimize configuration issues.
- stanford = new StanfordCoreNLP(p)
-
- override def stop(): Unit = stanford = null
-
+class NCStanfordNlpImpl(stanford: StanfordCoreNLP) extends NCTokenParser:
override def tokenize(text: String): JList[String] =
PTBTokenizer.newPTBTokenizer(new StringReader(text)).tokenize().stream().map(p => p.word()).collect(Collectors.toList)
override def getStem(s: String): String = null // TODO:
- // TODO: getPoses and getLemmas are equal.
- override def getPoses(toks: JList[String]): JList[String] =
- val doc = new CoreDocument("a")
+ private def get(toks: JList[String], getData: CoreLabel => String) =
+ val doc = new CoreDocument(toks.stream().collect(Collectors.joining(" ")))
stanford.annotate(doc)
val a: JList[CoreMap] = doc.annotation().get(classOf[SentencesAnnotation])
- if (a == null)
+ if a == null then
throw new NCException("Sentence annotation not found.")
a.stream().flatMap(p => {
val value: JList[CoreLabel] = p.asInstanceOf[ArrayCoreMap].get(classOf[TokensAnnotation])
- value.stream().map(_.tag())
+ value.stream().map(p => getData(p))
}).collect(Collectors.toList)
- override def getLemmas(toks: JList[String], poses: JList[String]): JList[String] =
- val doc = new CoreDocument("a")
-
- stanford.annotate(doc)
-
- val a: JList[CoreMap] = doc.annotation().get(classOf[SentencesAnnotation])
-
- if (a == null)
- throw new NCException("Sentence annotation not found.")
-
- a.stream().flatMap(p => {
- val value: JList[CoreLabel] = p.asInstanceOf[ArrayCoreMap].get(classOf[TokensAnnotation])
-
- value.stream().map(_.lemma())
- }).collect(Collectors.toList)
+ // TODO: getPoses and getLemmas are equal.
+ override def getPoses(toks: JList[String]): JList[String] = get(toks, _.tag())
+ override def getLemmas(toks: JList[String], poses: JList[String]): JList[String] = get(toks, _.lemma())
\ No newline at end of file
diff --git a/nlpcraft-stanford/src/test/java/org/apache/nlpcraft/nlp/entity/parser/stanford/NCStanfordEntityParserSpec.scala b/nlpcraft-stanford/src/test/java/org/apache/nlpcraft/nlp/entity/parser/stanford/NCStanfordEntityParserSpec.scala
new file mode 100644
index 0000000..464587c
--- /dev/null
+++ b/nlpcraft-stanford/src/test/java/org/apache/nlpcraft/nlp/entity/parser/stanford/NCStanfordEntityParserSpec.scala
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.nlp.entity.parser.stanford
+
+import org.apache.nlpcraft.nlp.entity.parser.stanford.NCStanfordEntityParser
+import org.apache.nlpcraft.nlp.token.parser.stanford.NCStanfordTokenParser
+import org.apache.nlpcraft.nlp.util.NCTestToken
+import org.apache.nlpcraft.nlp.utils.NCStanfordTestConfig
+import org.junit.jupiter.api.Test
+
+import scala.jdk.CollectionConverters.*
+import org.apache.nlpcraft.nlp.util.NCTestUtils
+
+class NCStanfordEntityParserSpec:
+ private val parser = NCStanfordTokenParser(NCStanfordTestConfig.STANFORD)
+
+ parser.start(null)
+
+ @Test
+ def test(): Unit =
+ val p = NCStanfordEntityParser(
+ NCStanfordTestConfig.STANFORD,
+ Set("city", "date", "number", "email").asJava
+ )
+
+ p.start(null)
+
+ val txt = "Los Angeles today, 23 and [email protected]"
+
+ val res = p.parse(null, null, NCTestUtils.mkTokens(parser, txt))
+
+ NCTestUtils.printEntities(txt, res.asScala.toSeq)
+
+ require(res.size() == 4)
+
diff --git a/nlpcraft-stanford/src/test/java/org/apache/nlpcraft/nlp/token/parser/stanford/NCStanfordTokenParserSpec.scala b/nlpcraft-stanford/src/test/java/org/apache/nlpcraft/nlp/token/parser/stanford/NCStanfordTokenParserSpec.scala
new file mode 100644
index 0000000..185d3f7
--- /dev/null
+++ b/nlpcraft-stanford/src/test/java/org/apache/nlpcraft/nlp/token/parser/stanford/NCStanfordTokenParserSpec.scala
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.nlp.token.parser.stanford
+
+import java.util.Properties
+import org.junit.jupiter.api.*
+import edu.stanford.nlp.pipeline.StanfordCoreNLP
+import org.apache.nlpcraft.nlp.utils.NCStanfordTestConfig
+
+import scala.jdk.CollectionConverters.*
+
+/**
+ *
+ */
+class NCStanfordTokenParserSpec:
+ @Test
+ def test(): Unit =
+ val parser = NCStanfordTokenParser(NCStanfordTestConfig.STANFORD)
+
+ parser.start(null)
+
+ // 1. Tokenization.
+ val toks = parser.tokenize("I had a lunch with brand name 'AAA'")
+
+ println(s"Tokens: ${toks.asScala.mkString("|")}")
+
+ require(toks.size() > 1)
+
+ // 2. POS tagging.
+ val poses = parser.getPoses(toks)
+
+ println(s"Poses: ${poses.asScala.mkString("|")}")
+
+ require(poses.size() == toks.size())
+ require(poses.asScala.toSeq.distinct.size > 1)
+
+ // 3. Lemma.
+ val lemmas = parser.getLemmas(toks, poses)
+
+ println(s"Lemmas: ${lemmas.asScala.mkString("|")}")
+
+ require(lemmas.size() == toks.size())
+ require(lemmas.asScala.zip(toks.asScala).exists { _ != _ })
+
+
+
diff --git a/nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/entity/parser/stanford/NCStanfordEntityParser.java b/nlpcraft-stanford/src/test/java/org/apache/nlpcraft/nlp/utils/NCStanfordTestConfig.scala
similarity index 53%
copy from nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/entity/parser/stanford/NCStanfordEntityParser.java
copy to nlpcraft-stanford/src/test/java/org/apache/nlpcraft/nlp/utils/NCStanfordTestConfig.scala
index d29ebc3..5af17ea 100644
--- a/nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/entity/parser/stanford/NCStanfordEntityParser.java
+++ b/nlpcraft-stanford/src/test/java/org/apache/nlpcraft/nlp/utils/NCStanfordTestConfig.scala
@@ -15,34 +15,19 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.nlp.entity.parser.stanford;
+package org.apache.nlpcraft.nlp.utils
-import org.apache.nlpcraft.*;
-import org.apache.nlpcraft.nlp.entity.parser.stanford.impl.*;
-import java.util.List;
+import edu.stanford.nlp.pipeline.StanfordCoreNLP
-/**
- *
- */
-public class NCStanfordEntityParser implements NCEntityParser {
- private final NCStanfordEntityParserImpl impl;
+import java.util.Properties
- public NCStanfordEntityParser(NCStanfordEntityParserImpl impl) {
- this.impl = impl;
- }
-
- @Override
- public List<NCEntity> parse(NCRequest req, NCModelConfig cfg, List<NCToken> toks) {
- return impl.parse(req, cfg, toks);
- }
+/**
+ *
+ */
+object NCStanfordTestConfig {
+ private val props = new Properties()
- @Override
- public void start(NCModelConfig cfg) {
- impl.start(cfg);
- }
+ props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner")
- @Override
- public void stop() {
- impl.stop();
- }
+ val STANFORD = new StanfordCoreNLP(props)
}