This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/master by this push:
new 00baf4f POS and Lemmatizer optional configuration for OpenNlp token parser.
00baf4f is described below
commit 00baf4f64e81d05e59eeb5dc30be6f9ebe4360a4
Author: Sergey Kamov <[email protected]>
AuthorDate: Fri Feb 25 15:18:02 2022 +0300
POS and Lemmatizer optional configuration for OpenNlp token parser.
---
.../impl/NCStanfordNLPTokenParserImpl.scala | 12 ++++---
.../token/parser/opennlp/NCOpenNLPTokenParser.java | 2 --
.../opennlp/impl/NCOpenNLPTokenParserImpl.scala | 18 +++++-----
.../parser/opennlp/NCOpenNLPTokenParserSpec.scala | 40 ++++++++++++++++++++++
4 files changed, 57 insertions(+), 15 deletions(-)
diff --git
a/nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/token/parser/stanford/impl/NCStanfordNLPTokenParserImpl.scala
b/nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/token/parser/stanford/impl/NCStanfordNLPTokenParserImpl.scala
index ba24664..15152e8 100644
---
a/nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/token/parser/stanford/impl/NCStanfordNLPTokenParserImpl.scala
+++
b/nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/token/parser/stanford/impl/NCStanfordNLPTokenParserImpl.scala
@@ -37,6 +37,8 @@ import scala.jdk.CollectionConverters.*
class NCStanfordNLPTokenParserImpl(stanford: StanfordCoreNLP) extends
NCTokenParser:
require(stanford != null)
+ private def nvl(v: String, dflt : => String): String = if v != null then v else dflt
+
override def tokenize(text: String): JList[NCToken] =
val doc = new CoreDocument(text)
stanford.annotate(doc)
@@ -45,10 +47,12 @@ class NCStanfordNLPTokenParserImpl(stanford:
StanfordCoreNLP) extends NCTokenPar
val toks =
ann.asScala.flatMap(_.asInstanceOf[ArrayCoreMap].get(classOf[TokensAnnotation]).asScala).
zipWithIndex.map { (t, idx) =>
- new NCPropertyMapAdapter with NCToken :
- override val getText: String = t.originalText()
- override val getLemma: String = t.lemma()
- override val getPos: String = t.tag()
+ val txt = t.originalText()
+
+ new NCPropertyMapAdapter with NCToken:
+ override val getText: String = txt
+ override val getLemma: String = nvl(t.lemma(), txt)
+ override val getPos: String = nvl(t.tag(), "")
override val getIndex: Int = idx
override val getStartCharIndex: Int = t.beginPosition()
override val getEndCharIndex: Int = t.endPosition()
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/NCOpenNLPTokenParser.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/NCOpenNLPTokenParser.java
index a9cdbf2..629c8aa 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/NCOpenNLPTokenParser.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/NCOpenNLPTokenParser.java
@@ -45,8 +45,6 @@ public class NCOpenNLPTokenParser implements NCTokenParser {
*/
public NCOpenNLPTokenParser(String tokMdlSrc, String posMdlSrc, String lemmaDicSrc) {
Objects.requireNonNull(tokMdlSrc, "Tokenizer model path cannot be null.");
- Objects.requireNonNull(posMdlSrc, "POS model path cannot be null.");
- Objects.requireNonNull(lemmaDicSrc, "Lemmatizer model path cannot be null.");
impl = new NCOpenNLPTokenParserImpl(tokMdlSrc, posMdlSrc, lemmaDicSrc);
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/NCOpenNLPTokenParserImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/NCOpenNLPTokenParserImpl.scala
index c1074fb..b52d32e 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/NCOpenNLPTokenParserImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/NCOpenNLPTokenParserImpl.scala
@@ -40,8 +40,6 @@ import scala.jdk.CollectionConverters.*
*/
class NCOpenNLPTokenParserImpl(tokMdl: String, posMdlSrc: String, lemmaDicSrc: String) extends NCTokenParser with LazyLogging:
require(tokMdl != null)
- require(posMdlSrc != null)
- require(lemmaDicSrc != null)
private var tagger: POSTaggerME = _
private var lemmatizer: DictionaryLemmatizer = _
@@ -52,12 +50,14 @@ class NCOpenNLPTokenParserImpl(tokMdl: String, posMdlSrc: String, lemmaDicSrc:
private def init(): Unit =
NCUtils.execPar(
() => {
- tagger = new POSTaggerME(new POSModel(NCUtils.getStream(posMdlSrc)))
- logger.trace(s"Loaded resource: $posMdlSrc")
+ if posMdlSrc != null then
tagger = new POSTaggerME(new POSModel(NCUtils.getStream(posMdlSrc)))
+ logger.trace(s"Loaded resource: $posMdlSrc")
},
() => {
- lemmatizer = new DictionaryLemmatizer(NCUtils.getStream(lemmaDicSrc))
- logger.trace(s"Loaded resource: $lemmaDicSrc")
+ if lemmaDicSrc != null then
+ lemmatizer = new DictionaryLemmatizer(NCUtils.getStream(lemmaDicSrc))
+ logger.trace(s"Loaded resource: $lemmaDicSrc")
},
() => {
tokenizer = new TokenizerME(new TokenizerModel(NCUtils.getStream(tokMdl)))
@@ -71,8 +71,8 @@ class NCOpenNLPTokenParserImpl(tokMdl: String, posMdlSrc: String, lemmaDicSrc:
this.synchronized {
val hs = tokenizer.tokenizePos(text).map(p => Holder(p.getCoveredText(text).toString, p.getStart, p.getEnd))
val toks = hs.map(_.text)
- val poses = tagger.tag(toks)
- var lemmas = lemmatizer.lemmatize(toks, poses)
+ val poses = if tagger != null then tagger.tag(toks) else toks.map(_ => "")
+ var lemmas = if lemmatizer != null then lemmatizer.lemmatize(toks, poses) else toks
require(toks.length == poses.length && toks.length == lemmas.length)
@@ -83,7 +83,7 @@ class NCOpenNLPTokenParserImpl(tokMdl: String, posMdlSrc: String, lemmaDicSrc:
case ((lemma, pos), i) => Option.when(lemma == "O" && pos == "NN")(i)
}
- if suspIdxs.nonEmpty then
+ if suspIdxs.nonEmpty && lemmatizer != null then
val fixes: Map[Int, String] = lemmatizer.
lemmatize(suspIdxs.map(i => toks(i)), suspIdxs.map(_ => "NNN")).
zipWithIndex.
diff --git
a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/NCOpenNLPTokenParserSpec.scala
b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/NCOpenNLPTokenParserSpec.scala
index 12b52d3..009097c 100644
---
a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/NCOpenNLPTokenParserSpec.scala
+++
b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/NCOpenNLPTokenParserSpec.scala
@@ -96,3 +96,43 @@ class NCOpenNLPTokenParserSpec:
"< < [ a ] > >",
toks => require(!isStopWord(toks.find(_.getText == "a").get))
)
+
+ @Test
+ def testNullable(): Unit =
+ val txt = "parents had files"
+
+ // 1. Nullable.
+ var parser = new NCOpenNLPTokenParser(
+ "opennlp/en-token.bin",
+ null,
+ null
+ )
+
+ var tbl = NCAsciiTable("Text", "Lemma", "POS")
+
+ for (t <- parser.tokenize(txt).asScala)
+ tbl += (t.getText, t.getLemma, t.getPos)
+
+ require(t.getPos.isEmpty)
+ require(t.getText == t.getLemma)
+
+ println(tbl.toString)
+
+ // 2. Not nullable.
+ parser = new NCOpenNLPTokenParser(
+ "opennlp/en-token.bin",
+ "opennlp/en-pos-maxent.bin",
+ "opennlp/en-lemmatizer.dict"
+ )
+
+ tbl = NCAsciiTable("Text", "Lemma", "POS")
+
+ for (t <- parser.tokenize(txt).asScala)
+ tbl += (t.getText, t.getLemma, t.getPos)
+
+ require(t.getPos.nonEmpty)
+ require(t.getText != t.getLemma)
+
+ println(tbl.toString)
+
+