This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch master_test
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/master_test by this push:
new 1009caf WIP.
1009caf is described below
commit 1009caf1ad635bb86ec8abc2066d35bb418e9359
Author: Sergey Kamov <[email protected]>
AuthorDate: Tue Dec 14 16:26:45 2021 +0300
WIP.
---
.../token/parser/opennlp/NCOpenNlpTokenParser.java | 7 +++-
.../parser/opennlp/NCOpenNlpTokenParserImpl.scala | 39 ++++++++--------------
2 files changed, 20 insertions(+), 26 deletions(-)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java
index f0bae1f..fc741e5 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java
@@ -25,8 +25,13 @@ import java.io.File;
import java.io.InputStream;
import java.util.List;
+
/**
- *
+ * TODO:
+ * Models can be downloaded from the following resources:
+ * - tokenizer: http://opennlp.sourceforge.net/models-1.5/en-token.bin
+ * - tagger: http://opennlp.sourceforge.net/models-1.5/en-pos-maxent.bin
+ * - lemmatizer:
https://raw.githubusercontent.com/richardwilly98/elasticsearch-opennlp-auto-tagging/master/src/main/resources/models/en-lemmatizer.dict
*/
public class NCOpenNlpTokenParser implements NCTokenParser {
private final NCOpenNlpTokenParserImpl delegate;
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserImpl.scala
index d04fb43..7d19ed1 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserImpl.scala
@@ -23,7 +23,7 @@ import opennlp.tools.stemmer.PorterStemmer
import opennlp.tools.tokenize.{Tokenizer, TokenizerME, TokenizerModel}
import org.apache.nlpcraft.*
-import java.io.{File, FileNotFoundException, IOException, InputStream,
BufferedInputStream as BIS, FileInputStream as FIS}
+import java.io.{File, FileNotFoundException, IOException, InputStream as IS,
BufferedInputStream as BIS, FileInputStream as FIS}
import java.net.URL
import java.util
import java.util.{List, Objects}
@@ -112,11 +112,7 @@ private[opennlp] object NCOpenNlpTokenParserImpl {
* @param lemmatizer
*/
@throws[NullPointerException]
- def apply(
- tokenizer: InputStream,
- tagger: InputStream,
- lemmatizer: InputStream
- ): NCOpenNlpTokenParserImpl = {
+ def apply(tokenizer: IS, tagger: IS, lemmatizer: IS):
NCOpenNlpTokenParserImpl = {
verify(tokenizer, tagger, lemmatizer)
new NCOpenNlpTokenParserImpl(tokenizer, tagger, lemmatizer)
@@ -146,11 +142,7 @@ private[opennlp] object NCOpenNlpTokenParserImpl {
* @param taggerStream
* @param lemmatizerStream
*/
-private[opennlp] class NCOpenNlpTokenParserImpl(
- tokenizerStream: InputStream,
- taggerStream: InputStream,
- lemmatizerStream: InputStream,
-) extends NCTokenParser {
+private[opennlp] class NCOpenNlpTokenParserImpl(tokenizerStream: IS,
taggerStream: IS, lemmatizerStream: IS) extends NCTokenParser {
private val stemmer = new PorterStemmer
var extraStopWords: util.List[String] = _
@@ -161,13 +153,11 @@ private[opennlp] class NCOpenNlpTokenParserImpl(
@volatile private var lemmatizer: DictionaryLemmatizer = _
override def parse(req: NCRequest): util.List[NCToken] = {
- require(tokenizer != null)
+ case class Holder(origin: String, normalized: String, start: Int, end:
Int, lenght: Int)
+ abstract class NCOpenNlpToken extends NCParameterizedAdapter with
NCToken
val sen = req.getNormalizedText
- case class Holder(origin: String, normalized: String, start: Int, end:
Int, lenght: Int)
- abstract class NCTokenImpl extends NCParameterizedAdapter with NCToken
-
val hs =
tokenizer.tokenizePos(sen).map(
t => {
@@ -194,27 +184,26 @@ private[opennlp] class NCOpenNlpTokenParserImpl(
// ...
// time-ball NN time-ball
// ...
- val suspIdxs: Seq[Int] =
- lemmas.
- zip(poses).
- zipWithIndex.flatMap {
- // "0" is flag that lemma cannot be obtained for some reasons.
- case ((lemma, pos), i) => if (lemma == "O" && pos == "NN") Some(i)
- else None
- }
+ val suspIdxs =
+ lemmas.
+ zip(poses).
+ zipWithIndex.flatMap {
+          // "O" is a flag indicating that the lemma could not be obtained for some reason.
+          case ((lemma, pos), i) => if (lemma == "O" && pos == "NN")
Some(i) else None
+ }
if (suspIdxs.nonEmpty) {
val fixes: Map[Int, String] =
lemmatizer.
lemmatize(suspIdxs.map(i => words(i)).toArray,
suspIdxs.map(_ => "NNN").toArray).
zipWithIndex.
- flatMap { case (lemma, i) => if (lemma != "0")
Some(suspIdxs(i) -> lemma) else None}.toMap
+ flatMap { case (lemma, i) => if (lemma != "0")
Some(suspIdxs(i) -> lemma) else None }.toMap
lemmas = lemmas.zipWithIndex.map { case (lemma, idx) =>
fixes.getOrElse(idx, lemma) }
}
hs.zip(poses).zip(lemmas).toIndexedSeq.map { case ((h, pos), lemma) =>
- new NCTokenImpl() {
+ new NCOpenNlpToken() {
override def getOriginalText: String = h.origin
override def getNormalizedText: String = h.normalized
override def getLemma: String = lemma