This is an automated email from the ASF dual-hosted git repository.
aradzinski pushed a commit to branch master_test
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/master_test by this push:
new f6c60b5 WIP
f6c60b5 is described below
commit f6c60b54a40022dc2447f33829e69d8f17f1750a
Author: Aaron Radzinski <[email protected]>
AuthorDate: Fri Dec 17 11:05:43 2021 -0800
WIP
---
.../token/parser/opennlp/NCOpenNlpTokenParser.java | 25 ++++++++++++++---
.../token/parser/opennlp/impl/NCOpenNlpImpl.scala | 31 +++++++++++++++-------
2 files changed, 43 insertions(+), 13 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java
index 3789280..a590d72 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java
@@ -19,7 +19,10 @@ package org.apache.nlpcraft.internal.nlp.token.parser.opennlp;
import org.apache.nlpcraft.*;
import org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl.*;
+
+import java.io.BufferedInputStream;
import java.io.File;
+import java.io.FileInputStream;
import java.io.InputStream;
import java.util.List;
@@ -42,10 +45,19 @@ public class NCOpenNlpTokenParser implements NCTokenParser {
* @param tokMdl
* @param posMdl
* @param lemmaDic
+ * @throws NCException
*/
public NCOpenNlpTokenParser(File tokMdl, File posMdl, File lemmaDic) {
- // TODO
- impl = null;
+ try {
+ impl = new NCOpenNlpImpl(
+ new BufferedInputStream(new FileInputStream(tokMdl)),
+ new BufferedInputStream(new FileInputStream(posMdl)),
+ new BufferedInputStream(new FileInputStream(lemmaDic))
+ );
+ }
+ catch (Exception e) {
+ throw new NCException("Failed to create OpenNLP token parser.", e);
+ }
}
/**
@@ -53,10 +65,15 @@ public class NCOpenNlpTokenParser implements NCTokenParser {
* @param tokMdlSrc Local filesystem path, resources file path or URL for OpenNLP tokenizer model.
* @param posMdlSrc Local filesystem path, resources file path or URL for OpenNLP tagger model.
* @param lemmaDicSrc Local filesystem path, resources file path or URL for OpenNLP lemmatizer dictionary.
+ * @throws NCException
*/
public NCOpenNlpTokenParser(String tokMdlSrc, String posMdlSrc, String lemmaDicSrc) {
- // TODO
- impl = null;
+ try {
+ impl = NCOpenNlpImpl.apply(tokMdlSrc, posMdlSrc, lemmaDicSrc);
+ }
+ catch (Exception e) {
+ throw new NCException("Failed to create OpenNLP token parser.", e);
+ }
}
@Override
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala
index 03d1c32..920efde 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala
@@ -28,41 +28,54 @@ import opennlp.tools.tokenize.*
import scala.jdk.CollectionConverters.*
+object NCOpenNlpImpl:
+ /**
+ *
+ * @param tokMdlSrc Local filesystem path, resources file path or URL for OpenNLP tokenizer model.
+ * @param posMdlSrc Local filesystem path, resources file path or URL for OpenNLP tagger model.
+ * @param lemmaDicSrc Local filesystem path, resources file path or URL for OpenNLP lemmatizer dictionary.
+ * @return
+ */
+ def apply(tokMdlSrc: String, posMdlSrc: String, lemmaDicSrc: String): NCOpenNlpImpl = ???
+
+
/**
*
- * @param tokModelIn
- * @param posModelIn
+ * @param tokMdlIn
+ * @param posMdlIn
* @param lemmaDicIn
*/
-class NCOpenNlpImpl(tokModelIn: InputStream, posModelIn: InputStream, lemmaDicIn: InputStream):
- private val tokenizer = new TokenizerME(new TokenizerModel(tokModelIn))
- private val tagger = new POSTaggerME(new POSModel(posModelIn))
+class NCOpenNlpImpl(tokMdlIn: InputStream, posMdlIn: InputStream, lemmaDicIn: InputStream):
+ private val tokenizer = new TokenizerME(new TokenizerModel(tokMdlIn))
+ private val tagger = new POSTaggerME(new POSModel(posMdlIn))
private val lemmatizer = new DictionaryLemmatizer(lemmaDicIn)
private val stemmer = new PorterStemmer
+ private var addStopWords = List.empty[String]
+ private var exclStopWords = List.empty[String]
/**
*
* @return
*/
- def getAdditionalStopWords: JList[String] = ???
+ def getAdditionalStopWords: JList[String] = addStopWords.asJava
/**
*
* @return
*/
- def getExcludedStopWords: JList[String] = ???
+ def getExcludedStopWords: JList[String] = exclStopWords.asJava
/**
*
* @param addStopWords
*/
- def setAdditionalStopWords(addStopWords: JList[String]): Unit = ???
+ def setAdditionalStopWords(addStopWords: JList[String]): Unit = this.addStopWords = addStopWords.asScala.toList
/**
*
* @param exclStopWords
*/
- def setExcludedStopWords(exclStopWords: JList[String]): Unit = ???
+ def setExcludedStopWords(exclStopWords: JList[String]): Unit = this.exclStopWords = exclStopWords.asScala.toList
/**
*