This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-472
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-472 by this push:
new 4a9df81 WIP.
4a9df81 is described below
commit 4a9df814779ffb89bdadf8254acb31a55ff4e6ac
Author: Sergey Kamov <[email protected]>
AuthorDate: Thu Dec 30 22:36:24 2021 +0300
WIP.
---
.../parser/opennlp/NCOpenNlpEntityParser.java | 12 +-------
.../opennlp/impl/NCOpenNlpEntityParserImpl.scala | 7 ++---
.../parser/semantic/NCSemanticEntityParser.java | 13 --------
.../semantic/en/NCEnSemanticEntityParser.java | 9 ------
.../semantic/impl/NCSemanticEntityParserImpl.scala | 24 +++------------
.../enricher/en/NCEnSwearWordsTokenEnricher.java | 15 +--------
.../enricher/en/impl/NCEnSwearWordsImpl.scala | 27 ++--------------
.../parser/opennlp/en/NCEnOpenNlpTokenParser.java | 21 +------------
.../parser/opennlp/en/impl/NCEnOpenNlpImpl.scala | 36 +++-------------------
.../nlp/tokenizer/opennlp/NCOpenNlpTokenizer.java | 18 +----------
.../opennlp/impl/NCOpenNlpTokenizerImpl.scala | 11 ++-----
11 files changed, 20 insertions(+), 173 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/NCOpenNlpEntityParser.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/NCOpenNlpEntityParser.java
index a725430..1ea0930 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/NCOpenNlpEntityParser.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/NCOpenNlpEntityParser.java
@@ -20,7 +20,6 @@ package org.apache.nlpcraft.nlp.entity.parser.opennlp;
import org.apache.nlpcraft.*;
import
org.apache.nlpcraft.nlp.entity.parser.opennlp.impl.NCOpenNlpEntityParserImpl;
-import java.io.File;
import java.util.List;
import java.util.Objects;
@@ -45,16 +44,7 @@ public class NCOpenNlpEntityParser implements NCEntityParser
{
public NCOpenNlpEntityParser(String mdlSrc) {
Objects.requireNonNull(mdlSrc, "Model source cannot be null.");
- this.impl = NCOpenNlpEntityParserImpl.apply(mdlSrc);
- }
-
- /**
- * @param mdlFile
- */
- public NCOpenNlpEntityParser(File mdlFile) {
- Objects.requireNonNull(mdlFile, "Model file cannot be null.");
-
- this.impl = NCOpenNlpEntityParserImpl.apply(mdlFile);
+ this.impl = new NCOpenNlpEntityParserImpl(mdlSrc);
}
@Override
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala
index f35b46e..96519ca 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala
@@ -31,14 +31,11 @@ import scala.jdk.CollectionConverters.*
import scala.language.postfixOps
import scala.util.Using
-object NCOpenNlpEntityParserImpl:
- def apply(res: String): NCOpenNlpEntityParserImpl = new
NCOpenNlpEntityParserImpl(NCUtils.getStream(res), res)
- def apply(f: File): NCOpenNlpEntityParserImpl = new
NCOpenNlpEntityParserImpl(new FileInputStream(f), f.getAbsolutePath)
-
/**
*
+ * @param res
*/
-class NCOpenNlpEntityParserImpl(is: InputStream, res: String) extends
NCEntityParser with LazyLogging :
+class NCOpenNlpEntityParserImpl(res: String) extends NCEntityParser with
LazyLogging :
@volatile private var finder: NameFinderME = _
private case class Holder(start: Int, end: Int, name: String, probability:
Double)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
index 223d5dd..9c09111 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParser.java
@@ -24,7 +24,6 @@ import org.apache.nlpcraft.NCRequest;
import org.apache.nlpcraft.NCToken;
import
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticEntityParserImpl;
-import java.io.File;
import java.util.*;
/**
@@ -61,18 +60,6 @@ public class NCSemanticEntityParser implements
NCEntityParser {
/**
*
* @param stemmer
- * @param mdlFile
- */
- public NCSemanticEntityParser(NCSemanticTextStemmer stemmer, File mdlFile)
{
- Objects.requireNonNull(stemmer, "Stemmer cannot be null");
- Objects.requireNonNull(mdlFile, "File cannot be null");
-
- impl = NCSemanticEntityParserImpl.apply(stemmer, mdlFile);
- }
-
- /**
- *
- * @param stemmer
* @param mdlSrc
*/
public NCSemanticEntityParser(NCSemanticTextStemmer stemmer, String
mdlSrc) {
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/en/NCEnSemanticEntityParser.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/en/NCEnSemanticEntityParser.java
index 6452938..ae21ccb 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/en/NCEnSemanticEntityParser.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/en/NCEnSemanticEntityParser.java
@@ -4,7 +4,6 @@ import
org.apache.nlpcraft.nlp.entity.parser.semantic.NCSemanticElement;
import org.apache.nlpcraft.nlp.entity.parser.semantic.NCSemanticEntityParser;
import
org.apache.nlpcraft.nlp.entity.parser.semantic.impl.en.NCEnSemanticTextStemmer;
-import java.io.File;
import java.util.List;
import java.util.Map;
@@ -30,14 +29,6 @@ public class NCEnSemanticEntityParser extends
NCSemanticEntityParser {
}
/**
- *
- * @param mdlFile
- */
- public NCEnSemanticEntityParser(File mdlFile) {
- super(new NCEnSemanticTextStemmer(), mdlFile);
- }
-
- /**
*
* @param mdlSrc
*/
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
index 2169f40..44adac1 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
@@ -40,25 +40,11 @@ object NCSemanticEntityParserImpl:
stemmer, macros = if macros == null then null else
macros.asScala.toMap, elements = elems.asScala.toSeq
)
- def apply(stemmer: NCSemanticTextStemmer, mdlFile: File):
NCSemanticEntityParserImpl =
- require(stemmer != null)
- require(mdlFile != null)
-
- new NCSemanticEntityParserImpl(
- stemmer,
- is = new BufferedInputStream(new FileInputStream(mdlFile)),
- typ = NCSemanticSourceType(mdlFile.getName)
- )
-
def apply(stemmer: NCSemanticTextStemmer, mdlSrc: String):
NCSemanticEntityParserImpl =
require(stemmer != null)
require(mdlSrc != null)
- new NCSemanticEntityParserImpl(
- stemmer,
- is = new BufferedInputStream(NCUtils.getStream(mdlSrc)),
- typ = NCSemanticSourceType(mdlSrc)
- )
+ new NCSemanticEntityParserImpl(stemmer, res = mdlSrc, typ =
NCSemanticSourceType(mdlSrc))
/**
* @param baseTokens Tokens.
@@ -133,18 +119,18 @@ class NCSemanticEntityParserImpl(
stemmer: NCSemanticTextStemmer,
macros: Map[String, String] = null,
elements: Seq[NCSemanticElement] = null,
- is: InputStream = null,
+ res: String = null,
typ: NCSemanticSourceType = null
) extends NCEntityParser with LazyLogging:
require(stemmer != null)
- require(macros != null && elements != null || is != null && typ != null)
+ require(macros != null && elements != null || res != null && typ != null)
@volatile private var h: NCSemanticSynonymsHolder = _
override def start(cfg: NCModelConfig): Unit =
val (macros, elements) =
- if is != null then
- val src = NCSemanticDataReader.read(is, typ)
+ if res != null then
+ val src = NCSemanticDataReader.read(new
BufferedInputStream(NCUtils.getStream(res)), typ)
(src.macros, src.elements)
else
(this.macros, this.elements)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnSwearWordsTokenEnricher.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnSwearWordsTokenEnricher.java
index e5b84b2..aca1786 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnSwearWordsTokenEnricher.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnSwearWordsTokenEnricher.java
@@ -20,7 +20,6 @@ package org.apache.nlpcraft.nlp.token.enricher.en;
import org.apache.nlpcraft.*;
import org.apache.nlpcraft.nlp.token.enricher.en.impl.NCEnSwearWordsImpl;
-import java.io.File;
import java.util.List;
import java.util.Objects;
@@ -31,18 +30,6 @@ public class NCEnSwearWordsTokenEnricher implements
NCTokenEnricher {
private final NCEnSwearWordsImpl impl;
/**
- *
- * TODO: swear_words.txt
- *
- * @param mdlFile
- */
- public NCEnSwearWordsTokenEnricher(File mdlFile) {
- Objects.requireNonNull(mdlFile, "Swear words model file cannot be
null.");
-
- impl = NCEnSwearWordsImpl.apply(mdlFile);
- }
-
- /**
* TODO: swear_words.txt
*
* @param mdlSrc
@@ -50,7 +37,7 @@ public class NCEnSwearWordsTokenEnricher implements
NCTokenEnricher {
public NCEnSwearWordsTokenEnricher(String mdlSrc) {
Objects.requireNonNull(mdlSrc, "Swear words model file cannot be
null.");
- impl = NCEnSwearWordsImpl.apply(mdlSrc);
+ impl = new NCEnSwearWordsImpl(mdlSrc);
}
@Override
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/impl/NCEnSwearWordsImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/impl/NCEnSwearWordsImpl.scala
index 4aa3a55..ea11dc0 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/impl/NCEnSwearWordsImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/impl/NCEnSwearWordsImpl.scala
@@ -24,35 +24,12 @@ import org.apache.nlpcraft.internal.util.NCUtils
import java.io.*
-/**
- *
- */
-object NCEnSwearWordsImpl:
- /**
- *
- * @param mdlFile
- * @return
- */
- def apply(mdlFile: File): NCEnSwearWordsImpl =
- new NCEnSwearWordsImpl(new BufferedInputStream(new
FileInputStream(mdlFile)), mdlFile.getPath)
-
- /**
- *
- * @param mdlSrc
- * @return
- */
- def apply(mdlSrc: String): NCEnSwearWordsImpl =
- new NCEnSwearWordsImpl(NCUtils.getStream(mdlSrc), mdlSrc)
-
-/**
- *
- */
-class NCEnSwearWordsImpl(is: InputStream, res: String) extends NCTokenEnricher
with LazyLogging:
+class NCEnSwearWordsImpl(res: String) extends NCTokenEnricher with LazyLogging:
@volatile private var swearWords: Set[String] = _
override def start(cfg: NCModelConfig): Unit =
val stemmer = new PorterStemmer
- swearWords = NCUtils.readTextStream(is,
"UTF-8").map(stemmer.stem).toSet
+ swearWords = NCUtils.readTextStream(NCUtils.getStream(res),
"UTF-8").map(stemmer.stem).toSet
logger.trace(s"Loaded resource: $res")
override def stop(): Unit = swearWords = null
override def enrich(req: NCRequest, cfg: NCModelConfig, toks:
java.util.List[NCToken]): Unit =
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/NCEnOpenNlpTokenParser.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/NCEnOpenNlpTokenParser.java
index 4e77aa8..cb0a8b4 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/NCEnOpenNlpTokenParser.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/NCEnOpenNlpTokenParser.java
@@ -48,25 +48,6 @@ public class NCEnOpenNlpTokenParser implements NCTokenParser
{
/**
*
- * @param tokMdl
- * @param posMdl
- * @param lemmaDic
- * @throws NCException
- */
- public NCEnOpenNlpTokenParser(File posMdl, File lemmaDic) {
- Objects.requireNonNull(posMdl, "POS model file cannot be null.");
- Objects.requireNonNull(lemmaDic, "Lemmatizer model file cannot be
null.");
-
- try {
- impl = NCEnOpenNlpImpl.apply(posMdl, lemmaDic);
- }
- catch (Exception e) {
- throw new NCException("Failed to create OpenNLP token parser.", e);
- }
- }
-
- /**
- *
* @param tokMdlSrc Local filesystem path, resources file path or URL for
OpenNLP tokenizer model.
* @param posMdlSrc Local filesystem path, resources file path or URL for
OpenNLP tagger model.
* @param lemmaDicSrc Local filesystem path, resources file path or URL
for OpenNLP lemmatizer dictionary.
@@ -77,7 +58,7 @@ public class NCEnOpenNlpTokenParser implements NCTokenParser {
Objects.requireNonNull(lemmaDicSrc, "Lemmatizer model path cannot be
null.");
try {
- impl = NCEnOpenNlpImpl.apply(posMdlSrc, lemmaDicSrc);
+ impl = new NCEnOpenNlpImpl(posMdlSrc, lemmaDicSrc);
}
catch (Exception e) {
throw new NCException("Failed to create OpenNLP token parser.", e);
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/impl/NCEnOpenNlpImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/impl/NCEnOpenNlpImpl.scala
index 35b3c7f..6c62be8 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/impl/NCEnOpenNlpImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/impl/NCEnOpenNlpImpl.scala
@@ -31,39 +31,13 @@ import java.util.{Collections, List as JList, Set as JSet}
import scala.concurrent.ExecutionContext
import scala.jdk.CollectionConverters.*
-/**
- *
- */
-object NCEnOpenNlpImpl:
- /**
- *
- * @param posMdlSrc Local filesystem path, resources file path or URL for
OpenNLP tagger model.
- * @param lemmaDicSrc Local filesystem path, resources file path or URL
for OpenNLP lemmatizer dictionary.
- * @return
- */
- def apply(posMdlSrc: String, lemmaDicSrc: String): NCEnOpenNlpImpl =
- new NCEnOpenNlpImpl(NCUtils.getStream(posMdlSrc),
NCUtils.getStream(lemmaDicSrc))
-
- /**
- *
- * @param posMdlFile Local file for OpenNLP tagger model.
- * @param lemmaDicFile Local file for OpenNLP lemmatizer dictionary.
- * @return
- */
- def apply(posMdlFile: File, lemmaDicFile: File): NCEnOpenNlpImpl =
- def toStream(f: File) = new BufferedInputStream(new FileInputStream(f))
-
- new NCEnOpenNlpImpl(toStream(posMdlFile), toStream(lemmaDicFile))
/**
*
- * @param posMdlIn
- * @param lemmaDicIn
+ * @param posMdlSrc
+ * @param lemmaDicSrc
*/
-class NCEnOpenNlpImpl(
- posMdlIn: InputStream,
- lemmaDicIn: InputStream
-) extends NCTokenParser :
+class NCEnOpenNlpImpl(posMdlSrc: String, lemmaDicSrc: String) extends
NCTokenParser :
private val stemmer = new PorterStemmer
@volatile var tagger: POSTaggerME = _
@@ -75,8 +49,8 @@ class NCEnOpenNlpImpl(
override def start(cfg: NCModelConfig): Unit =
NCUtils.execPar(
- () => tagger = new POSTaggerME(new POSModel(posMdlIn)),
- () => lemmatizer = new DictionaryLemmatizer(lemmaDicIn),
+ () => tagger = new POSTaggerME(new
POSModel(NCUtils.getStream(posMdlSrc))),
+ () => lemmatizer = new
DictionaryLemmatizer(NCUtils.getStream(lemmaDicSrc)),
() => swFinder = new NCEnStopWordsFinder(stem(addStopWords),
stem(exclStopWords))
)(ExecutionContext.Implicits.global)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/opennlp/NCOpenNlpTokenizer.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/opennlp/NCOpenNlpTokenizer.java
index 88f12e4..bf63f0d 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/opennlp/NCOpenNlpTokenizer.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/opennlp/NCOpenNlpTokenizer.java
@@ -23,7 +23,6 @@ import org.apache.nlpcraft.NCTokenizer;
import org.apache.nlpcraft.NCWord;
import org.apache.nlpcraft.nlp.tokenizer.opennlp.impl.NCOpenNlpTokenizerImpl;
-import java.io.File;
import java.util.List;
import java.util.Objects;
@@ -37,26 +36,11 @@ public class NCOpenNlpTokenizer implements NCTokenizer {
*
* @param tokMdl
*/
- public NCOpenNlpTokenizer(File tokMdl) {
- Objects.requireNonNull(tokMdl, "Tokenizer model file cannot be null.");
-
- try {
- impl = NCOpenNlpTokenizerImpl.apply(tokMdl);
- }
- catch (Exception e) {
- throw new NCException("Failed to create OpenNLP tokenizer from: "
+ tokMdl, e);
- }
- }
-
- /**
- *
- * @param tokMdl
- */
public NCOpenNlpTokenizer(String tokMdl) {
Objects.requireNonNull(tokMdl, "Tokenizer model source cannot be
null.");
try {
- impl = NCOpenNlpTokenizerImpl.apply(tokMdl);
+ impl = new NCOpenNlpTokenizerImpl(tokMdl);
}
catch (Exception e) {
throw new NCException("Failed to create OpenNLP tokenizer from: "
+ tokMdl, e);
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/opennlp/impl/NCOpenNlpTokenizerImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/opennlp/impl/NCOpenNlpTokenizerImpl.scala
index fe8db46..65a541d 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/opennlp/impl/NCOpenNlpTokenizerImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/opennlp/impl/NCOpenNlpTokenizerImpl.scala
@@ -27,19 +27,12 @@ import scala.jdk.CollectionConverters.*
/**
*
- */
-object NCOpenNlpTokenizerImpl:
- def apply(file: File): NCOpenNlpTokenizerImpl = new
NCOpenNlpTokenizerImpl(new BufferedInputStream(new FileInputStream(file)))
- def apply(src: String): NCOpenNlpTokenizerImpl = new
NCOpenNlpTokenizerImpl(NCUtils.getStream(src))
-
-/**
- *
* @param is
*/
-class NCOpenNlpTokenizerImpl(is: InputStream) extends NCTokenizer:
+class NCOpenNlpTokenizerImpl(src: String) extends NCTokenizer:
@volatile var tokenizer: TokenizerME = _
- override def start(cfg: NCModelConfig): Unit = tokenizer = new
TokenizerME(new TokenizerModel(is))
+ override def start(cfg: NCModelConfig): Unit = tokenizer = new
TokenizerME(new TokenizerModel(NCUtils.getStream(src)))
override def stop(): Unit = tokenizer = null
override def tokenize(cfg: NCModelConfig, txt: String): util.List[NCWord] =
this.synchronized { tokenizer.tokenizePos(txt) }