This is an automated email from the ASF dual-hosted git repository.
aradzinski pushed a commit to branch NLPCRAFT-472
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-472 by this push:
new a3a8d8d WIP
new d7e8042 Merge branch 'NLPCRAFT-472' of https://github.com/apache/incubator-nlpcraft into NLPCRAFT-472
a3a8d8d is described below
commit a3a8d8da6b234038350b4861bb8a767c412da243
Author: Aaron Radzinski <[email protected]>
AuthorDate: Wed Dec 29 12:36:51 2021 -0800
WIP
---
.../org/apache/nlpcraft/NCModelConfigAdapter.java | 10 +++++-----
.../main/scala/org/apache/nlpcraft/NCToken.java | 2 --
.../scala/org/apache/nlpcraft/NCTokenizer.java | 6 ++++++
.../parser/opennlp/impl/en/NCEnOpenNlpImpl.scala | 4 +++-
.../nlpcraft/nlp/tokenizer/NCOpenNlpTokenizer.java | 23 ++++++++++++++++++++--
.../tokenizer/impl/NCOpenNlpTokenizerImpl.scala | 12 +++++++++--
6 files changed, 45 insertions(+), 12 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfigAdapter.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfigAdapter.java
index fe5ca2a..9a51d83 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfigAdapter.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfigAdapter.java
@@ -27,7 +27,7 @@ public class NCModelConfigAdapter extends NCPropertyMapAdapter implements NCMode
private final String id;
private final String name;
private final String version;
- private final NCTokenizer tokernizer;
+ private final NCTokenizer tokenizer;
private final List<NCTokenParser> tokParsers = new ArrayList<>();
private final List<NCTokenEnricher> tokEnrichers = new ArrayList<>();
private final List<NCEntityEnricher> entEnrichers = new ArrayList<>();
@@ -40,18 +40,18 @@ public class NCModelConfigAdapter extends NCPropertyMapAdapter implements NCMode
* @param version
* @param tokParser
*/
- public NCModelConfigAdapter(String id, String name, String version, NCTokenizer tokernizer, NCTokenParser tokParser, NCEntityParser entParser) {
+ public NCModelConfigAdapter(String id, String name, String version, NCTokenizer tokenizer, NCTokenParser tokParser, NCEntityParser entParser) {
Objects.requireNonNull(id, "ID cannot be null.");
Objects.requireNonNull(name, "Name cannot be null.");
Objects.requireNonNull(version, "Version cannot be null.");
- Objects.requireNonNull(tokernizer, "Tokenizer cannot be null.");
+ Objects.requireNonNull(tokenizer, "Tokenizer cannot be null.");
Objects.requireNonNull(tokParser, "Token parser cannot be null.");
Objects.requireNonNull(entParser, "Entity parser cannot be null.");
this.id = id;
this.name = name;
this.version = version;
- this.tokernizer = tokernizer;
+ this.tokenizer = tokenizer;
tokParsers.add(tokParser);
entParsers.add(entParser);
@@ -134,6 +134,6 @@ public class NCModelConfigAdapter extends NCPropertyMapAdapter implements NCMode
@Override
public NCTokenizer getTokenizer() {
- return tokernizer;
+ return tokenizer;
}
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
index d3fe623..67e3e8d 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
@@ -21,8 +21,6 @@ package org.apache.nlpcraft;
*
*/
public interface NCToken extends NCWord, NCPropertyMap {
- String getText();
-
/**
*
* @return
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenizer.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenizer.java
index a27c7f7..1ee784d 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenizer.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenizer.java
@@ -23,5 +23,11 @@ import java.util.List;
*
*/
public interface NCTokenizer extends NCLifecycle {
+ /**
+ *
+ * @param cfg
+ * @param txt
+ * @return
+ */
List<NCWord> tokenize(NCModelConfig cfg, String txt);
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/en/NCEnOpenNlpImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/en/NCEnOpenNlpImpl.scala
index 9e818f4..c13df98 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/en/NCEnOpenNlpImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/en/NCEnOpenNlpImpl.scala
@@ -32,6 +32,9 @@ import java.util.{Collections, List as JList, Set as JSet}
import scala.concurrent.ExecutionContext
import scala.jdk.CollectionConverters.*
+/**
+ *
+ */
object NCEnOpenNlpImpl:
/**
*
@@ -55,7 +58,6 @@ object NCEnOpenNlpImpl:
/**
*
- * @param tokMdlIn
* @param posMdlIn
* @param lemmaDicIn
*/
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/NCOpenNlpTokenizer.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/NCOpenNlpTokenizer.java
index 3e1aeb9..af76b7b 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/NCOpenNlpTokenizer.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/NCOpenNlpTokenizer.java
@@ -17,6 +17,7 @@
package org.apache.nlpcraft.nlp.tokenizer;
+import org.apache.nlpcraft.NCException;
import org.apache.nlpcraft.NCModelConfig;
import org.apache.nlpcraft.NCTokenizer;
import org.apache.nlpcraft.NCWord;
@@ -32,16 +33,34 @@ import java.util.Objects;
public class NCOpenNlpTokenizer implements NCTokenizer {
private final NCOpenNlpTokenizerImpl impl;
+ /**
+ *
+ * @param tokMdl
+ */
public NCOpenNlpTokenizer(File tokMdl) {
Objects.requireNonNull(tokMdl, "Tokenizer model file cannot be null.");
- impl = NCOpenNlpTokenizerImpl.apply(tokMdl);
+ try {
+ impl = NCOpenNlpTokenizerImpl.apply(tokMdl);
+ }
+ catch (Exception e) {
+ throw new NCException("Failed to create OpenNLP tokenizer from: " + tokMdl, e);
+ }
}
+ /**
+ *
+ * @param tokMdl
+ */
public NCOpenNlpTokenizer(String tokMdl) {
Objects.requireNonNull(tokMdl, "Tokenizer model source cannot be null.");
- impl = NCOpenNlpTokenizerImpl.apply(tokMdl);
+ try {
+ impl = NCOpenNlpTokenizerImpl.apply(tokMdl);
+ }
+ catch (Exception e) {
+ throw new NCException("Failed to create OpenNLP tokenizer from: " + tokMdl, e);
+ }
}
@Override
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/impl/NCOpenNlpTokenizerImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/impl/NCOpenNlpTokenizerImpl.scala
index 05729a5..7e19c18 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/impl/NCOpenNlpTokenizerImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/impl/NCOpenNlpTokenizerImpl.scala
@@ -14,6 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
package org.apache.nlpcraft.nlp.tokenizer.impl
import opennlp.tools.tokenize.*
@@ -24,18 +25,25 @@ import java.io.*
import java.util
import scala.jdk.CollectionConverters.*
+/**
+ *
+ */
object NCOpenNlpTokenizerImpl:
def apply(file: File): NCOpenNlpTokenizerImpl = new NCOpenNlpTokenizerImpl(new BufferedInputStream(new FileInputStream(file)))
def apply(src: String): NCOpenNlpTokenizerImpl = new NCOpenNlpTokenizerImpl(NCUtils.getStream(src))
+/**
+ *
+ * @param is
+ */
class NCOpenNlpTokenizerImpl(is: InputStream) extends NCTokenizer:
@volatile var tokenizer: TokenizerME = _
override def start(cfg: NCModelConfig): Unit = tokenizer = new TokenizerME(new TokenizerModel(is))
override def stop(): Unit = tokenizer = null
override def tokenize(cfg: NCModelConfig, txt: String): util.List[NCWord] =
- this.synchronized { tokenizer.tokenizePos(txt) }.
- map(span => new NCWord :
+ this.synchronized { tokenizer.tokenizePos(txt) }
+ .map(span => new NCWord:
override def getText: String =
span.getCoveredText(txt).toString
override def getStartCharIndex: Int = span.getStart
override def getEndCharIndex: Int = span.getEnd