This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/master by this push:
new 4fb0c76 Open NLP parser implementation. Refactoring.
4fb0c76 is described below
commit 4fb0c7686d1c4f596f47f1bcaed5b60436c974ec
Author: Sergey Kamov <[email protected]>
AuthorDate: Wed Dec 29 11:04:18 2021 +0300
Open NLP parser implementation. Refactoring.
---
.../main/scala/org/apache/nlpcraft/NCContext.java | 3 +-
.../main/scala/org/apache/nlpcraft/NCEntity.java | 19 ++---
.../scala/org/apache/nlpcraft/NCLifecycle.java | 2 -
.../scala/org/apache/nlpcraft/NCModelClient.java | 59 +++++++++----
.../org/apache/nlpcraft/NCModelConfigAdapter.java | 8 +-
.../scala/org/apache/nlpcraft/NCPropertyMap.java | 8 +-
.../org/apache/nlpcraft/NCPropertyMapAdapter.java | 5 ++
.../main/scala/org/apache/nlpcraft/NCResult.java | 3 -
.../main/scala/org/apache/nlpcraft/NCToken.java | 6 ++
.../nlpcraft/internal/ansi/NCAnsiProgressBar.scala | 2 +-
.../nlpcraft/internal/ansi/NCAnsiSpinner.scala | 2 +-
.../nlpcraft/internal/ascii/NCAsciiTable.scala | 13 ++-
.../nlpcraft/internal/makro/NCMacroCompiler.scala | 7 +-
.../nlpcraft/internal/makro/NCMacroParser.scala | 3 +-
.../apache/nlpcraft/internal/util/NCUtils.scala | 17 ++--
.../parser/opennlp/NCOpenNlpEntityParser.java | 74 ++++++++++++++++
.../opennlp/impl/NCOpenNlpEntityParserImpl.scala | 83 ++++++++++++++++++
.../enricher/en}/NCEnBracketsTokenEnricher.java | 4 +-
.../enricher/en}/NCEnDictionaryTokenEnricher.java | 5 +-
.../enricher/en}/NCEnLanguageTokenEnricher.java | 4 +-
.../enricher/en}/NCEnQuotesTokenEnricher.java | 9 +-
.../enricher/en}/NCEnSwearWordsTokenEnricher.java | 4 +-
.../token/enricher/impl/en}/NCEnBracketsImpl.scala | 2 +-
.../enricher/impl/en}/NCEnDictionaryImpl.scala | 2 +-
.../enricher/impl/en}/NCEnLanguageWordsImpl.scala | 2 +-
.../token/enricher/impl/en}/NCEnQuotesImpl.scala | 6 +-
.../enricher/impl/en}/NCEnSwearWordsImpl.scala | 14 ++-
.../parser/opennlp/en}/NCEnOpenNlpTokenParser.java | 4 +-
.../parser/opennlp/impl/en}/NCEnOpenNlpImpl.scala | 11 ++-
.../opennlp/impl/en}/NCEnStopWordGenerator.scala | 3 +-
.../opennlp/impl/en}/NCEnStopWordsFinder.scala | 18 ++--
.../nlpcraft/internal/nlp/util/NCTestUtils.scala | 83 ------------------
.../nlp/benchmark/NCBenchmarkAdapter.java | 6 +-
.../opennlp/NCEnOpenNlpTokenParserBenchmark.java | 18 ++--
.../parser/opennlp/NCOpenNlpEntityParserSpec.scala | 75 ++++++++++++++++
.../en}/NCEnBracketsTokenEnricherSpec.scala | 27 +++---
.../en}/NCEnDictionaryTokenEnricherSpec.scala | 15 ++--
.../en}/NCEnLanguageTokenEnricherSpec.scala | 15 ++--
.../enricher/en}/NCEnQuotesTokenEnricherSpec.scala | 25 +++---
.../en}/NCEnSwearWordsTokenEnricherSpec.scala | 15 ++--
.../opennlp/en}/NCEnOpenNlpTokenParserSpec.scala | 19 ++---
.../{internal => }/nlp/util/NCTestRequest.scala | 2 +-
.../{internal => }/nlp/util/NCTestToken.scala | 6 +-
.../org/apache/nlpcraft/nlp/util/NCTestUtils.scala | 94 +++++++++++++++++++++
.../src/test/resources/opennlp/en-ner-date.bin | Bin 0 -> 5030307 bytes
.../src/test/resources/opennlp/en-ner-location.bin | Bin 0 -> 5110658 bytes
.../src/test/resources/opennlp/en-ner-money.bin | Bin 0 -> 4806234 bytes
.../test/resources/opennlp/en-ner-organization.bin | Bin 0 -> 5297172 bytes
.../test/resources/opennlp/en-ner-percentage.bin | Bin 0 -> 4728645 bytes
.../src/test/resources/opennlp/en-ner-person.bin | Bin 0 -> 5207953 bytes
50 files changed, 536 insertions(+), 266 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCContext.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCContext.java
index f560672..39f4004 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCContext.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCContext.java
@@ -17,8 +17,7 @@
package org.apache.nlpcraft;
-import java.util.Collection;
-import java.util.List;
+import java.util.*;
/**
*
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntity.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntity.java
index f8e0c8e..57aa4e4 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntity.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntity.java
@@ -17,6 +17,8 @@
package org.apache.nlpcraft;
+import org.apache.nlpcraft.internal.util.NCUtils;
+
import java.util.List;
/**
@@ -43,20 +45,9 @@ public interface NCEntity extends NCPropertyMap {
String getId();
/**
- *
- * @return Index of this entity in the sentence.
- */
- int getIndex();
-
- /**
- * A shortcut method that gets internal globally unique system ID of the
entity.
- * <p>
- * This method is equivalent to:
- * <pre class="brush: java">
- * return meta("nlpcraft:nlp:unid");
- * </pre>
- *
* @return Internal globally unique system ID of the entity.
*/
- String getGuid();
+ default String getGuid() {
+ return NCUtils.genUUID().toString();
+ }
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCLifecycle.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCLifecycle.java
index 0f3774c..0d58925 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCLifecycle.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCLifecycle.java
@@ -17,8 +17,6 @@
package org.apache.nlpcraft;
-import java.io.FileNotFoundException;
-
/**
*
*/
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelClient.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelClient.java
index 7d9b5dc..85f272b 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelClient.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelClient.java
@@ -25,7 +25,7 @@ import java.util.concurrent.*;
*
*/
public class NCModelClient implements NCLifecycle {
- private NCModel mdl;
+ private final NCModel mdl;
/**
*
@@ -43,35 +43,64 @@ public class NCModelClient implements NCLifecycle {
// TODO:
}
- private static void start(List<? extends NCLifecycle> list, NCModelConfig
cfg) {
+ private static void start(ExecutorService s, List<? extends NCLifecycle>
list, NCModelConfig cfg) {
+ assert s != null;
+
if (list != null)
- list.forEach(p -> p.start(cfg));
+ list.forEach(p -> s.execute(() -> p.start(cfg)));
}
- private static void stop(List<? extends NCLifecycle> list) {
+ private static void stop(ExecutorService s, List<? extends NCLifecycle>
list) {
+ assert s != null;
+
if (list != null)
- list.forEach(p -> p.stop());
+ list.forEach(p -> s.execute(() -> p.stop()));
+ }
+
+ private static void stopExecutorService(ExecutorService s) {
+ try {
+ s.awaitTermination(Long.MAX_VALUE, TimeUnit.MILLISECONDS);
+ }
+ catch (InterruptedException e) {
+ throw new NCException("Thread interrupted.", e);
+ }
+ }
+
+ private static ExecutorService getExecutorService() {
+ return
Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
}
@Override
public void start(NCModelConfig cfg) {
verify();
- start(cfg.getTokenParsers(), cfg);
- start(cfg.getEntityParsers(), cfg);
- start(cfg.getEntityEnrichers(), cfg);
- start(cfg.getTokenEnrichers(), cfg);
+ ExecutorService s = getExecutorService();
+
+ try {
+ start(s, cfg.getTokenParsers(), cfg);
+ start(s, cfg.getEntityParsers(), cfg);
+ start(s, cfg.getEntityEnrichers(), cfg);
+ start(s, cfg.getTokenEnrichers(), cfg);
+ }
+ finally {
+ stopExecutorService(s);
+ }
}
@Override
public void stop() {
NCModelConfig cfg = mdl.getConfig();
-
- stop(cfg.getTokenEnrichers());
- stop(cfg.getEntityEnrichers());
- stop(cfg.getEntityParsers());
- stop(cfg.getTokenParsers());
-
+ ExecutorService s = getExecutorService();
+
+ try {
+ stop(s, cfg.getTokenEnrichers());
+ stop(s, cfg.getEntityEnrichers());
+ stop(s, cfg.getEntityParsers());
+ stop(s, cfg.getTokenEnrichers());
+ }
+ finally {
+ stopExecutorService(s);
+ }
}
/**
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfigAdapter.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfigAdapter.java
index a706a14..943ce76 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfigAdapter.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfigAdapter.java
@@ -28,10 +28,10 @@ public class NCModelConfigAdapter extends
NCPropertyMapAdapter implements NCMode
private final String name;
private final String version;
- private List<NCTokenParser> tokParsers = new ArrayList<>();
- private List<NCTokenEnricher> tokEnrichers = new ArrayList<>();
- private List<NCEntityEnricher> entEnrichers = new ArrayList<>();
- private List<NCEntityParser> entParsers = new ArrayList<>();
+ private final List<NCTokenParser> tokParsers = new ArrayList<>();
+ private final List<NCTokenEnricher> tokEnrichers = new ArrayList<>();
+ private final List<NCEntityEnricher> entEnrichers = new ArrayList<>();
+ private final List<NCEntityParser> entParsers = new ArrayList<>();
/**
*
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPropertyMap.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPropertyMap.java
index 13a8119..2cb97dc 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPropertyMap.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPropertyMap.java
@@ -17,7 +17,7 @@
package org.apache.nlpcraft;
-import java.util.Optional;
+import java.util.*;
/**
*
@@ -76,4 +76,10 @@ public interface NCPropertyMap {
* @return
*/
boolean remove(String key, Object obj);
+
+ /**
+ *
+ * @return
+ */
+ Set<String> keysSet();
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPropertyMapAdapter.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPropertyMapAdapter.java
index a82689d..c2ca3d1 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPropertyMapAdapter.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPropertyMapAdapter.java
@@ -59,4 +59,9 @@ public class NCPropertyMapAdapter implements NCPropertyMap {
public boolean remove(String key, Object obj) {
return map.remove(key, obj);
}
+
+ @Override
+ public Set<String> keysSet() {
+ return map.keySet();
+ }
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCResult.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCResult.java
index 4f193f5..5461538 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCResult.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCResult.java
@@ -17,10 +17,7 @@
package org.apache.nlpcraft;
-import org.apache.nlpcraft.internal.util.*;
-
import java.io.Serializable;
-import java.util.Collection;
/**
*
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
index 8e0a9ee..ff4ca24 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
@@ -68,4 +68,10 @@ public interface NCToken extends NCPropertyMap {
* @return
*/
int getLength();
+
+ /**
+ *
+ * @return
+ */
+ int getIndex();
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/ansi/NCAnsiProgressBar.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/ansi/NCAnsiProgressBar.scala
index e5f4817..3135881 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/ansi/NCAnsiProgressBar.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/ansi/NCAnsiProgressBar.scala
@@ -21,7 +21,7 @@ import java.io.PrintWriter
import org.apache.nlpcraft.internal.*
import NCAnsi.*
import org.apache.commons.lang3.StringUtils
-import org.apache.nlpcraft.internal.ansi.NCAnsiProgressBar.*
+import NCAnsiProgressBar.*
/**
* Forward-only, bound ANSI-based progress bar.
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/ansi/NCAnsiSpinner.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/ansi/NCAnsiSpinner.scala
index ca1b702..08048a4 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/ansi/NCAnsiSpinner.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/ansi/NCAnsiSpinner.scala
@@ -20,7 +20,7 @@ package org.apache.nlpcraft.internal.ansi
import java.io.PrintWriter
import NCAnsi.*
import org.apache.nlpcraft.internal.*
-import org.apache.nlpcraft.internal.ansi.NCAnsiSpinner.*
+import NCAnsiSpinner.*
import org.apache.nlpcraft.internal.util.NCUtils
/**
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/ascii/NCAsciiTable.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/ascii/NCAsciiTable.scala
index 774846f..6bb3b4d 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/ascii/NCAsciiTable.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/ascii/NCAsciiTable.scala
@@ -20,11 +20,10 @@ package org.apache.nlpcraft.internal.ascii
import java.io.{IOException, PrintStream}
import java.util.List as JList
import com.typesafe.scalalogging.Logger
-import org.apache.nlpcraft.NCException
-import org.apache.nlpcraft.internal.*
-import org.apache.nlpcraft.internal.ascii.NCAsciiTable.*
-import org.apache.nlpcraft.internal.ansi.NCAnsi.*
+import org.apache.nlpcraft.*
+import NCAsciiTable.*
import org.apache.nlpcraft.internal.util.NCUtils
+import org.apache.nlpcraft.internal.ansi.NCAnsi.*
import scala.collection.mutable
import scala.jdk.CollectionConverters.CollectionHasAsScala
@@ -557,6 +556,12 @@ class NCAsciiTable:
def error(log: Logger, header: Option[String] = None): Unit =
log.error(mkLogString(header))
/**
+ *
+ * @param header
+ */
+ def print(header: String): Unit = println(mkLogString(Some(header)))
+
+ /**
* Renders this table to log as trace.
*
* @param log Logger.
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/makro/NCMacroCompiler.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/makro/NCMacroCompiler.scala
index 9a98a1a..9e5ecca 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/makro/NCMacroCompiler.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/makro/NCMacroCompiler.scala
@@ -20,13 +20,14 @@ package org.apache.nlpcraft.internal.makro
import com.typesafe.scalalogging.LazyLogging
import org.antlr.v4.runtime.tree.ParseTreeWalker
import org.antlr.v4.runtime.*
-import org.apache.nlpcraft.NCException
import org.apache.nlpcraft.internal.*
import org.apache.nlpcraft.internal.ansi.NCAnsi.*
-import org.apache.nlpcraft.internal.antlr4.*
-import org.apache.nlpcraft.internal.makro.NCMacroCompiler.FiniteStateMachine
+import org.apache.nlpcraft.internal.antlr4.{NCCompilerUtils, *}
+import NCMacroCompiler.FiniteStateMachine
import org.apache.nlpcraft.internal.makro.antlr4.*
import org.apache.nlpcraft.internal.util.NCUtils
+import org.apache.nlpcraft.NCException
+import org.apache.nlpcraft.internal.makro.antlr4.NCMacroDslLexer
import scala.collection.mutable
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/makro/NCMacroParser.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/makro/NCMacroParser.scala
index af085a5..cf6c3e2 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/makro/NCMacroParser.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/makro/NCMacroParser.scala
@@ -17,8 +17,7 @@
package org.apache.nlpcraft.internal.makro
-import org.apache.nlpcraft.NCException
-import org.apache.nlpcraft.internal.*
+import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.util.NCUtils
import scala.jdk.CollectionConverters.*
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
index b756e63..bad8159 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
@@ -19,18 +19,19 @@ package org.apache.nlpcraft.internal.util
import com.google.gson.GsonBuilder
import com.typesafe.scalalogging.*
-import org.apache.nlpcraft.{NCException, NCToken}
+import org.apache.nlpcraft.NCToken
+import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.ansi.NCAnsi.*
import java.io.*
import java.net.*
-import java.util.Random
+import java.util.{Random, UUID}
import java.util.regex.Pattern
-import java.util.zip.{GZIPInputStream, GZIPOutputStream}
+import java.util.zip.*
import scala.annotation.tailrec
import scala.collection.{IndexedSeq, Seq}
import scala.concurrent.duration.Duration
-import scala.concurrent.{Await, ExecutionContext, Future}
+import scala.concurrent.*
import scala.io.Source
import scala.sys.SystemProperties
import scala.util.Using
@@ -870,7 +871,7 @@ object NCUtils extends LazyLogging:
def readResource(res: String, enc: String = "UTF-8", log: Logger =
logger): List[String] =
val list =
try
- Using.resource(Source.fromInputStream(getStream(res),
enc))(_.getLines()).toList
+ Using.resource(Source.fromInputStream(getStream(res),
enc))(_.getLines().toSeq).toList
catch
case e: IOException => throw new NCException(s"Failed to read
stream: $res", e)
@@ -929,6 +930,12 @@ object NCUtils extends LazyLogging:
bodies.map(body => Future { body() } (ec)).foreach(Await.result(_,
Duration.Inf))
/**
+ *
+ * @return
+ */
+ def genUUID(): UUID = UUID.randomUUID()
+
+ /**
* Gets all sequential permutations of tokens in this NLP sentence.
*
* For example, if NLP sentence contains "a, b, c, d" tokens, then
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/NCOpenNlpEntityParser.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/NCOpenNlpEntityParser.java
new file mode 100644
index 0000000..4d4b5dd
--- /dev/null
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/NCOpenNlpEntityParser.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.nlp.entity.parser.opennlp;
+
+import org.apache.nlpcraft.*;
+import
org.apache.nlpcraft.nlp.entity.parser.opennlp.impl.NCOpenNlpEntityParserImpl;
+
+import java.io.File;
+import java.util.List;
+import java.util.Objects;
+
+/**
+ * TODO
+ * Generates entities with
+ * - ID `opennlp:{name}` where 'name' is element model name (from trained
file or resource) and
+ * - one property `opennlp:{name}:probability`, where probability is double
value between 0 and 1.
+ *
+ * <p>
+ * Models can be downloaded here: http://opennlp.sourceforge.net/models-1.5/ or
trained.
+ * <p>
+ * Component is language independent.
+ * <p>
+ */
+public class NCOpenNlpEntityParser implements NCEntityParser {
+ private final NCOpenNlpEntityParserImpl impl;
+
+ /**
+ * @param modelSrc
+ */
+ public NCOpenNlpEntityParser(String modelSrc) {
+ Objects.requireNonNull(modelSrc, "Model source cannot be null.");
+
+ this.impl = NCOpenNlpEntityParserImpl.apply(modelSrc);
+ }
+
+ /**
+ * @param modelFile
+ */
+ public NCOpenNlpEntityParser(File modelFile) {
+ Objects.requireNonNull(modelFile, "Model file cannot be null.");
+
+ this.impl = NCOpenNlpEntityParserImpl.apply(modelFile);
+ }
+
+ @Override
+ public void start(NCModelConfig cfg) {
+ impl.start(cfg);
+ }
+
+ @Override
+ public void stop() {
+ impl.stop();
+ }
+
+ @Override
+ public List<NCEntity> parse(NCRequest req, NCModelConfig cfg,
List<NCToken> toks) {
+ return impl.parse(req, cfg, toks);
+ }
+}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala
new file mode 100644
index 0000000..dd3e24e
--- /dev/null
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.nlp.entity.parser.opennlp.impl
+
+import com.typesafe.scalalogging.LazyLogging
+import opennlp.tools.namefind.*
+import org.apache.nlpcraft.*
+import org.apache.nlpcraft.internal.util.NCUtils
+import org.apache.nlpcraft.nlp.token.enricher.impl.en.NCEnQuotesImpl.*
+
+import java.io.*
+import java.util
+import java.util.{Optional, List as JList, Map as JMap}
+import scala.Option.*
+import scala.concurrent.ExecutionContext
+import scala.jdk.CollectionConverters.*
+import scala.language.postfixOps
+import scala.util.Using
+import scala.util.control.Exception.catching
+
+object NCOpenNlpEntityParserImpl:
+ def apply(res: String): NCOpenNlpEntityParserImpl = new
NCOpenNlpEntityParserImpl(NCUtils.getStream(res), res)
+ def apply(f: File): NCOpenNlpEntityParserImpl = new
NCOpenNlpEntityParserImpl(new FileInputStream(f), f.getAbsolutePath)
+
+/**
+ *
+ */
+class NCOpenNlpEntityParserImpl(is: InputStream, res: String) extends
NCEntityParser with LazyLogging :
+ @volatile private var finder: NameFinderME = _
+
+ private case class Holder(start: Int, end: Int, name: String, probability:
Double)
+
+ override def start(cfg: NCModelConfig): Unit =
+ finder = new NameFinderME(new
TokenNameFinderModel(NCUtils.getStream(res)))
+ logger.trace(s"Loaded resource: $res")
+
+ override def stop(): Unit = finder = null
+
+ private def find(words: Array[String]): Array[Holder] =
+ this.synchronized {
+ try
+ finder.find(words).map(p => Holder(p.getStart, p.getEnd - 1,
p.getType, p.getProb))
+ finally
+ finder.clearAdaptiveData()
+ }
+
+ override def parse(req: NCRequest, cfg: NCModelConfig, toks:
JList[NCToken]): JList[NCEntity] =
+ val toksSeq = toks.asScala
+
+ find(toksSeq.map(_.getText).toArray).flatMap(h =>
+ def calcIndex(getHolderIndex: Holder => Int): Int =
+ toksSeq.find(_.getIndex == getHolderIndex(h)) match
+ case Some(t) => t.getIndex
+ case None => -1
+
+ val i1 = calcIndex(_.start)
+ lazy val i2 = calcIndex(_.end)
+
+ Option.when(i1 != -1 && i2 != -1)(
+ new NCPropertyMapAdapter with NCEntity {
+ put(s"opennlp:${h.name}:probability", h.probability)
+
+ override def getTokens: JList[NCToken] = toksSeq.flatMap(t
=> Option.when(t.getIndex >= i1 && t.getIndex <= i2)(t)).asJava
+ override def getRequestId: String = req.getRequestId
+ override def getId: String = s"opennlp:${h.name}"
+ }
+ )
+ ).toSeq.asJava
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricher.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnBracketsTokenEnricher.java
similarity index 91%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricher.java
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnBracketsTokenEnricher.java
index bf7d0b9..4e3992c 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricher.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnBracketsTokenEnricher.java
@@ -15,10 +15,10 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.internal.nlp.token.enricher;
+package org.apache.nlpcraft.nlp.token.enricher.en;
import org.apache.nlpcraft.*;
-import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCEnBracketsImpl;
+import org.apache.nlpcraft.nlp.token.enricher.impl.en.NCEnBracketsImpl;
import java.util.List;
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricher.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnDictionaryTokenEnricher.java
similarity index 91%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricher.java
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnDictionaryTokenEnricher.java
index 30ed1e9..d42c4f6 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricher.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnDictionaryTokenEnricher.java
@@ -15,10 +15,11 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.internal.nlp.token.enricher;
+package org.apache.nlpcraft.nlp.token.enricher.en;
import org.apache.nlpcraft.*;
-import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCEnDictionaryImpl;
+import org.apache.nlpcraft.nlp.token.enricher.impl.en.NCEnDictionaryImpl;
+
import java.util.List;
/**
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnLanguageTokenEnricher.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnLanguageTokenEnricher.java
similarity index 90%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnLanguageTokenEnricher.java
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnLanguageTokenEnricher.java
index cdd765b..64c13f7 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnLanguageTokenEnricher.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnLanguageTokenEnricher.java
@@ -15,10 +15,10 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.internal.nlp.token.enricher;
+package org.apache.nlpcraft.nlp.token.enricher.en;
import org.apache.nlpcraft.*;
-import
org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCEnLanguageWordsImpl;
+import org.apache.nlpcraft.nlp.token.enricher.impl.en.NCEnLanguageWordsImpl;
import java.util.List;
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricher.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnQuotesTokenEnricher.java
similarity index 82%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricher.java
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnQuotesTokenEnricher.java
index 834732c..ba9884a 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricher.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnQuotesTokenEnricher.java
@@ -15,13 +15,10 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.internal.nlp.token.enricher;
+package org.apache.nlpcraft.nlp.token.enricher.en;
-import org.apache.nlpcraft.NCModelConfig;
-import org.apache.nlpcraft.NCRequest;
-import org.apache.nlpcraft.NCToken;
-import org.apache.nlpcraft.NCTokenEnricher;
-import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCEnQuotesImpl;
+import org.apache.nlpcraft.*;
+import org.apache.nlpcraft.nlp.token.enricher.impl.en.NCEnQuotesImpl;
import java.util.List;
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricher.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnSwearWordsTokenEnricher.java
similarity index 93%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricher.java
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnSwearWordsTokenEnricher.java
index 654d446..50a771d 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricher.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/en/NCEnSwearWordsTokenEnricher.java
@@ -15,10 +15,10 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.internal.nlp.token.enricher;
+package org.apache.nlpcraft.nlp.token.enricher.en;
import org.apache.nlpcraft.*;
-import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCEnSwearWordsImpl;
+import org.apache.nlpcraft.nlp.token.enricher.impl.en.NCEnSwearWordsImpl;
import java.io.File;
import java.util.List;
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnBracketsImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/impl/en/NCEnBracketsImpl.scala
similarity index 94%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnBracketsImpl.scala
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/impl/en/NCEnBracketsImpl.scala
index acb3a02..b81a4c6 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnBracketsImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/impl/en/NCEnBracketsImpl.scala
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.internal.nlp.token.enricher.impl
+package org.apache.nlpcraft.nlp.token.enricher.impl.en
import com.typesafe.scalalogging.LazyLogging
import org.apache.nlpcraft.*
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnDictionaryImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/impl/en/NCEnDictionaryImpl.scala
similarity index 95%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnDictionaryImpl.scala
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/impl/en/NCEnDictionaryImpl.scala
index 0c48f46..a62159f 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnDictionaryImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/impl/en/NCEnDictionaryImpl.scala
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.internal.nlp.token.enricher.impl
+package org.apache.nlpcraft.nlp.token.enricher.impl.en
import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.util.NCUtils
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnLanguageWordsImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/impl/en/NCEnLanguageWordsImpl.scala
similarity index 92%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnLanguageWordsImpl.scala
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/impl/en/NCEnLanguageWordsImpl.scala
index 7cdcf23..d34d35a 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnLanguageWordsImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/impl/en/NCEnLanguageWordsImpl.scala
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.internal.nlp.token.enricher.impl
+package org.apache.nlpcraft.nlp.token.enricher.impl.en
import org.apache.nlpcraft.*
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnQuotesImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/impl/en/NCEnQuotesImpl.scala
similarity index 86%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnQuotesImpl.scala
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/impl/en/NCEnQuotesImpl.scala
index 17c0048..d18eeff 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnQuotesImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/impl/en/NCEnQuotesImpl.scala
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.internal.nlp.token.enricher.impl
+package org.apache.nlpcraft.nlp.token.enricher.impl.en
import com.typesafe.scalalogging.LazyLogging
import org.apache.nlpcraft.*
@@ -45,8 +45,8 @@ class NCEnQuotesImpl extends NCTokenEnricher with LazyLogging:
// Start and end quote mustn't be same ("a` processed as valid)
if quotes.nonEmpty && quotes.size % 2 == 0 then
val m = toksSeq.zipWithIndex.toMap
- val pairs = quotes.zipWithIndex.drop(1).flatMap {
- (t, idx) => if idx % 2 != 0 then Some(m(t) -> m(quotes(idx -
1))) else None
+ val pairs = quotes.zipWithIndex.drop(1).flatMap { (t, idx) =>
+ Option.when(idx % 2 != 0)(m(t) -> m(quotes(idx - 1)))
}
toksSeq.zipWithIndex.foreach { (tok, idx) =>
tok.put("quoted:en", pairs.exists { case (from, to) => from >
idx && to < idx })
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnSwearWordsImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/impl/en/NCEnSwearWordsImpl.scala
similarity index 78%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnSwearWordsImpl.scala
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/impl/en/NCEnSwearWordsImpl.scala
index 1551e21..c383e5b 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnSwearWordsImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/enricher/impl/en/NCEnSwearWordsImpl.scala
@@ -15,14 +15,12 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.internal.nlp.token.enricher.impl
+package org.apache.nlpcraft.nlp.token.enricher.impl.en
import com.typesafe.scalalogging.LazyLogging
import opennlp.tools.stemmer.PorterStemmer
import org.apache.nlpcraft.*
-import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl.NCEnOpenNlpImpl
import org.apache.nlpcraft.internal.util.NCUtils
-import org.apache.nlpcraft.internal.util.NCUtils.getStream
import java.io.*
@@ -35,18 +33,16 @@ object NCEnSwearWordsImpl:
* @param mdlFile
* @return
*/
- def apply(mdlFile: File): NCEnSwearWordsImpl = new NCEnSwearWordsImpl(
- new BufferedInputStream(new FileInputStream(mdlFile)), mdlFile.getPath
- )
+ def apply(mdlFile: File): NCEnSwearWordsImpl =
+ new NCEnSwearWordsImpl(new BufferedInputStream(new
FileInputStream(mdlFile)), mdlFile.getPath)
/**
*
* @param mdlSrc
* @return
*/
- def apply(mdlSrc: String): NCEnSwearWordsImpl = new NCEnSwearWordsImpl(
- NCUtils.getStream(mdlSrc), mdlSrc
- )
+ def apply(mdlSrc: String): NCEnSwearWordsImpl =
+ new NCEnSwearWordsImpl(NCUtils.getStream(mdlSrc), mdlSrc)
/**
*
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnOpenNlpTokenParser.java
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/NCEnOpenNlpTokenParser.java
similarity index 96%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnOpenNlpTokenParser.java
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/NCEnOpenNlpTokenParser.java
index d0cd2fb..b290516 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnOpenNlpTokenParser.java
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/NCEnOpenNlpTokenParser.java
@@ -15,10 +15,10 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.internal.nlp.token.parser.opennlp;
+package org.apache.nlpcraft.nlp.token.parser.opennlp.en;
import org.apache.nlpcraft.*;
-import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl.NCEnOpenNlpImpl;
+import org.apache.nlpcraft.nlp.token.parser.opennlp.impl.en.NCEnOpenNlpImpl;
import java.io.*;
import java.util.*;
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnOpenNlpImpl.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/en/NCEnOpenNlpImpl.scala
similarity index 93%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnOpenNlpImpl.scala
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/en/NCEnOpenNlpImpl.scala
index 097981f..e68ebff 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnOpenNlpImpl.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/en/NCEnOpenNlpImpl.scala
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl
+package org.apache.nlpcraft.nlp.token.parser.opennlp.impl.en
import opennlp.tools.lemmatizer.*
import opennlp.tools.postag.*
@@ -23,6 +23,7 @@ import opennlp.tools.stemmer.*
import opennlp.tools.tokenize.*
import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.util.NCUtils
+import org.apache.nlpcraft.nlp.token.parser.opennlp.impl.en.*
import java.io.*
import java.util
@@ -142,7 +143,7 @@ class NCEnOpenNlpImpl(
// Valid POS list:
https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
val suspIdxs = lemmas.zip(posTags).zipWithIndex.flatMap {
// "0" is flag that lemma cannot be obtained for some reasons.
- case ((lemma, pos), i) => if lemma == "O" && pos == "NN" then
Some(i) else None
+ case ((lemma, pos), i) => Option.when(lemma == "O" && pos ==
"NN")(i)
}
if suspIdxs.nonEmpty then
@@ -150,13 +151,13 @@ class NCEnOpenNlpImpl(
lemmatize(suspIdxs.map(i => words(i)).toArray,
suspIdxs.map(_ => "NNN").toArray).
zipWithIndex.
flatMap {
- (lemma, i) => if lemma != "0" then Some(suspIdxs(i) ->
lemma) else None
+ (lemma, i) => Option.when(lemma != "0")(suspIdxs(i) ->
lemma)
}.toMap
lemmas = lemmas.zipWithIndex.map {
(lemma, idx) => fixes.getOrElse(idx, lemma)
}
- val res: Seq[NCToken] =
holders.zip(posTags).zip(lemmas).toIndexedSeq.map { case ((h, pos), lemma) =>
+ val res: Seq[NCToken] =
holders.zip(posTags).zip(lemmas).toIndexedSeq.zipWithIndex.map { case (((h,
pos), lemma), idx) =>
new NCPropertyMapAdapter with NCToken:
override def getText: String = h.origin
override def getLemma: String = lemma
@@ -166,6 +167,7 @@ class NCEnOpenNlpImpl(
override def getStartCharIndex: Int = h.start
override def getEndCharIndex: Int = h.end
override def getLength: Int = h.length
+ override def getIndex: Int = idx
}
val stops = swFinder.find(res)
@@ -181,6 +183,7 @@ class NCEnOpenNlpImpl(
override def getStartCharIndex: Int =
tok.getStartCharIndex
override def getEndCharIndex: Int = tok.getEndCharIndex
override def getLength: Int = tok.getLength
+ override def getIndex: Int = tok.getIndex
else
tok
).asJava
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordGenerator.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/en/NCEnStopWordGenerator.scala
similarity index 99%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordGenerator.scala
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/en/NCEnStopWordGenerator.scala
index 959d149..54fb898 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordGenerator.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/en/NCEnStopWordGenerator.scala
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl
+package org.apache.nlpcraft.nlp.token.parser.opennlp.impl.en
import opennlp.tools.stemmer.PorterStemmer
import org.apache.nlpcraft.internal.util.NCUtils
@@ -174,6 +174,7 @@ object NCEnStopWordGenerator:
mkGzip(NOUN_WORDS_FILE, stem(buf.toSeq))
private def stem(s: String): String = s.split("
").map(stemmer.stem).mkString(" ")
+
private def stem(seq: Seq[String]): Seq[String] = seq.map(stem)
private[impl] def mkFirstWords(): Unit =
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/en/NCEnStopWordsFinder.scala
similarity index 97%
rename from
nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
rename to
nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/en/NCEnStopWordsFinder.scala
index 9522168..cf49e7f 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/en/NCEnStopWordsFinder.scala
@@ -15,14 +15,12 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl
+package org.apache.nlpcraft.nlp.token.parser.opennlp.impl.en
import com.typesafe.scalalogging.LazyLogging
import opennlp.tools.stemmer.PorterStemmer
-
-import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl.NCEnStopWordsFinder.*
-import org.apache.nlpcraft.internal.util.NCUtils
import org.apache.nlpcraft.*
+import org.apache.nlpcraft.internal.util.NCUtils
import java.util
import java.util.{List as JList, Set as JSet}
@@ -188,6 +186,8 @@ private[impl] object NCEnStopWordsFinder:
* @param exclStems
*/
private[impl] class NCEnStopWordsFinder(addStems: Set[String], exclStems:
Set[String]) extends LazyLogging:
+ import NCEnStopWordsFinder.*
+
require(addStems != null)
require(exclStems != null)
@@ -352,7 +352,8 @@ private[impl] class NCEnStopWordsFinder(addStems:
Set[String], exclStems: Set[St
mkInstance(any ++ excl.values.flatten, incl, excl)
end mkHolder
def mkHash(form: WordForm): HashHolder = mkHolder(mHash, form,
HashHolder.apply)
- def mkScan(form: WordForm): ScanHolder = mkHolder(mScan, form,
ScanHolder.apply)
+ def mkScan(form: WordForm):
+ ScanHolder = mkHolder(mScan, form, ScanHolder.apply)
isExc -> StopWordHolder(mkHash(STEM), mkHash(LEM), mkHash(ORIG),
mkScan(LEM), mkScan(ORIG))
).toMap
@@ -435,9 +436,8 @@ private[impl] class NCEnStopWordsFinder(addStems:
Set[String], exclStems: Set[St
val stops = mutable.HashSet.empty[NCToken]
- for (p <- toks.zipWithIndex)
- val tok = p._1
- val idx = p._2
+ for (tok <- toks)
+ val idx = tok.getIndex
val pos = tok.getPos
val lemma = tok.getLemma
val stem = tok.getStem
@@ -543,7 +543,7 @@ private[impl] class NCEnStopWordsFinder(addStems:
Set[String], exclStems: Set[St
if quotes.nonEmpty then
val m = toks.zipWithIndex.toMap
val pairs = quotes.zipWithIndex.drop(1).flatMap {
- (t, idx) => if idx % 2 != 0 then Some(m(t) -> m(quotes(idx -
1))) else None
+ (t, idx) => Option.when(idx % 2 != 0)(m(t) -> m(quotes(idx -
1)))
}
stops --= stops.filter(t => pairs.exists { (from, to) =>
val idx = m(t)
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestUtils.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestUtils.scala
deleted file mode 100644
index 06accfd..0000000
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestUtils.scala
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nlpcraft.internal.nlp.util
-
-import org.apache.nlpcraft.internal.ascii.NCAsciiTable
-import org.apache.nlpcraft.*
-
-/**
- *
- */
-object NCTestUtils:
- /**
- *
- * @param toks
- * @param props
- */
- def printTokens(toks: Seq[NCToken], props: String*): Unit =
- val tbl = new NCAsciiTable()
-
- if props.isEmpty
- then tbl #= ("Text", "POS", "Stem", "Lemma", "Start", "End",
"Length", "Stopword")
- else tbl #= ("Text", "POS", "Stem", "Lemma", "Start", "End",
"Length", "Stopword", "Properties")
-
- toks.foreach(t =>
- if props.isEmpty then
- tbl += (
- t.getText,
- t.getPos,
- t.getStem,
- t.getLemma,
- t.getStartCharIndex,
- t.getEndCharIndex,
- t.getLength,
- t.isStopWord
- )
- else
- tbl += (
- t.getText,
- t.getPos,
- t.getStem,
- t.getLemma,
- t.getStartCharIndex,
- t.getEndCharIndex,
- t.getLength,
- t.isStopWord,
- props.map(p => s"$p=${t.get[Any](p)}").mkString("{", ", ",
"}")
- )
- )
-
- println(s"Request: ${toks.map(_.getText).mkString(" ")}")
- println(tbl.toString)
-
- /**
- *
- * @param make
- * @tparam T
- * @return
- */
- def makeAndStart[T <: NCLifecycle](make: => T): T =
- def now() = System.currentTimeMillis()
-
- val start = now()
- val t = make
- val started = now()
-
- t.start(null) // TODO: fix it.
- println(s"'${t.getClass.getSimpleName}' created with time=${started -
start} ms and started=${now() - started} ms.")
- t
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/benchmark/NCBenchmarkAdapter.java
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/benchmark/NCBenchmarkAdapter.java
similarity index 91%
rename from
nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/benchmark/NCBenchmarkAdapter.java
rename to
nlpcraft/src/test/java/org/apache/nlpcraft/nlp/benchmark/NCBenchmarkAdapter.java
index 3bf2f63..f5096e5 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/benchmark/NCBenchmarkAdapter.java
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/benchmark/NCBenchmarkAdapter.java
@@ -15,10 +15,10 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.internal.nlp.benchmark;
+package org.apache.nlpcraft.nlp.benchmark;
import org.apache.nlpcraft.NCRequest;
-import org.apache.nlpcraft.internal.nlp.util.NCTestRequest;
+import org.apache.nlpcraft.nlp.util.NCTestRequest;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.openjdk.jmh.annotations.*;
@@ -40,7 +40,7 @@ import java.util.concurrent.TimeUnit;
public class NCBenchmarkAdapter {
@State(Scope.Thread)
public static class NCBenchmarkAdapterState {
- public NCRequest request = NCTestRequest.apply(
+ public final NCRequest request = NCTestRequest.apply(
"I am developing an integrated Benchmarking into an application, I
want to use JMH as my framework."
);
}
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/benchmark/token/parser/opennlp/NCEnOpenNlpTokenParserBenchmark.java
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/benchmark/token/parser/opennlp/NCEnOpenNlpTokenParserBenchmark.java
similarity index 74%
rename from
nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/benchmark/token/parser/opennlp/NCEnOpenNlpTokenParserBenchmark.java
rename to
nlpcraft/src/test/java/org/apache/nlpcraft/nlp/benchmark/token/parser/opennlp/NCEnOpenNlpTokenParserBenchmark.java
index 193dd00..4a7764e 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/benchmark/token/parser/opennlp/NCEnOpenNlpTokenParserBenchmark.java
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/benchmark/token/parser/opennlp/NCEnOpenNlpTokenParserBenchmark.java
@@ -15,13 +15,13 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.internal.nlp.benchmark.token.parser.opennlp;
+package org.apache.nlpcraft.nlp.benchmark.token.parser.opennlp;
-import org.apache.nlpcraft.internal.nlp.benchmark.NCBenchmarkAdapter;
-import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.NCEnOpenNlpTokenParser;
+import org.apache.nlpcraft.nlp.benchmark.NCBenchmarkAdapter;
+import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser;
+import org.apache.nlpcraft.nlp.util.NCTestUtils;
import org.junit.jupiter.api.Disabled;
-import org.openjdk.jmh.annotations.Benchmark;
-import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.Blackhole;
@@ -29,7 +29,7 @@ import org.openjdk.jmh.infra.Blackhole;
*
*/
@Disabled
-public class NCEnOpenNlpTokenParserBenchmark extends NCBenchmarkAdapter {
+public class NCEnOpenNlpTokenParserBenchmark extends NCBenchmarkAdapter {
private NCEnOpenNlpTokenParser parser;
@Setup
@@ -52,11 +52,7 @@ public class NCEnOpenNlpTokenParserBenchmark extends
NCBenchmarkAdapter {
* @return
*/
private static NCEnOpenNlpTokenParser prepareParser() {
- NCEnOpenNlpTokenParser p = new NCEnOpenNlpTokenParser(
- "opennlp/en-token.bin",
- "opennlp/en-pos-maxent.bin",
- "opennlp/en-lemmatizer.dict"
- );
+ NCEnOpenNlpTokenParser p = NCTestUtils.mkEnParser();
p.start(null); // TODO: fix it.
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/opennlp/NCOpenNlpEntityParserSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/opennlp/NCOpenNlpEntityParserSpec.scala
new file mode 100644
index 0000000..8e1e78b
--- /dev/null
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/entity/parser/opennlp/NCOpenNlpEntityParserSpec.scala
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.nlp.entity.parser.opennlp
+
+import org.apache.nlpcraft.nlp.entity.parser.opennlp.NCOpenNlpEntityParser
+import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
+import org.apache.nlpcraft.nlp.util.*
+import org.apache.nlpcraft.*
+import org.apache.nlpcraft.internal.util.NCUtils
+import org.junit.jupiter.api.*
+
+import java.util
+import scala.collection.mutable
+import scala.concurrent.ExecutionContext
+import scala.jdk.CollectionConverters.*
+import scala.jdk.OptionConverters.RichOptional
+
+/**
+ *
+ */
+class NCOpenNlpEntityParserSpec:
+ private val eParsers =
scala.collection.mutable.ArrayBuffer.empty[NCOpenNlpEntityParser]
+ private var tParser: NCEnOpenNlpTokenParser = _
+
+ @BeforeEach
+ def start(): Unit =
+ tParser = NCTestUtils.makeAndStart(NCTestUtils.mkEnParser)
+
+ def add(res: String): Unit =
+ eParsers += NCTestUtils.makeAndStart(new
NCOpenNlpEntityParser(s"opennlp/$res"))
+
+ NCUtils.execPar(
+ // en-ner-time.bin is skipped. I can't find any working example.
+ () => add("en-ner-location.bin"),
+ () => add("en-ner-money.bin"),
+ () => add("en-ner-person.bin"),
+ () => add("en-ner-organization.bin"),
+ () => add("en-ner-date.bin"),
+ () => add("en-ner-percentage.bin")
+ )(ExecutionContext.Implicits.global)
+
+ private def checkSingleEntity(txt: String, expected: String): Unit =
+ val req = NCTestRequest(txt)
+ val toks = tParser.parse(req, null)
+ val resSeq = eParsers.map(_.parse(req, null,
toks).asScala.toSeq).filter(_.size == 1)
+
+ require(resSeq.size == 1)
+
+ val res = resSeq.head
+ NCTestUtils.printEntities(txt, res)
+
require(res.exists(_.getOpt(s"opennlp:$expected:probability").isPresent))
+
+ @Test
+ def test(): Unit =
+ checkSingleEntity("today", "date")
+ checkSingleEntity("Moscow", "location")
+ checkSingleEntity("10 is 5 % from 200", "percentage")
+ checkSingleEntity("Tim Cook", "person")
+ checkSingleEntity("Microsoft", "organization")
+ checkSingleEntity("Current price is higher for 20 USA dollars",
"money")
\ No newline at end of file
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnBracketsTokenEnricherSpec.scala
similarity index 71%
rename from
nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala
rename to
nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnBracketsTokenEnricherSpec.scala
index bcfbed5..6981788 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnBracketsTokenEnricherSpec.scala
@@ -15,12 +15,13 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.internal.nlp.token.enricher
+package org.apache.nlpcraft.nlp.token.enricher.en
-import org.apache.nlpcraft.NCToken
-import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.NCEnOpenNlpTokenParser
-import org.apache.nlpcraft.internal.nlp.util.{NCTestRequest, NCTestToken,
NCTestUtils}
-import org.junit.jupiter.api.{BeforeEach, Test}
+import org.apache.nlpcraft.*
+import org.apache.nlpcraft.nlp.token.enricher.en.NCEnBracketsTokenEnricher
+import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
+import org.apache.nlpcraft.nlp.util.*
+import org.junit.jupiter.api.*
import scala.jdk.CollectionConverters.*
@@ -33,13 +34,7 @@ class NCEnBracketsTokenEnricherSpec:
@BeforeEach
def start(): Unit = enricher =
- parser = NCTestUtils.makeAndStart(
- new NCEnOpenNlpTokenParser(
- "opennlp/en-token.bin",
- "opennlp/en-pos-maxent.bin",
- "opennlp/en-lemmatizer.dict"
- )
- )
+ parser = NCTestUtils.makeAndStart(NCTestUtils.mkEnParser)
NCTestUtils.makeAndStart(new NCEnBracketsTokenEnricher())
/**
@@ -51,10 +46,10 @@ class NCEnBracketsTokenEnricherSpec:
val toks = parser.parse(NCTestRequest(txt), null)
enricher.enrich(NCTestRequest(txt), null, toks)
val seq = toks.asScala.toSeq
- NCTestUtils.printTokens(seq, "brackets:en")
- seq.zipWithIndex.foreach { case (tok, idx) =>
- require(!(tok.get[Boolean]("brackets:en") ^
brackets.contains(idx)))
- }
+ NCTestUtils.printTokens(seq)
+ seq.foreach (tok =>
+ require(!(tok.get[Boolean]("brackets:en") ^
brackets.contains(tok.getIndex)))
+ )
@Test
def test(): Unit =
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricherSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnDictionaryTokenEnricherSpec.scala
similarity index 71%
rename from
nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricherSpec.scala
rename to
nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnDictionaryTokenEnricherSpec.scala
index e4d7335..fe4703e 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricherSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnDictionaryTokenEnricherSpec.scala
@@ -15,11 +15,12 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.internal.nlp.token.enricher
+package org.apache.nlpcraft.nlp.token.enricher.en
-import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.NCEnOpenNlpTokenParser
-import org.apache.nlpcraft.internal.nlp.util.{NCTestToken, NCTestUtils}
-import org.junit.jupiter.api.{BeforeEach, Test}
+import org.apache.nlpcraft.nlp.token.enricher.en.NCEnDictionaryTokenEnricher
+import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
+import org.apache.nlpcraft.nlp.util.*
+import org.junit.jupiter.api.*
import scala.jdk.CollectionConverters.SeqHasAsJava
@@ -35,8 +36,8 @@ class NCEnDictionaryTokenEnricherSpec:
@Test
def test(): Unit =
val toks = Seq(
- NCTestToken(txt = "milk", lemma = "milk"), // Valid english word.
- NCTestToken(txt = "XYZ", lemma = "XYZ") // Invalid english word.
+ NCTestToken(txt = "milk", lemma = "milk", idx = 0), // Valid
english word.
+ NCTestToken(txt = "XYZ", lemma = "XYZ", idx = 1) // Invalid
english word.
)
require(toks.head.getOpt[Boolean]("dict:en").isEmpty)
@@ -44,7 +45,7 @@ class NCEnDictionaryTokenEnricherSpec:
enricher.enrich(null, null, toks.asJava)
- NCTestUtils.printTokens(toks, "dict:en")
+ NCTestUtils.printTokens(toks)
require(toks.head.get[Boolean]("dict:en"))
require(!toks.last.get[Boolean]("dict:en"))
\ No newline at end of file
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnLanguageTokenEnricherSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnLanguageTokenEnricherSpec.scala
similarity index 74%
rename from
nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnLanguageTokenEnricherSpec.scala
rename to
nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnLanguageTokenEnricherSpec.scala
index 52e3156..6a5eb98 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnLanguageTokenEnricherSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnLanguageTokenEnricherSpec.scala
@@ -15,11 +15,12 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.internal.nlp.token.enricher
+package org.apache.nlpcraft.nlp.token.enricher.en
-import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.NCEnOpenNlpTokenParser
-import org.apache.nlpcraft.internal.nlp.util.{NCTestToken, NCTestUtils}
-import org.junit.jupiter.api.{BeforeEach, Test}
+import org.apache.nlpcraft.nlp.token.enricher.en.NCEnLanguageTokenEnricher
+import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
+import org.apache.nlpcraft.nlp.util.*
+import org.junit.jupiter.api.*
import scala.jdk.CollectionConverters.SeqHasAsJava
@@ -35,8 +36,8 @@ class NCEnLanguageTokenEnricherSpec:
@Test
def test(): Unit =
val toks = Seq(
- NCTestToken(txt = "english", stem = "english"), // English word.
- NCTestToken(txt = "русский", stem = "русский") // Not english word.
+ NCTestToken(txt = "english", stem = "english", idx = 0), //
English word.
+ NCTestToken(txt = "русский", stem = "русский", idx = 1) // Not
english word.
)
require(toks.head.getOpt[Boolean]("lang:en").isEmpty)
@@ -44,7 +45,7 @@ class NCEnLanguageTokenEnricherSpec:
enricher.enrich(null, null, toks.asJava)
- NCTestUtils.printTokens(toks, "lang:en")
+ NCTestUtils.printTokens(toks)
require(toks.head.get[Boolean]("lang:en"))
require(!toks.last.get[Boolean]("lang:en"))
\ No newline at end of file
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnQuotesTokenEnricherSpec.scala
similarity index 74%
rename from
nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala
rename to
nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnQuotesTokenEnricherSpec.scala
index df65c29..98a9837 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnQuotesTokenEnricherSpec.scala
@@ -15,11 +15,14 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.internal.nlp.token.enricher
+package org.apache.nlpcraft.nlp.token.enricher.en
import org.apache.nlpcraft.NCToken
-import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.NCEnOpenNlpTokenParser
-import org.apache.nlpcraft.internal.nlp.util.{NCTestRequest, NCTestToken,
NCTestUtils}
+import org.apache.nlpcraft.nlp.token.enricher.en.NCEnQuotesTokenEnricher
+import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
+import org.apache.nlpcraft.nlp.util.NCTestUtils
+import org.apache.nlpcraft.nlp.util.{NCTestRequest, NCTestUtils}
+import org.apache.nlpcraft.nlp.util.NCTestRequest
import org.junit.jupiter.api.{BeforeEach, Test}
import scala.jdk.CollectionConverters.*
@@ -33,13 +36,7 @@ class NCEnQuotesTokenEnricherSpec:
@BeforeEach
def start(): Unit =
- parser = NCTestUtils.makeAndStart(
- new NCEnOpenNlpTokenParser(
- "opennlp/en-token.bin",
- "opennlp/en-pos-maxent.bin",
- "opennlp/en-lemmatizer.dict"
- )
- )
+ parser = NCTestUtils.makeAndStart(NCTestUtils.mkEnParser)
enricher = NCTestUtils.makeAndStart(new NCEnQuotesTokenEnricher)
/**
@@ -51,10 +48,10 @@ class NCEnQuotesTokenEnricherSpec:
val toks = parser.parse(NCTestRequest(txt), null)
val toksSeq = toks.asScala.toSeq
enricher.enrich(NCTestRequest(txt), null, toks)
- NCTestUtils.printTokens(toksSeq, "quoted:en")
- toksSeq.zipWithIndex.foreach { case (tok, idx) =>
- require(!(tok.get[Boolean]("quoted:en") ^ quotes.contains(idx)))
- }
+ NCTestUtils.printTokens(toksSeq)
+ toksSeq.foreach (tok =>
+ require(!(tok.get[Boolean]("quoted:en") ^
quotes.contains(tok.getIndex)))
+ )
@Test
def test(): Unit =
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricherSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnSwearWordsTokenEnricherSpec.scala
similarity index 72%
rename from
nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricherSpec.scala
rename to
nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnSwearWordsTokenEnricherSpec.scala
index a913070..df9f47b 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricherSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/enricher/en/NCEnSwearWordsTokenEnricherSpec.scala
@@ -15,11 +15,12 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.internal.nlp.token.enricher
+package org.apache.nlpcraft.nlp.token.enricher.en
-import
org.apache.nlpcraft.internal.nlp.token.parser.opennlp.NCEnOpenNlpTokenParser
-import org.apache.nlpcraft.internal.nlp.util.{NCTestToken, NCTestUtils}
-import org.junit.jupiter.api.{BeforeEach, Test}
+import org.apache.nlpcraft.nlp.token.enricher.en.NCEnSwearWordsTokenEnricher
+import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
+import org.apache.nlpcraft.nlp.util.*
+import org.junit.jupiter.api.*
import scala.jdk.CollectionConverters.SeqHasAsJava
@@ -35,8 +36,8 @@ class NCEnSwearWordsTokenEnricherSpec:
@Test
def test(): Unit =
val toks = Seq(
- NCTestToken(txt = "english", stem = "english"), // English word.
- NCTestToken(txt = "ass", stem = "ass") // Swear english word.
+ NCTestToken(txt = "english", stem = "english", idx = 0), //
English word.
+ NCTestToken(txt = "ass", stem = "ass", idx = 1) // Swear english
word.
)
require(toks.head.getOpt[Boolean]("swear:en").isEmpty)
@@ -44,7 +45,7 @@ class NCEnSwearWordsTokenEnricherSpec:
enricher.enrich(null, null, toks.asJava)
- NCTestUtils.printTokens(toks, "swear:en")
+ NCTestUtils.printTokens(toks)
require(!toks.head.get[Boolean]("swear:en"))
require(toks.last.get[Boolean]("swear:en"))
\ No newline at end of file
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnOpenNlpTokenParserSpec.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/parser/opennlp/en/NCEnOpenNlpTokenParserSpec.scala
similarity index 86%
rename from
nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnOpenNlpTokenParserSpec.scala
rename to
nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/parser/opennlp/en/NCEnOpenNlpTokenParserSpec.scala
index fe4b693..5bfc288 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCEnOpenNlpTokenParserSpec.scala
+++
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/token/parser/opennlp/en/NCEnOpenNlpTokenParserSpec.scala
@@ -15,12 +15,12 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.internal.nlp.token.parser.opennlp
+package org.apache.nlpcraft.nlp.token.parser.opennlp.en
-import org.apache.nlpcraft.internal.ascii.NCAsciiTable
-import org.apache.nlpcraft.internal.nlp
-import org.apache.nlpcraft.internal.nlp.util.*
import org.apache.nlpcraft.*
+import org.apache.nlpcraft.internal.ascii.NCAsciiTable
+import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
+import org.apache.nlpcraft.nlp.util.*
import org.junit.jupiter.api.*
import java.util
@@ -33,14 +33,7 @@ class NCEnOpenNlpTokenParserSpec:
private var parser: NCEnOpenNlpTokenParser = _
@BeforeEach
- def start(): Unit =
- parser = NCTestUtils.makeAndStart(
- new NCEnOpenNlpTokenParser(
- "opennlp/en-token.bin",
- "opennlp/en-pos-maxent.bin",
- "opennlp/en-lemmatizer.dict"
- )
- )
+ def start(): Unit = parser =
NCTestUtils.makeAndStart(NCTestUtils.mkEnParser)
private def test(txt: String, validate: Seq[NCToken] => _): Unit =
val toks = parser.parse(nlp.util.NCTestRequest(txt),
null).asScala.toSeq
@@ -94,5 +87,5 @@ class NCEnOpenNlpTokenParserSpec:
test(
// Nested brackets.
"< < [ A ] > >",
- toks => require(!toks.find(_.getText == "a").get.isStopWord)
+ toks => require(!toks.find(_.getText == "A").get.isStopWord)
)
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestRequest.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestRequest.scala
similarity index 97%
rename from
nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestRequest.scala
rename to
nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestRequest.scala
index 8665bf3..ab07b91 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestRequest.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestRequest.scala
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.internal.nlp.util
+package org.apache.nlpcraft.nlp.util
import org.apache.nlpcraft.NCRequest
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestToken.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestToken.scala
similarity index 91%
rename from
nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestToken.scala
rename to nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestToken.scala
index 86d3860..d027e9c 100644
---
a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestToken.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestToken.scala
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.internal.nlp.util
+package org.apache.nlpcraft.nlp.util
import org.apache.nlpcraft.*
@@ -32,6 +32,7 @@ import org.apache.nlpcraft.*
*/
case class NCTestToken(
txt: String,
+ idx: Int,
lemma: String = null,
stem: String = null,
pos: String = null,
@@ -46,4 +47,5 @@ case class NCTestToken(
override def isStopWord: Boolean = isStop
override def getStartCharIndex: Int = start
override def getEndCharIndex: Int = end
- override def getLength: Int = end - start + 1
\ No newline at end of file
+ override def getLength: Int = end - start + 1
+ override def getIndex: Int = idx
\ No newline at end of file
diff --git
a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
new file mode 100644
index 0000000..933d889
--- /dev/null
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.nlp.util
+
+import org.apache.nlpcraft.*
+import org.apache.nlpcraft.internal.ascii.NCAsciiTable
+import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
+
+import scala.jdk.CollectionConverters.*
+
+/**
+ *
+ */
+object NCTestUtils:
+ /**
+ * @param toks
+ */
+ def printTokens(toks: Seq[NCToken]): Unit =
+ val tbl = NCAsciiTable("Text", "Index", "POS", "Stem", "Lemma",
"Start", "End", "Length", "Stopword", "Properties")
+
+ for (t <- toks)
+ tbl += (
+ t.getText,
+ t.getIndex,
+ t.getPos,
+ t.getStem,
+ t.getLemma,
+ t.getStartCharIndex,
+ t.getEndCharIndex,
+ t.getLength,
+ t.isStopWord,
+ t.keysSet().asScala.map(p =>
s"$p=${t.get[Any](p)}").mkString("[", ", ", "]")
+ )
+
+ tbl.print(s"Request: ${toks.map(_.getText).mkString(" ")}")
+
+ /**
+ *
+ * @param req
+ * @param ents
+ */
+ def printEntities(req: String, ents: Seq[NCEntity]): Unit =
+ val tbl = NCAsciiTable("EntityId", "Tokens", "Properties")
+
+ for (e <- ents)
+ tbl += (
+ e.getId,
+ e.getTokens.asScala.map(_.getText).mkString("|"),
+ e.keysSet().asScala.map(p =>
s"$p=${e.get[Any](p)}").mkString("{", ", ", "}")
+ )
+
+ tbl.print(s"Request: $req")
+
+ /**
+ *
+ * @param make
+ * @tparam T
+ * @return
+ */
+ def makeAndStart[T <: NCLifecycle](make: => T): T =
+ def now() = System.currentTimeMillis()
+
+ val start = now()
+ val t = make
+ val started = now()
+
+ t.start(null) // TODO: fix it.
+ println(s"'${t.getClass.getSimpleName}' created in ${started -
start}ms and started in ${now() - started}ms.")
+ t
+
+ /**
+ *
+ * @return
+ */
+ def mkEnParser: NCEnOpenNlpTokenParser = new NCEnOpenNlpTokenParser(
+ "opennlp/en-token.bin",
+ "opennlp/en-pos-maxent.bin",
+ "opennlp/en-lemmatizer.dict"
+ )
\ No newline at end of file
diff --git a/nlpcraft/src/test/resources/opennlp/en-ner-date.bin
b/nlpcraft/src/test/resources/opennlp/en-ner-date.bin
new file mode 100644
index 0000000..a69923a
Binary files /dev/null and
b/nlpcraft/src/test/resources/opennlp/en-ner-date.bin differ
diff --git a/nlpcraft/src/test/resources/opennlp/en-ner-location.bin
b/nlpcraft/src/test/resources/opennlp/en-ner-location.bin
new file mode 100644
index 0000000..f3788bc
Binary files /dev/null and
b/nlpcraft/src/test/resources/opennlp/en-ner-location.bin differ
diff --git a/nlpcraft/src/test/resources/opennlp/en-ner-money.bin
b/nlpcraft/src/test/resources/opennlp/en-ner-money.bin
new file mode 100644
index 0000000..2431e0f
Binary files /dev/null and
b/nlpcraft/src/test/resources/opennlp/en-ner-money.bin differ
diff --git a/nlpcraft/src/test/resources/opennlp/en-ner-organization.bin
b/nlpcraft/src/test/resources/opennlp/en-ner-organization.bin
new file mode 100644
index 0000000..1fb6d9f
Binary files /dev/null and
b/nlpcraft/src/test/resources/opennlp/en-ner-organization.bin differ
diff --git a/nlpcraft/src/test/resources/opennlp/en-ner-percentage.bin
b/nlpcraft/src/test/resources/opennlp/en-ner-percentage.bin
new file mode 100644
index 0000000..98cee1a
Binary files /dev/null and
b/nlpcraft/src/test/resources/opennlp/en-ner-percentage.bin differ
diff --git a/nlpcraft/src/test/resources/opennlp/en-ner-person.bin
b/nlpcraft/src/test/resources/opennlp/en-ner-person.bin
new file mode 100644
index 0000000..2f68318
Binary files /dev/null and
b/nlpcraft/src/test/resources/opennlp/en-ner-person.bin differ