This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-469
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-469 by this push:
new 0b9ffe6 WIP.
0b9ffe6 is described below
commit 0b9ffe63d1c438595a3e37983ce19469ce47916f
Author: Sergey Kamov <[email protected]>
AuthorDate: Tue Dec 21 12:10:52 2021 +0300
WIP.
---
.../opennlp/impl/NCEnStopWordsFinderImpl.scala | 7 ++
.../token/parser/opennlp/impl/NCOpenNlpImpl.scala | 4 ++
.../parser/opennlp/NCOpenNlpTokenParserSpec.scala | 80 +++++++++++++++++-----
3 files changed, 73 insertions(+), 18 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinderImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinderImpl.scala
index 4bb9e85..ff1cfde 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinderImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinderImpl.scala
@@ -181,6 +181,9 @@ object NCEnStopWordsFinderImpl:
import NCEnStopWordsFinderImpl.*
class NCEnStopWordsFinderImpl(addStopWords: JSet[String], exclStopWords: JSet[String]) extends NCStopWordsFinder with LazyLogging:
+ require(addStopWords != null)
+ require(exclStopWords != null)
+
private val addStopWordsStems = addStopWords.asScala
private val exclStopWordsStems = exclStopWords.asScala
@@ -444,6 +447,10 @@ class NCEnStopWordsFinderImpl(addStopWords: JSet[String], exclStopWords: JSet[St
* @param toks
*/
override def find(toks: JList[NCToken]): JList[NCToken] =
+ // TODO: check ? stop clear?
+ if (percents == null)
+ throw new IllegalStateException(s"${this.getClass.getName} is not started.")
+
import scala.jdk.CollectionConverters.*
val ns = toks.asScala
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala
index 058b95d..2eb8124 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala
@@ -90,6 +90,10 @@ class NCOpenNlpImpl(tokMdlIn: InputStream, posMdlIn: InputStream, lemmaDicIn: In
* @return
*/
override def parse(req: NCRequest): JList[NCToken] =
+ // TODO: check ? stop clear?
+ if (tokenizer == null)
+ throw new IllegalStateException(s"${this.getClass.getName} is not started.")
+
// OpenNLP classes are not thread-safe.
this.synchronized {
val sen = req.getNormalizedText
diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserSpec.scala b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserSpec.scala
index 79665e7..b420805 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserSpec.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserSpec.scala
@@ -17,15 +17,19 @@
package org.apache.nlpcraft.internal.nlp.token.parser.opennlp
-import org.apache.nlpcraft.NCRequest
-import org.junit.jupiter.api.Test
-import scala.jdk.CollectionConverters.ListHasAsScala
+import com.google.gson.GsonBuilder
+import org.apache.nlpcraft.{NCRequest, NCToken}
+import org.junit.jupiter.api.{BeforeEach, Test}
+
+import scala.jdk.CollectionConverters.*
import java.util
-class NCOpenNlpTokenParserSpec {
- @Test
- def test(): Unit =
- val parser =
+class NCOpenNlpTokenParserSpec:
+ private var parser: NCOpenNlpTokenParser = _
+
+ @BeforeEach
+ def start(): Unit = {
+ parser =
new NCOpenNlpTokenParser(
"opennlp/en-token.bin",
"opennlp/en-pos-maxent.bin",
@@ -34,13 +38,31 @@ class NCOpenNlpTokenParserSpec {
)
parser.start()
+ }
+ def pprint(obj: Any, depth: Int = 0, paramName: Option[String] = None): Unit =
+ val indent = " " * depth
+ val prettyName = paramName.fold("")(x => s"$x: ")
+ val ptype = obj match { case _: Iterable[Any] => "" case obj: Product => obj.productPrefix case _ => obj.toString }
+
+ println(s"$indent$prettyName$ptype")
+
+ obj match
+ case seq: Iterable[Any] =>
+ seq.foreach(pprint(_, depth + 1))
+ case obj: Product =>
+ obj.productIterator.zip(obj.productElementNames).foreach { case (subObj, paramName) =>
+ pprint(subObj, depth + 1, Some(paramName))
+ }
+ case _ =>
+
+ private def request(txt: String): Seq[NCToken] =
val toks = parser.parse(
new NCRequest:
override def getUserId: String = null
override def getRequestId: String = null
- override def getNormalizedText: String = getOriginalText.toLowerCase
- override def getOriginalText: String = "Test requests!"
+ override def getNormalizedText: String = txt.toLowerCase
+ override def getOriginalText: String = txt
override def getReceiveTimestamp: Long = 0
override def getUserAgent: String = null
override def getRequestData: util.Map[String, AnyRef] = null
@@ -49,16 +71,38 @@ class NCOpenNlpTokenParserSpec {
assert(toks != null)
assert(!toks.isEmpty)
- toks.asScala.foreach(t =>
+ val res = toks.asScala.toSeq
+
+ println(s"Request: $txt")
+
+ res.foreach(t =>
+ pprint(t)
println(
s"Text: ${t.getOriginalText}" +
- s", normalized: ${t.getNormalizedText}" +
- s", pos: ${t.getPos}" +
- s", stem: ${t.getStem}" +
- s", start: ${t.getStartCharIndex}" +
- s", end: ${t.getEndCharIndex}" +
- s", length: ${t.getLength}" +
- s", isStop: ${t.isStopWord}"
+ s", normalized: ${t.getNormalizedText}" +
+ s", pos: ${t.getPos}" +
+ s", stem: ${t.getStem}" +
+ s", start: ${t.getStartCharIndex}" +
+ s", end: ${t.getEndCharIndex}" +
+ s", length: ${t.getLength}" +
+ s", isStop: ${t.isStopWord}"
)
)
- }
+
+ println
+
+ res
+
+ @Test
+ def test(): Unit =
+ var toks = request("Test requests!")
+
+ require(toks.length == 3)
+ require(!toks.head.isStopWord)
+ require(toks.last.isStopWord)
+
+ toks = request("Test requests !")
+
+ require(toks.length == 3)
+ require(!toks.head.isStopWord)
+ require(toks.last.isStopWord)
\ No newline at end of file