This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-471
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-471 by this push:
new 2b497ae WIP.
2b497ae is described below
commit 2b497aea893c84cb6e2411abbd4edd2282908171
Author: Sergey Kamov <[email protected]>
AuthorDate: Tue Dec 28 21:01:25 2021 +0300
WIP.
---
nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntity.java | 14 +++++++-------
nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java | 6 ++++++
.../parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala | 12 +++++-------
.../nlp/token/parser/opennlp/impl/NCEnOpenNlpImpl.scala | 4 +++-
.../token/parser/opennlp/impl/NCEnStopWordsFinder.scala | 5 ++---
.../nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala | 6 +++---
.../token/enricher/NCEnDictionaryTokenEnricherSpec.scala | 4 ++--
.../nlp/token/enricher/NCEnLanguageTokenEnricherSpec.scala | 4 ++--
.../nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala | 6 +++---
.../token/enricher/NCEnSwearWordsTokenEnricherSpec.scala | 4 ++--
.../apache/nlpcraft/internal/nlp/util/NCTestToken.scala | 4 +++-
11 files changed, 38 insertions(+), 31 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntity.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntity.java
index f8e0c8e..e9f3ef8 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntity.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntity.java
@@ -17,6 +17,8 @@
package org.apache.nlpcraft;
+import org.apache.nlpcraft.internal.util.NCUtils;
+
import java.util.List;
/**
@@ -43,12 +45,6 @@ public interface NCEntity extends NCPropertyMap {
String getId();
/**
- *
- * @return Index of this entity in the sentence.
- */
- int getIndex();
-
- /**
* A shortcut method that gets internal globally unique system ID of the
entity.
* <p>
* This method is equivalent to:
@@ -56,7 +52,11 @@ public interface NCEntity extends NCPropertyMap {
* return meta("nlpcraft:nlp:unid");
* </pre>
*
+ * TODO: default.
+ *
* @return Internal globally unique system ID of the entity.
*/
- String getGuid();
+ default String getGuid() {
+ return NCUtils.genUUID().toString();
+ }
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
index dfef5a0..b34b5f7 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
@@ -74,4 +74,10 @@ public interface NCToken extends NCPropertyMap {
* @return
*/
int getLength();
+
+ /**
+ *
+ * @return
+ */
+ int getIndex();
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/entity/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/entity/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala
index 0edfc2b..588923a 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/entity/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/entity/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala
@@ -61,12 +61,12 @@ class NCOpenNlpEntityParserImpl(is: InputStream, res: String) extends NCEntityPa
}
override def parse(req: NCRequest, cfg: NCModelConfig, toks:
JList[NCToken]): JList[NCEntity] =
- val toksIdxs = toks.asScala.zipWithIndex
+ val toksSeq = toks.asScala
- find(toksIdxs.map { case (t, _) => t.getOriginalText
}.toArray).flatMap(h =>
+ find(toksSeq.map(_.getOriginalText).toArray).flatMap(h =>
def calcIndex(getHolderIndex: Holder => Int): Int =
- toksIdxs.find { case (_, idx) => idx == getHolderIndex(h) }
match
- case Some((_, idx)) => idx
+ toksSeq.find(_.getIndex == getHolderIndex(h)) match
+ case Some(t) => t.getIndex
case None => -1
val i1 = calcIndex(_.start)
@@ -77,11 +77,9 @@ class NCOpenNlpEntityParserImpl(is: InputStream, res: String) extends NCEntityPa
put(s"opennlp:${h.name}:probability", h.probability)
override def getTokens: JList[NCToken] =
- toksIdxs.flatMap { case (t, idx) => Option.when(idx >=
i1 && idx <= i2)(t) }.asJava
+ toksSeq.flatMap(t => Option.when(t.getIndex >= i1 &&
t.getIndex <= i2)(t)).asJava
override def getRequestId: String = req.getRequestId
override def getId: String = s"opennlp:${h.name}"
- override def getIndex: Int = 0 // TODO:
- override def getGuid: String = NCUtils.genUUID().toString
}
)
).toSeq.asJava
\ No newline at end of file
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnOpenNlpImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnOpenNlpImpl.scala
index 9e4c712..f12936c 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnOpenNlpImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnOpenNlpImpl.scala
@@ -156,7 +156,7 @@ class NCEnOpenNlpImpl(
(lemma, idx) => fixes.getOrElse(idx, lemma)
}
- val res: Seq[NCToken] =
holders.zip(posTags).zip(lemmas).toIndexedSeq.map { case ((h, pos), lemma) =>
+ val res: Seq[NCToken] =
holders.zip(posTags).zip(lemmas).toIndexedSeq.zipWithIndex.map { case (((h,
pos), lemma), idx) =>
new NCPropertyMapAdapter with NCToken:
override def getOriginalText: String = h.origin
override def getNormalizedText: String = h.normalized
@@ -167,6 +167,7 @@ class NCEnOpenNlpImpl(
override def getStartCharIndex: Int = h.start
override def getEndCharIndex: Int = h.end
override def getLength: Int = h.length
+ override def getIndex: Int = idx
}
val stops = swFinder.find(res)
@@ -183,6 +184,7 @@ class NCEnOpenNlpImpl(
override def getStartCharIndex: Int =
tok.getStartCharIndex
override def getEndCharIndex: Int = tok.getEndCharIndex
override def getLength: Int = tok.getLength
+ override def getIndex: Int = tok.getIndex
else
tok
).asJava
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
index 278f560..b6749f4 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
@@ -435,9 +435,8 @@ private[impl] class NCEnStopWordsFinder(addStems: Set[String], exclStems: Set[St
val stops = mutable.HashSet.empty[NCToken]
- for (p <- toks.zipWithIndex)
- val tok = p._1
- val idx = p._2
+ for (tok <- toks)
+ val idx = tok.getIndex
val pos = tok.getPos
val lemma = tok.getLemma
val stem = tok.getStem
diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala
index 42c3887..8dfb2dd 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnBracketsTokenEnricherSpec.scala
@@ -46,9 +46,9 @@ class NCEnBracketsTokenEnricherSpec:
enricher.enrich(NCTestRequest(txt), null, toks)
val seq = toks.asScala.toSeq
NCTestUtils.printTokens(seq)
- seq.zipWithIndex.foreach { case (tok, idx) =>
- require(!(tok.get[Boolean]("brackets:en") ^
brackets.contains(idx)))
- }
+ seq.foreach (tok =>
+ require(!(tok.get[Boolean]("brackets:en") ^
brackets.contains(tok.getIndex)))
+ )
@Test
def test(): Unit =
diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricherSpec.scala b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricherSpec.scala
index da11baa..50eb872 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricherSpec.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricherSpec.scala
@@ -35,8 +35,8 @@ class NCEnDictionaryTokenEnricherSpec:
@Test
def test(): Unit =
val toks = Seq(
- NCTestToken(txt = "milk", lemma = "milk"), // Valid english word.
- NCTestToken(txt = "XYZ", lemma = "XYZ") // Invalid english word.
+ NCTestToken(txt = "milk", lemma = "milk", idx = 0), // Valid
english word.
+ NCTestToken(txt = "XYZ", lemma = "XYZ", idx = 1) // Invalid
english word.
)
require(toks.head.getOpt[Boolean]("dict:en").isEmpty)
diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnLanguageTokenEnricherSpec.scala b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnLanguageTokenEnricherSpec.scala
index 6003eb6..8cdb0f6 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnLanguageTokenEnricherSpec.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnLanguageTokenEnricherSpec.scala
@@ -35,8 +35,8 @@ class NCEnLanguageTokenEnricherSpec:
@Test
def test(): Unit =
val toks = Seq(
- NCTestToken(txt = "english", stem = "english"), // English word.
- NCTestToken(txt = "русский", stem = "русский") // Not english word.
+ NCTestToken(txt = "english", stem = "english", idx = 0), //
English word.
+ NCTestToken(txt = "русский", stem = "русский", idx = 1) // Not
english word.
)
require(toks.head.getOpt[Boolean]("lang:en").isEmpty)
diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala
index 48fe24d..b9aac95 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnQuotesTokenEnricherSpec.scala
@@ -46,9 +46,9 @@ class NCEnQuotesTokenEnricherSpec:
val toksSeq = toks.asScala.toSeq
enricher.enrich(NCTestRequest(txt), null, toks)
NCTestUtils.printTokens(toksSeq)
- toksSeq.zipWithIndex.foreach { case (tok, idx) =>
- require(!(tok.get[Boolean]("quoted:en") ^ quotes.contains(idx)))
- }
+ toksSeq.foreach (tok =>
+ require(!(tok.get[Boolean]("quoted:en") ^
quotes.contains(tok.getIndex)))
+ )
@Test
def test(): Unit =
diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricherSpec.scala b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricherSpec.scala
index 45ab328..658b809 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricherSpec.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnSwearWordsTokenEnricherSpec.scala
@@ -35,8 +35,8 @@ class NCEnSwearWordsTokenEnricherSpec:
@Test
def test(): Unit =
val toks = Seq(
- NCTestToken(txt = "english", stem = "english"), // English word.
- NCTestToken(txt = "ass", stem = "ass") // Swear english word.
+ NCTestToken(txt = "english", stem = "english", idx = 0), //
English word.
+ NCTestToken(txt = "ass", stem = "ass", idx = 1) // Swear english
word.
)
require(toks.head.getOpt[Boolean]("swear:en").isEmpty)
diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestToken.scala b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestToken.scala
index 076ffc7..440f9a1 100644
--- a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestToken.scala
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/util/NCTestToken.scala
@@ -32,6 +32,7 @@ import org.apache.nlpcraft.*
*/
case class NCTestToken(
txt: String,
+ idx: Int,
lemma: String = null,
stem: String = null,
pos: String = null,
@@ -47,4 +48,5 @@ case class NCTestToken(
override def isStopWord: Boolean = isStop
override def getStartCharIndex: Int = start
override def getEndCharIndex: Int = end
- override def getLength: Int = end - start + 1
\ No newline at end of file
+ override def getLength: Int = end - start + 1
+ override def getIndex: Int = idx
\ No newline at end of file