This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-472
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-472 by this push:
new 11067f2 WIP.
11067f2 is described below
commit 11067f2bcaef5adb014999ddcaabf69258159241
Author: Sergey Kamov <[email protected]>
AuthorDate: Sat Jan 1 19:23:59 2022 +0300
WIP.
---
nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java | 6 ------
nlpcraft/src/main/scala/org/apache/nlpcraft/NCWord.java | 6 ++++++
.../nlp/token/parser/opennlp/en/impl/NCEnOpenNlpImpl.scala | 2 +-
.../nlp/tokenizer/opennlp/impl/NCOpenNlpTokenizerImpl.scala | 7 ++++---
4 files changed, 11 insertions(+), 10 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
index 67e3e8d..4eeacc5 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
@@ -44,10 +44,4 @@ public interface NCToken extends NCWord, NCPropertyMap {
* @return
*/
boolean isStopWord();
-
- /**
- *
- * @return
- */
- int getIndex();
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCWord.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCWord.java
index d4a7eed..a13840a 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCWord.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCWord.java
@@ -44,4 +44,10 @@ public interface NCWord {
* @return
*/
int getLength();
+
+ /**
+ *
+ * @return
+ */
+ int getIndex();
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/impl/NCEnOpenNlpImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/impl/NCEnOpenNlpImpl.scala
index 6c62be8..1496de2 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/impl/NCEnOpenNlpImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/en/impl/NCEnOpenNlpImpl.scala
@@ -128,7 +128,7 @@ class NCEnOpenNlpImpl(posMdlSrc: String, lemmaDicSrc: String) extends NCTokenPar
override def getStartCharIndex: Int = w.getStartCharIndex
override def getEndCharIndex: Int = w.getEndCharIndex
override def getLength: Int = w.getLength
- override def getIndex: Int = idx
+ override def getIndex: Int = w.getIndex
}
val stops = swFinder.find(res)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/opennlp/impl/NCOpenNlpTokenizerImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/opennlp/impl/NCOpenNlpTokenizerImpl.scala
index 9db5c1e..49ac329 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/opennlp/impl/NCOpenNlpTokenizerImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/opennlp/impl/NCOpenNlpTokenizerImpl.scala
@@ -35,10 +35,11 @@ class NCOpenNlpTokenizerImpl(src: String) extends NCTokenizer:
override def start(cfg: NCModelConfig): Unit = tokenizer = new TokenizerME(new TokenizerModel(NCUtils.getStream(src)))
override def stop(): Unit = tokenizer = null
override def tokenize(cfg: NCModelConfig, txt: String): util.List[NCWord] =
- this.synchronized { tokenizer.tokenizePos(txt) }
- .map(span => new NCWord:
+ this.synchronized { tokenizer.tokenizePos(txt) }.zipWithIndex.map { (span, idx) =>
+ new NCWord:
override def getText: String = span.getCoveredText(txt).toString
override def getStartCharIndex: Int = span.getStart
override def getEndCharIndex: Int = span.getEnd
override def getLength: Int = span.length()
- ).toSeq.asJava
+ override def getIndex: Int = idx
+ }.toSeq.asJava