This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-472
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git


The following commit(s) were added to refs/heads/NLPCRAFT-472 by this push:
     new d0060c7  WIP.
d0060c7 is described below

commit d0060c7e960f4bf62be33826a5601a8864a608d9
Author: Sergey Kamov <[email protected]>
AuthorDate: Wed Dec 29 21:51:37 2021 +0300

    WIP.
---
 .../scala/org/apache/nlpcraft/NCTokenizer.java     | 27 ++++++++++
 .../src/main/scala/org/apache/nlpcraft/NCWord.java | 47 ++++++++++++++++++
 .../nlpcraft/nlp/tokenizer/NCOpenNlpTokenizer.java | 58 ++++++++++++++++++++++
 .../tokenizer/impl/NCOpenNlpTokenizerImpl.scala    | 44 ++++++++++++++++
 .../nlpcraft/nlp/util/NCTestDefaultConfig.scala    | 47 ++++++++++++++++++
 5 files changed, 223 insertions(+)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenizer.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenizer.java
new file mode 100644
index 0000000..a27c7f7
--- /dev/null
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenizer.java
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft;
+
+import java.util.List;
+
+/**
+ * Tokenizer contract: splits a text into a list of {@link NCWord} instances.
+ * <p>
+ * The single operation declared here, {@code tokenize(cfg, txt)}, takes a model
+ * configuration and the text to split and returns the resulting words as a list.
+ * <p>
+ * This interface also extends {@code NCLifecycle}.
+ * NOTE(review): presumably implementations must be started via the lifecycle before
+ * {@code tokenize} is called - confirm against {@code NCLifecycle}.
+ */
+public interface NCTokenizer extends NCLifecycle {
+    List<NCWord> tokenize(NCModelConfig cfg, String txt);
+}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCWord.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCWord.java
new file mode 100644
index 0000000..d4a7eed
--- /dev/null
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCWord.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft;
+
+/**
+ * A single word of text, exposing its text, its start and end character indexes
+ * and its length.
+ * <p>
+ * NOTE(review): this interface does not state whether indexes are 0-based or whether
+ * the end index is inclusive or exclusive - confirm against the implementations.
+ */
+public interface NCWord  {
+    /**
+     * Gets the text of this word.
+     *
+     * @return Word text.
+     */
+    String getText();
+
+    /**
+     * Gets the start character index of this word.
+     * NOTE(review): presumably an offset into the text the word was extracted from - confirm.
+     *
+     * @return Start character index.
+     */
+    int getStartCharIndex();
+
+    /**
+     * Gets the end character index of this word.
+     * NOTE(review): inclusive vs. exclusive is not specified here - confirm.
+     *
+     * @return End character index.
+     */
+    int getEndCharIndex();
+
+    /**
+     * Gets the length of this word.
+     * NOTE(review): presumably measured in characters - confirm.
+     *
+     * @return Word length.
+     */
+    int getLength();
+}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/NCOpenNlpTokenizer.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/NCOpenNlpTokenizer.java
new file mode 100644
index 0000000..03e8c9a
--- /dev/null
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/NCOpenNlpTokenizer.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.nlp.tokenizer;
+
+import org.apache.nlpcraft.NCModelConfig;
+import org.apache.nlpcraft.NCTokenizer;
+import org.apache.nlpcraft.NCWord;
+import org.apache.nlpcraft.nlp.tokenizer.impl.NCOpenNlpTokenizerImpl;
+
+import java.io.File;
+import java.util.List;
+import java.util.Objects;
+
+/**
+ * {@link NCTokenizer} implementation that delegates all work to {@link NCOpenNlpTokenizerImpl}.
+ * <p>
+ * The tokenizer model is read when {@link #start(NCModelConfig)} is called. Calling
+ * {@link #tokenize(NCModelConfig, String)} before {@code start} or after {@link #stop()}
+ * fails because the underlying tokenizer does not exist at that point.
+ */
+public class NCOpenNlpTokenizer implements NCTokenizer {
+    private final NCOpenNlpTokenizerImpl impl;
+
+    /**
+     * Creates a tokenizer that reads its model from the given file.
+     * <p>
+     * The implementation opens the file when this constructor runs; the model itself
+     * is only parsed later, in {@link #start(NCModelConfig)}.
+     *
+     * @param tokMdl Tokenizer model file. Cannot be {@code null}.
+     * @throws NullPointerException If {@code tokMdl} is {@code null}.
+     */
+    public NCOpenNlpTokenizer(File tokMdl) {
+        Objects.requireNonNull(tokMdl, "Tokenizer model file cannot be null.");
+
+        impl = NCOpenNlpTokenizerImpl.apply(tokMdl);
+    }
+
+    /**
+     * Creates a tokenizer that reads its model from the given source.
+     * <p>
+     * The source string is handed to {@code NCUtils.getStream} by the implementation.
+     * NOTE(review): the accepted forms (classpath resource, file path, URL) are not
+     * visible here - confirm against {@code NCUtils.getStream}.
+     *
+     * @param tokMdl Tokenizer model source. Cannot be {@code null}.
+     * @throws NullPointerException If {@code tokMdl} is {@code null}.
+     */
+    public NCOpenNlpTokenizer(String tokMdl) {
+        Objects.requireNonNull(tokMdl, "Tokenizer model source cannot be null.");
+
+        impl = NCOpenNlpTokenizerImpl.apply(tokMdl);
+    }
+
+    /**
+     * Splits the given text into words by delegating to the implementation.
+     *
+     * @param cfg Model configuration, passed through to the implementation.
+     * @param txt Text to tokenize.
+     * @return List of words found in {@code txt}.
+     */
+    @Override
+    public List<NCWord> tokenize(NCModelConfig cfg, String txt) {
+        return impl.tokenize(cfg, txt);
+    }
+
+    /**
+     * Starts the underlying implementation, which loads the tokenizer model.
+     *
+     * @param cfg Model configuration, passed through to the implementation.
+     */
+    @Override
+    public void start(NCModelConfig cfg) {
+        impl.start(cfg);
+    }
+
+    /**
+     * Stops the underlying implementation, which discards its tokenizer.
+     */
+    @Override
+    public void stop() {
+        impl.stop();
+    }
+}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/impl/NCOpenNlpTokenizerImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/impl/NCOpenNlpTokenizerImpl.scala
new file mode 100644
index 0000000..18d54b4
--- /dev/null
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/tokenizer/impl/NCOpenNlpTokenizerImpl.scala
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nlpcraft.nlp.tokenizer.impl
+
+import opennlp.tools.tokenize.*
+import org.apache.nlpcraft.*
+import org.apache.nlpcraft.internal.util.NCUtils
+
+import java.io.*
+import java.util
+import scala.jdk.CollectionConverters.*
+
+/**
+  * Factory methods for [[NCOpenNlpTokenizerImpl]].
+  */
+object NCOpenNlpTokenizerImpl:
+    /**
+      * Creates an implementation reading the tokenizer model from the given file.
+      * The file is opened immediately; the model is parsed later, in `start`.
+      *
+      * @param file Tokenizer model file.
+      */
+    def apply(file: File): NCOpenNlpTokenizerImpl =
+        new NCOpenNlpTokenizerImpl(new BufferedInputStream(new FileInputStream(file)))
+
+    /**
+      * Creates an implementation reading the tokenizer model from the stream
+      * returned by `NCUtils.getStream(src)`.
+      *
+      * @param src Tokenizer model source.
+      */
+    def apply(src: String): NCOpenNlpTokenizerImpl =
+        new NCOpenNlpTokenizerImpl(NCUtils.getStream(src))
+
+/**
+  * OpenNLP based tokenizer.
+  *
+  * The model is loaded from `is` on the first `start` call, after which the stream
+  * is closed and the loaded model is kept, so that `start` can be called again
+  * after `stop` without re-reading the stream.
+  *
+  * NOTE(review): OpenNLP documents `TokenizerME` as not thread-safe - confirm that
+  * callers do not invoke `tokenize` concurrently on the same instance.
+  *
+  * @param is Stream the tokenizer model is read from. Closed by the first `start` call.
+  */
+class NCOpenNlpTokenizerImpl(is: InputStream) extends NCTokenizer:
+    @volatile var tokenizer: TokenizerME = _
+
+    // Model loaded by the first `start` call and reused by subsequent ones.
+    private var mdl: TokenizerModel = _
+
+    /**
+      * Loads the tokenizer model (first call only) and creates the tokenizer.
+      *
+      * @param cfg Model configuration. Not used by this implementation.
+      */
+    override def start(cfg: NCModelConfig): Unit =
+        if mdl == null then
+            // The stream is needed only to build the model: always release it,
+            // even if model loading fails.
+            try mdl = new TokenizerModel(is)
+            finally is.close()
+
+        tokenizer = new TokenizerME(mdl)
+
+    /**
+      * Discards the tokenizer. The loaded model is retained for a later `start`.
+      */
+    override def stop(): Unit = tokenizer = null
+
+    /**
+      * Splits the given text into words using the OpenNLP tokenizer.
+      *
+      * @param cfg Model configuration. Not used by this implementation.
+      * @param txt Text to tokenize.
+      * @return Words in the order of the spans returned by `TokenizerME.tokenizePos`.
+      * @throws IllegalStateException If called before `start` or after `stop`.
+      */
+    override def tokenize(cfg: NCModelConfig, txt: String): util.List[NCWord] =
+        // Read the volatile field once so a concurrent `stop` cannot null it mid-call.
+        val tok = tokenizer
+
+        if tok == null then throw new IllegalStateException("Tokenizer is not started.")
+
+        tok.tokenizePos(txt).
+            map(span =>
+                new NCWord:
+                    override def getText: String = span.getCoveredText(txt).toString
+                    override def getStartCharIndex: Int = span.getStart
+                    override def getEndCharIndex: Int = span.getEnd
+                    override def getLength: Int = span.length()
+            ).toSeq.asJava
diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestDefaultConfig.scala b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestDefaultConfig.scala
new file mode 100644
index 0000000..e0016ab
--- /dev/null
+++ b/nlpcraft/src/test/java/org/apache/nlpcraft/nlp/util/NCTestDefaultConfig.scala
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.nlp.util
+
+import org.apache.nlpcraft.nlp.token.parser.opennlp.en.NCEnOpenNlpTokenParser
+import org.apache.nlpcraft.nlp.tokenizer.NCOpenNlpTokenizer
+import org.apache.nlpcraft.*
+
+import java.util
+import java.util.Optional
+
+/**
+  * Shared English defaults for tests: a tokenizer, a token parser and a model
+  * configuration wired to both. The tokenizer and the parser are started with
+  * that configuration when this object is initialized.
+  */
+object NCTestDefaultConfig:
+    final val EN_TOKENIZER = new NCOpenNlpTokenizer("opennlp/en-token.bin")
+
+    final val EN_PARSER: NCTokenParser =
+        new NCEnOpenNlpTokenParser("opennlp/en-pos-maxent.bin", "opennlp/en-lemmatizer.dict")
+
+    final val EN_MDL_CFG: NCModelConfig = new NCPropertyMapAdapter() with NCModelConfig:
+        // Model identity.
+        override def getId: String = "test"
+        override def getName: String = "test"
+        override def getVersion: String = "1.0"
+
+        // Pipeline components: only the tokenizer and a single token parser are configured.
+        override def getTokenizer: NCTokenizer = EN_TOKENIZER
+        override def getTokenParsers: util.List[NCTokenParser] =
+            util.Collections.singletonList(EN_PARSER)
+
+        // No enrichers or entity parsers: a new empty list is returned on every call.
+        override def getTokenEnrichers: util.List[NCTokenEnricher] =
+            new util.ArrayList[NCTokenEnricher]()
+        override def getEntityEnrichers: util.List[NCEntityEnricher] =
+            new util.ArrayList[NCEntityEnricher]()
+        override def getEntityParsers: util.List[NCEntityParser] =
+            new util.ArrayList[NCEntityParser]()
+
+    // Start the components last, once the configuration they receive exists.
+    EN_TOKENIZER.start(EN_MDL_CFG)
+    EN_PARSER.start(EN_MDL_CFG)

Reply via email to