This is an automated email from the ASF dual-hosted git repository. sergeykamov pushed a commit to branch NLPCRAFT-469 in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit bfec03718625ee8a92e1ece415e05328240e11e4 Author: Sergey Kamov <[email protected]> AuthorDate: Mon Dec 20 18:59:12 2021 +0300 WIP. --- nlpcraft/pom.xml | 10 + .../main/resources/stopwords/first_words.txt.gz | Bin 0 -> 4024879 bytes .../src/main/resources/stopwords/noun_words.txt.gz | Bin 0 -> 857 bytes .../resources/stopwords/possessive_words.txt.gz | Bin 0 -> 990 bytes .../src/main/resources/stopwords/stop_words.txt | 240 ++++++++ .../token/parser/opennlp/NCOpenNlpTokenParser.java | 27 +- .../token/parser/opennlp/impl/NCOpenNlpImpl.scala | 118 ++-- .../parser/opennlp/impl/NCStopWordGenerator.scala | 352 +++++++++++ .../parser/opennlp/impl/NCStopWordsProcessor.scala | 646 +++++++++++++++++++++ .../apache/nlpcraft/internal/util/NCUtils.scala | 295 +++++++++- .../parser/opennlp/NCOpenNlpTokenParserSpec.scala | 6 +- nlpcraft/src/test/resources/log4j2.xml | 50 ++ pom.xml | 12 + 13 files changed, 1682 insertions(+), 74 deletions(-) diff --git a/nlpcraft/pom.xml b/nlpcraft/pom.xml index b4290aa..1f17ff8 100644 --- a/nlpcraft/pom.xml +++ b/nlpcraft/pom.xml @@ -109,6 +109,16 @@ <artifactId>opennlp-tools</artifactId> </dependency> + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-slf4j-impl</artifactId> + </dependency> + + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-api</artifactId> + </dependency> + <!-- JUnit & ScalaTest dependencies. =============================== diff --git a/nlpcraft/src/main/resources/stopwords/first_words.txt.gz b/nlpcraft/src/main/resources/stopwords/first_words.txt.gz new file mode 100644 index 0000000..bb8df9c Binary files /dev/null and b/nlpcraft/src/main/resources/stopwords/first_words.txt.gz differ diff --git a/nlpcraft/src/main/resources/stopwords/noun_words.txt.gz b/nlpcraft/src/main/resources/stopwords/noun_words.txt.gz new file mode 100644 index 0000000..bd80d4f Binary files /dev/null and b/nlpcraft/src/main/resources/stopwords/noun_words.txt.gz differ diff --git a/nlpcraft/src/main/resources/stopwords/possessive_words.txt.gz b/nlpcraft/src/main/resources/stopwords/possessive_words.txt.gz new file mode 100644 index 0000000..20ed420 Binary files /dev/null and b/nlpcraft/src/main/resources/stopwords/possessive_words.txt.gz differ diff --git a/nlpcraft/src/main/resources/stopwords/stop_words.txt b/nlpcraft/src/main/resources/stopwords/stop_words.txt new file mode 100644 index 0000000..5644efd --- /dev/null +++ b/nlpcraft/src/main/resources/stopwords/stop_words.txt @@ -0,0 +1,240 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Basic predefined stop-words. 
+#
+# Configuration contains:
+# - Words (processed as stems)
+# - Words with a POSes list (processed as lemmas)
+# - Words with a wildcard, symbol `*` (processed as lemmas)
+#
+# Words and POSes can be marked as excluded (symbol `~` before the word)
+# A word can be marked as case-sensitive (symbol `@` before the word)
+#
+# Restrictions:
+# - A POSes list cannot be defined for multiple words.
+# - Only one wildcard can be defined in a word.
+# - A wildcard cannot be applied to chunks of words.
+# - Only one case-sensitive flag can be defined for a word.
+#
+# Examples:
+# ========
+# decent - Includes the word 'decent'.
+# *ent - Includes all words ending with 'ent'.
+# *ent | NN - Includes all words with POS NN ending with 'ent'.
+# *ent | ~NN ~JJ - Includes all words ending with 'ent' except those with POS NN or JJ.
+# ~dif*ly | JJ JJR JJS - Excludes all JJ/JJR/JJS words starting with 'dif' and ending with 'ly'.
+# ~may | MD - Excludes 'may' with POS MD.
+# * | MD - All words with POS MD.
+# ~@US - 'US' is not a stop word (case-sensitive exception).
+#
+# Invalid syntax examples:
+# ========================
+# te*ni* - Too many wildcards.
+# tech* pers* - Too many wildcards.
+# @Technical @Personal - Too many case-sensitive flags.
+# @Technical Personal | JJ - POSes cannot be defined for chunks of words.
+#
+
+# POSes list.
+* | UH
+* | ,
+* | POS
+* | :
+* | .
+* | --
+* | MD
+* | EX
+* | DT
+
+# POSes list exceptions.
+~may
+~no
+
+# Postfixes list.
+*ent | ~NN ~NNS ~NNP ~NNPS
+*ant | ~NN ~NNS ~NNP ~NNPS
+*ive | ~NN ~NNS ~NNP ~NNPS ~CD
+*ly | ~NN ~NNS ~NNP ~NNPS
+*ry | ~NN ~NNS ~NNP ~NNPS
+*ial | ~NN ~NNS ~NNP ~NNPS
+*able | ~NN ~NNS ~NNP ~NNPS
+*ible | ~NN ~NNS ~NNP ~NNPS
+*less | ~NN ~NNS ~NNP ~NNPS
+
+# Postfixes list exceptions.
+~less
+~monthly
+~daily
+~weekly
+~quarterly
+~yearly
+~badly
+~poorly
+~different
+
+# Words of concrete POSes.
+key | JJ JJR JJS
+vital | JJ JJR JJS
+critical | JJ JJR JJS
+pressing | JJ JJR JJS
+paramount | JJ JJR JJS
+high-priority | JJ JJR JJS
+must-have | JJ JJR JJS
+
+# Words of any POSes.
+a
+an
+avg
+average
+the
+etc
+fair
+approximate
+decent
+generous
+good
+ok
+okay
+so
+please
+well
+objective
+reasonable
+unbiased
+sincere
+trustworthy
+civil
+candid
+honest
+impartial
+legitimate
+straightforward
+moderate
+subjective
+partial
+rough
+fuzzy
+now
+all right
+let
+website
+web-site
+web site
+hey
+lol
+lulz
+omg
+omfg
+of the essence
+gr8
+lmao
+wtf
+xoxo
+j/k
+jk
+fyi
+imho
+imo
+btw
+fwiw
+thx
+wth
+afaik
+abt
+afaic
+aka
+a.k.a.
+awol
+b2b
+b2c
+byod
+ciao
+cmon
+eta
+huh
+nsfw
+otoh
+plz
+pls
+rotfl
+tgif
+zzzz
+zzz
+
+# GEO abbreviation exceptions.
+# Cities.
+~la
+~sf
+~kc
+~hk
+
+# States.
+~al
+~ak
+~az
+~ar
+~ca
+~co
+~ct
+~de
+~fl
+~ga
+~hi
+~id
+~il
+~in
+~ia
+~ks
+~ky
+~la
+~me
+~md
+~ma
+~mi
+~mn
+~ms
+~mo
+~mt
+~ne
+~nv
+~nh
+~nj
+~nm
+~ny
+~nc
+~nd
+~oh
+~ok
+~or
+~pa
+~ri
+~sc
+~sd
+~tn
+~tx
+~ut
+~vt
+~va
+~wa
+~wv
+~wi
+~wy
+
+# Upper case exceptions.
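+# The case-sensitive entry below illustrates the '@' syntax described above:
+# it keeps the uppercase proper noun 'US' from being matched as a stop word.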
+~@US diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java index a590d72..118ef86 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java @@ -17,15 +17,15 @@ package org.apache.nlpcraft.internal.nlp.token.parser.opennlp; -import org.apache.nlpcraft.*; -import org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl.*; +import org.apache.nlpcraft.NCException; +import org.apache.nlpcraft.NCRequest; +import org.apache.nlpcraft.NCToken; +import org.apache.nlpcraft.NCTokenParser; +import org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl.NCOpenNlpImpl; -import java.io.BufferedInputStream; import java.io.File; -import java.io.FileInputStream; -import java.io.InputStream; import java.util.List; - +import java.util.Set; /* * Models can be downloaded from the following resources: @@ -33,7 +33,6 @@ import java.util.List; * - tagger: http://opennlp.sourceforge.net/models-1.5/en-pos-maxent.bin * - lemmatizer: https://raw.githubusercontent.com/richardwilly98/elasticsearch-opennlp-auto-tagging/master/src/main/resources/models/en-lemmatizer.dict */ - /** * */ @@ -49,11 +48,7 @@ public class NCOpenNlpTokenParser implements NCTokenParser { */ public NCOpenNlpTokenParser(File tokMdl, File posMdl, File lemmaDic) { try { - impl = new NCOpenNlpImpl( - new BufferedInputStream(new FileInputStream(tokMdl)), - new BufferedInputStream(new FileInputStream(posMdl)), - new BufferedInputStream(new FileInputStream(lemmaDic)) - ); + impl = NCOpenNlpImpl.apply(tokMdl, posMdl, lemmaDic); } catch (Exception e) { throw new NCException("Failed to create OpenNLP token parser.", e); @@ -86,7 +81,7 @@ public class NCOpenNlpTokenParser implements NCTokenParser { * * @return */ - public List<String> getAdditionalStopWords() { + public Set<String> getAdditionalStopWords() { return impl.getAdditionalStopWords(); } @@ -94,7 +89,7 @@ public class NCOpenNlpTokenParser implements NCTokenParser { * * @param addStopWords */ - public void setAdditionalStopWords(List<String> addStopWords) { + public void setAdditionalStopWords(Set<String> addStopWords) { impl.setAdditionalStopWords(addStopWords); } @@ -102,7 +97,7 @@ public class NCOpenNlpTokenParser implements NCTokenParser { * * @return */ - public List<String> getExcludedStopWords() { + public Set<String> getExcludedStopWords() { return impl.getExcludedStopWords(); } @@ -110,7 +105,7 @@ public class NCOpenNlpTokenParser implements NCTokenParser { * * @param exclStopWords */ - public void setExcludedStopWords(List<String> exclStopWords) { + public void setExcludedStopWords(Set<String> exclStopWords) { impl.setExcludedStopWords(exclStopWords); } } diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala index 7bf1df0..7efdb84 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala @@ -20,13 +20,13 @@ package org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl import org.apache.nlpcraft.* import java.io.* +import 
java.util.Set as JSet import java.util.List as JList import opennlp.tools.lemmatizer.* import opennlp.tools.postag.* import opennlp.tools.stemmer.* import opennlp.tools.tokenize.* import org.apache.nlpcraft.internal.util.NCUtils - import scala.jdk.CollectionConverters.* object NCOpenNlpImpl: @@ -40,6 +40,18 @@ object NCOpenNlpImpl: def apply(tokMdlSrc: String, posMdlSrc: String, lemmaDicSrc: String): NCOpenNlpImpl = new NCOpenNlpImpl(NCUtils.getStream(tokMdlSrc), NCUtils.getStream(posMdlSrc), NCUtils.getStream(lemmaDicSrc)) + /** + * + * @param tokMdlFile Local file for OpenNLP tokenizer model. + * @param posMdlFile Local file for OpenNLP tagger model. + * @param lemmaDicFile Local file for OpenNLP lemmatizer dictionary. + * @return + */ + def apply(tokMdlFile: File, posMdlFile: File, lemmaDicFile: File): NCOpenNlpImpl = + def toStream(f: File) = new BufferedInputStream(new FileInputStream(f)) + + new NCOpenNlpImpl(toStream(tokMdlFile), toStream(posMdlFile), toStream(lemmaDicFile)) + /** * * @param tokMdlIn @@ -51,32 +63,31 @@ class NCOpenNlpImpl(tokMdlIn: InputStream, posMdlIn: InputStream, lemmaDicIn: In private val tagger = new POSTaggerME(new POSModel(posMdlIn)) private val lemmatizer = new DictionaryLemmatizer(lemmaDicIn) private val stemmer = new PorterStemmer - private var addStopWords = List.empty[String] - private var exclStopWords = List.empty[String] + private val stopProc = new NCStopWordsProcessor(stemmer) /** - * + *`` * @return */ - def getAdditionalStopWords: JList[String] = addStopWords.asJava + def getAdditionalStopWords: JSet[String] = stopProc.getAdditionalStopWords.asJava /** * * @return */ - def getExcludedStopWords: JList[String] = exclStopWords.asJava + def getExcludedStopWords: JSet[String] = stopProc.getExcludedStopWords.asJava /** * * @param addStopWords */ - def setAdditionalStopWords(addStopWords: JList[String]): Unit = this.addStopWords = addStopWords.asScala.toList + def setAdditionalStopWords(addStopWords: JSet[String]): Unit = stopProc.setAdditionalStopWords(addStopWords.asScala.toSet) /** * * @param exclStopWords */ - def setExcludedStopWords(exclStopWords: JList[String]): Unit = this.exclStopWords = exclStopWords.asScala.toList + def setExcludedStopWords(exclStopWords: JSet[String]): Unit = stopProc.setExcludedStopWords(exclStopWords.asScala.toSet) /** * @@ -84,48 +95,53 @@ class NCOpenNlpImpl(tokMdlIn: InputStream, posMdlIn: InputStream, lemmaDicIn: In * @return */ def parse(req: NCRequest): JList[NCToken] = - val sen = req.getNormalizedText - - case class TokenHolder(origin: String, normalized: String, start: Int, end: Int, length: Int) - - val holders = tokenizer.tokenizePos(sen).map( t => { - val txt = t.getCoveredText(sen).toString - TokenHolder(txt, txt.toLowerCase, t.getStart, t.getEnd, t.length()) - }) - - val words = holders.map(_.origin) - val posTags = tagger.tag(words) - var lemmas = lemmatizer.lemmatize(words, posTags).toSeq - - require(holders.length == posTags.length) - - // For some reasons lemmatizer (en-lemmatizer.dict) marks some words with non-existent POS 'NNN' - // Valid POS list: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html - val suspIdxs = lemmas.zip(posTags).zipWithIndex.flatMap { - // "0" is flag that lemma cannot be obtained for some reasons. - case ((lemma, pos), i) => if lemma == "O" && pos == "NN" then Some(i) else None - } - - if suspIdxs.nonEmpty then - val fixes: Map[Int, String] = lemmatizer. - lemmatize(suspIdxs.map(i => words(i)).toArray, suspIdxs.map(_ => "NNN").toArray). - zipWithIndex. 
- flatMap { - case (lemma, i) => if lemma != "0" then Some(suspIdxs(i) -> lemma) else None - }.toMap - lemmas = lemmas.zipWithIndex.map { - case (lemma, idx) => fixes.getOrElse(idx, lemma) + // OpenNLP classes are not thread-safe. + this.synchronized { + val sen = req.getNormalizedText + + case class TokenHolder(origin: String, normalized: String, start: Int, end: Int, length: Int) + + val holders = tokenizer.tokenizePos(sen).map( t => { + val txt = t.getCoveredText(sen).toString + TokenHolder(txt, txt.toLowerCase, t.getStart, t.getEnd, t.length) + }) + + val words = holders.map(_.origin) + val posTags = tagger.tag(words) + var lemmas = lemmatizer.lemmatize(words, posTags).toSeq + + require(holders.length == posTags.length) + + // For some reasons lemmatizer (en-lemmatizer.dict) marks some words with non-existent POS 'NNN' + // Valid POS list: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html + val suspIdxs = lemmas.zip(posTags).zipWithIndex.flatMap { + // "0" is flag that lemma cannot be obtained for some reasons. + case ((lemma, pos), i) => if lemma == "O" && pos == "NN" then Some(i) else None } - holders.zip(posTags).zip(lemmas).toIndexedSeq.map { case ((h, pos), lemma) => - new NCParameterizedAdapter with NCToken: - override def getOriginalText: String = h.origin - override def getNormalizedText: String = h.normalized - override def getLemma: String = lemma - override def getStem: String = stemmer.stem(h.normalized) - override def getPos: String = pos - override def isStopWord: Boolean = true // TODO: implement - override def getStartCharIndex: Int = h.start - override def getEndCharIndex: Int = h.end - override def getLength: Int = h.length - }.asJava + if suspIdxs.nonEmpty then + val fixes: Map[Int, String] = lemmatizer. + lemmatize(suspIdxs.map(i => words(i)).toArray, suspIdxs.map(_ => "NNN").toArray). + zipWithIndex. + flatMap { + case (lemma, i) => if lemma != "0" then Some(suspIdxs(i) -> lemma) else None + }.toMap + lemmas = lemmas.zipWithIndex.map { + case (lemma, idx) => fixes.getOrElse(idx, lemma) + } + + stopProc.process( + holders.zip(posTags).zip(lemmas).map { case ((h, pos), lemma) => + new NCParameterizedAdapter with NCToken: + override def getOriginalText: String = h.origin + override def getNormalizedText: String = h.normalized + override def getLemma: String = lemma + override def getStem: String = stemmer.stem(h.normalized) + override def getPos: String = pos + override def isStopWord: Boolean = false + override def getStartCharIndex: Int = h.start + override def getEndCharIndex: Int = h.end + override def getLength: Int = h.length + } + ).asJava + } \ No newline at end of file diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCStopWordGenerator.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCStopWordGenerator.scala new file mode 100644 index 0000000..17b6bba --- /dev/null +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCStopWordGenerator.scala @@ -0,0 +1,352 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl + +import opennlp.tools.stemmer.PorterStemmer +import org.apache.nlpcraft.internal.util.NCUtils + +import scala.collection.mutable + +/** + * Generates first word sequences. + */ +object NCStopWordGenerator extends App: + private final lazy val stemmer = new PorterStemmer + + // Output files. + private val FIRST_WORDS_FILE = "first_words.txt" + private val NOUN_WORDS_FILE = "noun_words.txt" + private val POS_WORDS_FILE = "possessive_words.txt" + + private final val POS1 = Seq( + "for", + "in", + "on", + "within" + ) + + private final val POS2 = Seq( + "our", + "my" + ) + + private final val POS3 = Seq( + "website", + "web-site", + "web site", + "company website", + "personal website", + "site", + "team", + "organization", + "group", + "company", + "page", + "property", + "online property", + "company online property" + ) + + private[impl] def mkPossessiveStopWords: Seq[String] = + (for (w1 <- POS1; w2 <- POS2; w3 <- POS3) yield s"$w1 $w2 $w3") ++ + (for (w2 <- POS2; w3 <- POS3) yield s"$w2 $w3") + + private final val QWORDS = Seq( + "what", + "when", + "where", + "which", + "who", + "whom", + "whose", + "why", + "how", + "how much", + "how many", + "how long" + ) + + private final val DWORDS = Seq( + "show", + "list", + "give", + "display", + "enumerate", + "print", + "tell", + "say", + "find", + "select", + "query", + "count", + "calculate", + "produce", + "chart", + "draw", + "plot", + "get" + ) + + private final val QWORDS_SUP = Seq( + "is", + "are", + "about" + ) + + private final val DWORDS_SUP = Seq( + "me", + "for me", + "just for me", + "us", + "for us", + "just for us", + "for all of us", + "just for all of us" + ) + + private final val WORDS2 = Seq( + "please", + "kindly", + "simply", + "basically", + "just", + "now" + ) + + private final val NOUN_WORDS = Seq( + "data", + "document", + "info", + "fact", + "report", + "result" + ) + + private final val NOUN_WORDS2 = Seq( + "for", + "about", + "in", + "around", + "within", + "in regards to", + "with regards to", + "related to", + "specific to", + "pertaining to", + "in relation to", + "correlated to", + "specific for", + "specifically about" + ) + + private final val DWORDS_PRE = Seq( + "can you", + "would you", + "could you", + "would you not", + "could you not", + "will you", + "shall you", + "how about you", + "what about you", + "if you", + "if you can", + "if you could", + "what if you", + "what if you can", + "what if you could" + ) + + private final val QWORDS2 = Seq( + "is there", + "are there", + "do we have", + "do we not have", + "do you have", + "do you not have" + ) + + private final val QWORDS_ANY = Seq( + "any", + "some", + "few", + "several", + "handful", + "couple", + "couple of" + ) + + private def mkGzip(path: String, lines: Iterable[Any]): Unit = + val p = NCUtils.mkPath(s"nlpcraft/src/main/resources/stopwords/$path") + + NCUtils.mkTextFile(p, lines) + NCUtils.gzipPath(p) + + private[impl] def mkNounWords(): Unit = + val buf = new mutable.ArrayBuffer[String]() + + for (w1 <- NOUN_WORDS) + buf += s"$w1" + + for (w1 <- 
NOUN_WORDS; w2 <- NOUN_WORDS2) + buf += s"$w1 $w2" + + mkGzip(NOUN_WORDS_FILE, stem(buf.toSeq)) + + private def stem(s: String): String = s.split(" ").map(stemmer.stem).mkString(" ") + private def stem(seq: Seq[String]): Seq[String] = seq.map(stem) + + private[impl] def mkFirstWords(): Unit = + val buf = new scala.collection.mutable.ArrayBuffer[String]() + + // is there + for (w1 <- QWORDS2) + buf += s"$w1" + + // please can you show what is + for (w0 <- WORDS2; w1 <- DWORDS_PRE; w2 <- DWORDS; w3 <- QWORDS; w4 <- QWORDS_SUP) + buf += s"$w0 $w1 $w2 $w3 $w4" + + // is there any + for (w1 <- QWORDS2; w2 <- QWORDS_ANY) + buf += s"$w1 $w2" + + // what is + for (w1 <- QWORDS; w2 <- QWORDS_SUP) + buf += s"$w1 $w2" + + // what + for (w1 <- QWORDS) + buf += s"$w1" + + // please what is + for (w0 <- WORDS2; w1 <- QWORDS; w2 <- QWORDS_SUP) + buf += s"$w0 $w1 $w2" + + // please what + for (w0 <- WORDS2; w1 <- QWORDS) + buf += s"$w0 $w1" + + // what is please + for (w1 <- QWORDS; w2 <- QWORDS_SUP; w3 <- WORDS2) + buf += s"$w1 $w2 $w3" + + // show me + for (w1 <- DWORDS; w2 <- DWORDS_SUP) + buf += s"$w1 $w2" + + // please show me + for (w0 <- WORDS2; w1 <- DWORDS; w2 <- DWORDS_SUP) + buf += s"$w0 $w1 $w2" + + // please show + for (w0 <- WORDS2; w1 <- DWORDS) + buf += s"$w0 $w1" + + // show me please + for (w1 <- DWORDS; w2 <- DWORDS_SUP; w3 <- WORDS2) + buf += s"$w1 $w2 $w3" + + // show please + for (w1 <- DWORDS; w3 <- WORDS2) + buf += s"$w1 $w3" + + // show + for (w <- DWORDS) + buf += s"$w" + + // can you please show me + for (w0 <- DWORDS_PRE; w1 <- WORDS2; w2 <- DWORDS; w3 <- DWORDS_SUP) + buf += s"$w0 $w1 $w2 $w3" + + // can you please show + for (w0 <- DWORDS_PRE; w1 <- WORDS2; w2 <- DWORDS) + buf += s"$w0 $w1 $w2" + + // please can you show me + for (w0 <- WORDS2; w1 <- DWORDS_PRE; w2 <- DWORDS; w3 <- DWORDS_SUP) + buf += s"$w0 $w1 $w2 $w3" + + // please can you show + for (w0 <- WORDS2; w1 <- DWORDS_PRE; w2 <- DWORDS) + buf += s"$w0 $w1 $w2" + + // can you show me + for (w0 <- DWORDS_PRE; w2 <- DWORDS; w3 <- DWORDS_SUP) + buf += s"$w0 $w2 $w3" + + // can you show + for (w0 <- DWORDS_PRE; w2 <- DWORDS) + buf += s"$w0 $w2" + + // can you please show what is + for (w0 <- DWORDS_PRE; w1 <- WORDS2; w2 <- DWORDS; w3 <- QWORDS; w4 <- QWORDS_SUP) + buf += s"$w0 $w1 $w2 $w3 $w4" + + // can you please + for (w0 <- DWORDS_PRE; w1 <- WORDS2) + buf += s"$w0 $w1" + + // can you please show what + for (w0 <- DWORDS_PRE; w1 <- WORDS2; w2 <- DWORDS; w3 <- QWORDS) + buf += s"$w0 $w1 $w2 $w3" + + // please can you show what + for (w0 <- WORDS2; w1 <- DWORDS_PRE; w2 <- DWORDS; w3 <- QWORDS) + buf += s"$w0 $w1 $w2 $w3" + + // can you show what is + for (w0 <- DWORDS_PRE; w1 <- DWORDS; w3 <- QWORDS; w4 <- QWORDS_SUP) + buf += s"$w0 $w1 $w3 $w4" + + // can you show what + for (w0 <- DWORDS_PRE; w1 <- DWORDS; w3 <- QWORDS) + buf += s"$w0 $w1 $w3" + + // can you please show me what is + for (w0 <- DWORDS_PRE; w1 <- WORDS2; w2 <- DWORDS; w3 <- DWORDS_SUP; w4 <- QWORDS; w5 <- QWORDS_SUP) + buf += s"$w0 $w1 $w2 $w3 $w4 $w5" + + // can you please show me what + for (w0 <- DWORDS_PRE; w1 <- WORDS2; w2 <- DWORDS; w3 <- DWORDS_SUP; w4 <- QWORDS) + buf += s"$w0 $w1 $w2 $w3 $w4" + + // please can you show me what is + for (w0 <- WORDS2; w1 <- DWORDS_PRE; w2 <- DWORDS; w3 <- DWORDS_SUP; w4 <- QWORDS; w5 <- QWORDS_SUP) + buf += s"$w0 $w1 $w2 $w3 $w4 $w5" + + // please can you show me what + for (w0 <- WORDS2; w1 <- DWORDS_PRE; w2 <- DWORDS; w3 <- DWORDS_SUP; w4 <- QWORDS) + buf += s"$w0 $w1 $w2 $w3 $w4" + + // can you show me 
what is + for (w0 <- DWORDS_PRE; w1 <- DWORDS; w2 <- DWORDS_SUP; w3 <- QWORDS; w4 <- QWORDS_SUP) + buf += s"$w0 $w1 $w2 $w3 $w4" + + // can you show me what + for (w0 <- DWORDS_PRE; w1 <- DWORDS; w2 <- DWORDS_SUP; w3 <- QWORDS) + buf += s"$w0 $w1 $w2 $w3" + + mkGzip(FIRST_WORDS_FILE, stem(buf.toSeq)) + + mkFirstWords() + mkNounWords() + + mkGzip(POS_WORDS_FILE, stem(mkPossessiveStopWords)) + + sys.exit() diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCStopWordsProcessor.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCStopWordsProcessor.scala new file mode 100644 index 0000000..408cdbe --- /dev/null +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCStopWordsProcessor.scala @@ -0,0 +1,646 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl + +import com.typesafe.scalalogging.LazyLogging +import opennlp.tools.stemmer.PorterStemmer +import org.apache.nlpcraft.internal.util.NCUtils +import org.apache.nlpcraft.{NCException, NCLifecycle, NCParameterizedAdapter, NCToken} + +import scala.annotation.tailrec +import scala.collection.{Seq, mutable} + +/** + * Stop-word and stop-sentence enricher. + */ +private[impl] object NCStopWordsProcessor extends LazyLogging: + // Condition types. + type Wildcard = (String, String) + type Word = String + type Sentence = Seq[NCToken] + + /** All POSes set. http://www.clips.ua.ac.be/pages/mbsp-tags */ + private final val POSES = Set( + "CC", + "CD", + "DT", + "EX", + "FW", + "IN", + "JJ", + "JJR", + "JJS", + "LS", + "MD", + "NN", + "NNS", + "NNP", + "NNPS", + "PDT", + "POS", + "PRP", + "PRP$", + "RB", + "RBR", + "RBS", + "RP", + "SYM", + "TO", + "UH", + "VB", + "VBZ", + "VBP", + "VBD", + "VBN", + "VBG", + "WDT", + "WP", + "WP$", + "WRB", + ".", + ",", + ":", + "(", + ")", + "--" // Synthetic POS. + ) + + private final val STOP_BEFORE_STOP: Seq[Word] = Seq("DT", "PRP", "PRP$", "WDT", "WP", "WP$", "WRB") + + private var percents: Set[String] = _ + private var possessiveWords: Set[String] = _ + private var firstWords: Set[String] = _ + private var nounWords: Set[String] = _ + private var stopWords: StopWordHolder = _ + private var exceptions: StopWordHolder = _ + + // TODO: lifecycle. + start() + + /** + * Stop words holder, used for hash search. + * + * @param any Any POSes container. + * @param includes Included by POS container. + * @param excludes Excluded by POS container. 
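+      *
+      * Illustrative behavior (drawn from `matches` below): the word "the" with
+      * POS "DT" matches when it is in `any` or in `includes("DT")`, and is not
+      * listed in `excludes("DT")`.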
+ */ + private case class HashHolder( + any: Set[Word], + includes: Map[String, Set[Word]], + excludes: Map[String, Set[Word]] + ): + def matches(s: String, posOpt: Option[String]): Boolean = + posOpt match + case Some(pos) => + !excludes.getOrElse(pos, Set.empty).contains(s) && + (any.contains(s) || includes.getOrElse(pos, Set.empty).contains(s)) + case _ => any.contains(s) + + /** + * Stop words holder, used for scanning. + * + * @param any Any POSes container. + * @param includes Included by POS container. + * @param excludes Excluded by POS container. + */ + private case class ScanHolder( + any: Set[Wildcard], + includes: Map[String, Set[Wildcard]], + excludes: Map[String, Set[Wildcard]] + ): + require(!any.exists { case (begin, end) => begin.isEmpty && end.isEmpty }) + + // Optimization for full wildcard cases (configurations like * | DT) + private val inclPoses = filterPoses(includes) + private val exclPoses = filterPoses(excludes) + + private def filterPoses(m: Map[String, Set[Wildcard]]): Set[String] = + m.filter { case(_, pair) => pair.exists { case (begin, end) => begin.isEmpty && end.isEmpty } }.keySet + + private def matches(s: String, set: Set[Wildcard]): Boolean = + set.exists { case (b, e) => (b.isEmpty || s.startsWith(b)) && (e.isEmpty || s.endsWith(e)) } + + def matches(s: String, posOpt: Option[String]): Boolean = + if (s.contains(' ')) + false + else + posOpt match + case Some(pos) => + !exclPoses.contains(pos) && + !matches(s, excludes.getOrElse(pos, Set.empty)) && + ( + inclPoses.contains(pos) || + matches(s, any) || + matches(s, includes.getOrElse(pos, Set.empty)) + ) + case _ => throw new AssertionError(s"Unexpected missed POS.") + + /** + * Stop words data holder. + * + * @param stems Stems data holder. + * @param lemmas Lemmas data holder. + * @param origins Origins data holder. + * @param wildcardsLemmas Wildcards lemmas data holder. + * @param wildcardsOrigins Wildcards origins data holder. + */ + private case class StopWordHolder( + stems: HashHolder, + lemmas: HashHolder, + origins: HashHolder, + wildcardsLemmas: ScanHolder, + wildcardsOrigins: ScanHolder + ): + def matches(toks: Seq[NCToken]): Boolean = + val posOpt = + toks.size match + case 0 => throw new AssertionError(s"Unexpected empty tokens.") + case 1 => Some(toks.head.getPos) + case _ => None + + // Hash access. + stems.matches(toStemKey(toks), posOpt) || + lemmas.matches(toLemmaKey(toks), posOpt) || + origins.matches(toOriginalKey(toks), posOpt) || + // Scan access. + wildcardsLemmas.matches(toLemmaKey(toks), posOpt) || + wildcardsOrigins.matches(toOriginalKey(toks), posOpt) + + private def toStemKey(toks: Seq[NCToken]): String = toks.map(_.getStem).mkString(" ") + private def toLemmaKey(toks: Seq[NCToken]): String = toks.map(_.getLemma).mkString(" ") + private def toValueKey(toks: Seq[NCToken]): String = toks.map(_.getOriginalText.toLowerCase).mkString(" ") + private def toOriginalKey(toks: Seq[NCToken]): String = toks.map(_.getOriginalText).mkString(" ") + + /** + * Parses configuration template. + * + * @param stemmer Stemmer. + * @param lines Configuration file content. + * @return Holder and `is-exception` flag. + */ + @throws[NCException] + private def readStopWords(stemmer: PorterStemmer, lines: Seq[String]): Map[Boolean, StopWordHolder] = + // 1. Prepares accumulation data structure. 
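+        // Illustrative walk-through of the line grammar parsed in step 2 below
+        // (an informal reading of the examples in stop_words.txt, not a formal spec):
+        //   "~dif*ly | JJ JJR JJS" parses to isExc = true, wildcard = ("dif", "ly"),
+        //   poses = Map("JJ" -> true, "JJR" -> true, "JJS" -> true).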
+ object WordForm extends Enumeration: + type WordForm = Value + + val STEM, LEM, ORIG = Value + + import WordForm._ + + class Condition[T]: + val any = mutable.HashSet.empty[T] + val includes = mutable.HashMap.empty[String, mutable.HashSet[T]] + val excludes = mutable.HashMap.empty[String, mutable.HashSet[T]] + + def addCondition(cond: T, poses: Map[String, Boolean]): Any = + if (poses.isEmpty) + any += cond + else + def add(m: mutable.HashMap[String, mutable.HashSet[T]], incl: Boolean): Unit = + poses.filter { case (_, isIncl) => isIncl == incl }.keys.foreach(pos => + m.get(pos) match + case Some(set) => set.add(cond) + case _ => + val set = mutable.HashSet.empty[T] + + set += cond + + m += pos -> set + ) + + add(includes, incl = true) + add(excludes, incl = false) + + type Key = (Boolean, WordForm) + def mkMap[T](mkT: Unit => T): Map[Key, T] = + val m = mutable.Map.empty[Key, T] + + def add(f: WordForm, mkT: Unit => T, isExc: Boolean): Unit = + val tuple: (Key, T) = (isExc, f) -> mkT(()) + + m += tuple._1 -> tuple._2 + + WordForm.values.foreach(f => + add(f, mkT, isExc = true) + add(f, mkT, isExc = false) + ) + + m.toMap + + // Prepares collections. + val mHash = mkMap(_ => new Condition[Word]()) + val mScan = mkMap(_ => new Condition[Wildcard]()) + + // 2. Accumulates data of each parsed line. + for (line <- lines) + @throws[NCException] + def throwError(msg: String): Unit = + throw new NCException(s"Invalid stop word configuration [line=$line, reason=$msg]") + + var s = line.trim + + // Word with size 1 word should contains letter only. + if (s.length == 1 && !s.head.isLetter) + throwError("Invalid stop word") + + @throws[NCException] + def checkSingle(ch: Char): Unit = if (s.count(_ == ch) > 1) throwError(s"Unexpected symbols count: $ch") + + // Confusing special symbols. + checkSingle('@') + checkSingle('|') + checkSingle('*') + + val isExc = line.head == '~' + + if (isExc) + s = line.drop(1) + + val idxPos = s.indexOf("|") + + val poses: Map[String, Boolean] = + if (idxPos > 0) + s. + drop(idxPos + 1). + trim.split(" "). + map(_.trim.toUpperCase). + filter(_.nonEmpty). + toSeq. + map(p => if (p.head == '~') p.drop(1).strip -> false else p -> true). + toMap + else + Map.empty + + if (!poses.keys.forall(POSES.contains)) + throwError(s"Invalid POSes: ${poses.keys.mkString(", ")}") + + val hasPoses = poses.nonEmpty + + if (hasPoses) + s = s.take(idxPos).trim + + val isMultiWord = s.contains(' ') + + // Confusing POSes. + if (poses.nonEmpty && isMultiWord) + throwError("POSes cannot be defined for multiple stop words.") + + var isCase = false + + if (s.head == '@') + s = s.drop(1) + + // Empty word. + if (s.isEmpty) + throwError("Empty word") + + isCase = true + + val idxWild = s.indexOf("*") + + if (idxWild >= 0 && isMultiWord) + throwError("Wildcard cannot be defined for multiple stop words.") + + if (idxWild < 0) + val (word, form) = + if (isCase) + (s, ORIG) + else { + if (!hasPoses) (stemmer.stem(s), STEM) else (stemmer.stem(s), LEM) + } + + mHash((isExc, form)).addCondition(word, poses) + else + val b = s.take(idxWild) + val e = s.drop(idxWild + 1) + + if (b.isEmpty && e.isEmpty && !hasPoses) + throwError("Too general wildcard definition.") + + mScan((isExc, if (isCase) ORIG else LEM)).addCondition((b, e), poses) + + // 3. Converts data to service format. 
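+        // The resulting map is keyed by the `is-exception` flag: m(false) holds the
+        // stop words proper and m(true) the exceptions (see `start()`, which reads
+        // both holders out of this map).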
+ def toImmutable[T](m: mutable.HashMap[String, mutable.HashSet[T]]): Map[String, Set[T]] = m.map(p => p._1 -> p._2.toSet).toMap + + Seq(true, false).map(isExc => + def mkHolder[T, R]( + m: Map[(Boolean, WordForm), Condition[T]], + form: WordForm, + mkInstance: (Set[T], Map[String, Set[T]], Map[String, Set[T]]) => R + ): R = + val any = m((isExc, form)).any.toSet + val incl = toImmutable(m((isExc, form)).includes) + val excl = toImmutable(m((isExc, form)).excludes) + + mkInstance(any ++ excl.values.flatten, incl, excl) + + def mkHash(form: WordForm): HashHolder = mkHolder(mHash, form, HashHolder.apply) + def mkScan(form: WordForm): ScanHolder = mkHolder(mScan, form, ScanHolder.apply) + + isExc -> StopWordHolder(mkHash(STEM), mkHash(LEM), mkHash(ORIG), mkScan(LEM), mkScan(ORIG)) + ).toMap + + private def isVerb(pos: String): Boolean = pos.head == 'V' + + /** + * Marks words before stop words. + * + * @param ns Sentence. + * @param stopPoses Stop POSes. + * @param lastIdx Last index. + * @param isException Function which return `stop word exception` flag. + * @param stops Stopwords tokens. + */ + @tailrec + private def markBefore( + ns: Sentence, + stopPoses: Seq[String], + lastIdx: Int, + isException: Seq[NCToken] => Boolean, + stops: mutable.HashSet[NCToken] + ): Boolean = + var stop = true + + for ( + (tok, idx) <- ns.zipWithIndex + if idx != lastIdx && + !tok.isStopWord && + !isException(Seq(tok)) && + stopPoses.contains(tok.getPos) && + ns(idx + 1).isStopWord) + stops += tok + + stop = false + + if (stop) true else markBefore(ns, stopPoses, lastIdx, isException, stops) + + /** + * Checks value cached or not. + * + * @param toks Tokens. + * @param cache Cache map. + * @param get Calculation method based on given tokens. + */ + private def exists(toks: Seq[NCToken], cache: mutable.HashMap[Seq[NCToken], Boolean], get: Seq[NCToken] => Boolean): Boolean = + cache.get(toks) match + case Some(b) => b + case None => + val b = get(toks) + + cache += toks -> b + + b + /** + * Marks as stopwords, words with POS from configured list, which also placed before another stop words. + */ + private def processCommonStops(ns: Sentence, stops: mutable.HashSet[NCToken], exclStopWordsStems: Set[String]): Unit = + /** + * Marks as stopwords, words with POS from configured list, which also placed before another stop words. + */ + @tailrec + def processCommonStops0(ns: Sentence): Unit = + val max = ns.size - 1 + var stop = true + + for ( + (tok, idx) <- ns.zipWithIndex + if idx != max && + !tok.isStopWord && + !exclStopWordsStems.contains(tok.getStem) && + POSES.contains(tok.getPos) && + ns(idx + 1).isStopWord + ) + stops += tok + + stop = false + + if (!stop) + processCommonStops0(ns) + + processCommonStops0(ns) + + private def start(): Unit = + val stemmer = new PorterStemmer + + percents = Set( + "%", + "pct", + "pc", + "percentage", + "proportion", + "interest", + "rate", + "percent" + ).map(stemmer.stem) + + // Stemmatization is done already by generator. + possessiveWords = NCUtils.readTextGzipResource("stopwords/possessive_words.txt.gz", "UTF-8", logger).toSet + firstWords = NCUtils.readTextGzipResource("stopwords/first_words.txt.gz", "UTF-8", logger).toSet + nounWords = NCUtils.readTextGzipResource("stopwords/noun_words.txt.gz", "UTF-8", logger).toSet + + // Case sensitive. + val m = + readStopWords( + stemmer, + NCUtils.readResource("stopwords/stop_words.txt", "UTF-8", logger). 
+ map(_.strip).filter(s => s.nonEmpty && !s.startsWith("#")) + ) + + stopWords = m(false) + exceptions = m(true) + +import NCStopWordsProcessor._ + +private[impl] class NCStopWordsProcessor(stemmer: PorterStemmer) extends LazyLogging: + private var addStopWords = Set.empty[String] + private var addStopWordsStems = Set.empty[String] + private var exclStopWords = Set.empty[String] + private var exclStopWordsStems = Set.empty[String] + + /** + * + * @param addStopWords + */ + def setAdditionalStopWords(addStopWords: Set[String]): Unit = + require(addStopWords != null) + + this.addStopWords = addStopWords + this.addStopWordsStems = stemmer.synchronized { addStopWords.map(stemmer.stem) } + + /** + * + * @return + */ + def getAdditionalStopWords: Set[String] = addStopWordsStems + + /** + * + * @param exclStopWords + */ + def setExcludedStopWords(exclStopWords: Set[String]): Unit = + require(exclStopWords != null) + + this.exclStopWords = exclStopWords + this.exclStopWordsStems = stemmer.synchronized { exclStopWords.map(stemmer.stem) } + + /** + * + * @return + */ + def getExcludedStopWords: Set[String] = exclStopWords + + /** + * + * @param ns + */ + @throws[NCException] + def process(ns: Sentence): Sentence = + // Stop words and exceptions caches for this sentence. + val cacheSw = mutable.HashMap.empty[Seq[NCToken], Boolean] + val cacheEx = mutable.HashMap.empty[Seq[NCToken], Boolean] + + def isStop(toks: Seq[NCToken]): Boolean = exists(toks, cacheSw, stopWords.matches) + def isException(toks: Seq[NCToken]): Boolean = exists(toks, cacheEx, exceptions.matches) + + val stops = mutable.HashSet.empty[NCToken] + + for (p <- ns.zipWithIndex) + val tok = p._1 + val idx = p._2 + val pos = tok.getPos + val lemma = tok.getLemma + val stem = tok.getStem + + def isFirst: Boolean = idx == 0 + def isLast: Boolean = idx == ns.length - 1 + + def next(): NCToken = ns(idx + 1) + def prev(): NCToken = ns(idx - 1) + + def isCommonVerbs(firstVerb: String, secondVerb: String): Boolean = + isVerb(pos) && lemma == secondVerb || + (isVerb(pos) && lemma == firstVerb && !isLast && isVerb(next().getPos) && next().getLemma == secondVerb) + + // +---------------------------------+ + // | Pass #1. | + // | POS tags and manual resolution. | + // +---------------------------------+ + val stop = + !isException(Seq(tok)) && + (// Percents after numbers. + // 1. Word from 'percentage' list. + percents.contains(stem) && + // 2. Number before. + !isFirst && prev().getPos == "CD" && + // 3. It's last word or any words after except numbers. + (isLast || next().getPos != "CD") + ) || + // be, was, is etc. or has been etc. + isCommonVerbs("have", "be") || + // be, was, is etc. or have done etc. + isCommonVerbs("have", "do") + if (stop) + stops += tok + + // +--------------------------------------+ + // | Pass #2. | + // | Find all words from predefined list. | + // +--------------------------------------+ + val buf = mutable.Buffer.empty[Seq[NCToken]] + val mix = NCUtils.tokenMixWithStopWords(ns) + + for (toks <- mix if !buf.exists(_.containsSlice(toks)) && isStop(toks) && !isException(toks)) + toks.foreach(tok => stops += tok) + buf += toks + + // Capture the token mix at this point minus the initial stop words found up to this point. + val origToks: Seq[(Seq[NCToken], String)] = + (for (toks <- mix) yield toks.toSeq).map(s => s -> toStemKey(s)).toSeq + + // +--------------------------------------------+ + // | Pass #3. | + // | Check external possessive stop-word file. 
| + // +--------------------------------------------+ + for (tup <- origToks; key = tup._2 if possessiveWords.contains(key) && !isException(tup._1)) + tup._1.foreach(tok => stops += tok) + + // +--------------------------------------------------+ + // | Pass #4. | + // | Check for sentence beginners from external file. | + // +--------------------------------------------------+ + + val foundKeys = new mutable.HashSet[String]() + + // All sentence first stop words + first non stop word. + val startToks = ns.takeWhile(_.isStopWord) ++ ns.find(!_.isStopWord).map(p => p) + for (startTok <- startToks; tup <- origToks.filter(_._1.head == startTok); key = tup._2 + if firstWords.contains(key) && !isException(tup._1)) + tup._1.foreach(tok => stops += tok) + foundKeys += key + + // +-------------------------------------------------+ + // | Pass #5. | + // | Check for sentence beginners with ending nouns. | + // +-------------------------------------------------+ + for (tup <- origToks; key = tup._2 if !foundKeys.contains(key) && !isException(tup._1)) + foundKeys.find(key.startsWith) match + case Some(s) => + if (nounWords.contains(key.substring(s.length).strip)) + tup._1.foreach(tok => stops += tok) + case None => () + + // +-------------------------------------------------+ + // | Pass #6. | + // | Mark words with POSes before stop-words. | + // +-------------------------------------------------+ + markBefore(ns, STOP_BEFORE_STOP, ns.size - 1, isException, stops) + + // +-------------------------------------------------+ + // | Pass #7. | + // | Processing additional and excluded stop words. | + // +-------------------------------------------------+ + for (t <- ns if addStopWordsStems.contains(t.getStem)) + stops += t + + for (t <- stops.filter(t => exclStopWordsStems.contains(t.getStem))) + stops -= t + + // +-------------------------------------------------+ + // | Pass #8. | + // | Marks as stopwords, words with POS from | + // | configured list, which also placed before | + // | another stop words. 
| + // +-------------------------------------------------+ + processCommonStops(ns, stops, exclStopWordsStems) + + ns.map(tok => + if (stops.contains(tok)) + new NCParameterizedAdapter with NCToken: + override def getOriginalText: String = tok.getOriginalText + override def getNormalizedText: String = tok.getNormalizedText + override def getLemma: String = tok.getLemma + override def getStem: String = tok.getStem + override def getPos: String = tok.getPos + override def isStopWord: Boolean = true + override def getStartCharIndex: Int = tok.getStartCharIndex + override def getEndCharIndex: Int = tok.getEndCharIndex + override def getLength: Int = tok.getLength + else + tok + ) \ No newline at end of file diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala index 6855786..66d2e13 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala @@ -19,19 +19,23 @@ package org.apache.nlpcraft.internal.util import com.google.gson.GsonBuilder import com.typesafe.scalalogging.* -import org.apache.nlpcraft.NCException +import org.apache.nlpcraft.{NCException, NCToken} import org.apache.nlpcraft.internal.ansi.NCAnsi.* import java.io.* import java.net.* import java.util.Random import java.util.regex.Pattern +import java.util.zip.{GZIPInputStream, GZIPOutputStream} import scala.annotation.tailrec +import scala.collection.{IndexedSeq, Seq} import scala.concurrent.duration.Duration import scala.concurrent.{Await, ExecutionContext, Future} +import scala.io.Source import scala.sys.SystemProperties +import scala.util.Using import scala.util.control.Exception.ignoring - +import scala.io.BufferedSource /** * */ @@ -712,4 +716,289 @@ object NCUtils extends LazyLogging: * @param s * @return */ - def capitalize(s: String): String = s"${s.head.toUpper}${s.tail}" \ No newline at end of file + def capitalize(s: String): String = s"${s.head.toUpper}${s.tail}" + + /** + * Makes absolute path starting from working directory. + * + * @param path Path. + */ + def mkPath(path: String): String = new File(s"${new File("").getAbsolutePath}/$path").getAbsolutePath + + /** + * Generates read-only text file with given path and strings. + * Used by text files auto-generators. + * + * @param path Path of the output file. + * @param lines Text data. + * @param sort Whether to sort output or not. + */ + @throws[IOException] + def mkTextFile(path: String, lines: scala.Iterable[Any], sort: Boolean = true): Unit = + val file = new File(path) + + Using.resource(new PrintStream(file)) { + ps => + import java.util.* + + // Could be long for large sequences... + val seq = + if (sort) + lines.map(_.toString).toSeq.sorted + else + lines + + ps.println(s"#") + ps.println(s"# Licensed to the Apache Software Foundation (ASF) under one or more") + ps.println(s"# contributor license agreements. See the NOTICE file distributed with") + ps.println(s"# this work for additional information regarding copyright ownership.") + ps.println(s"# The ASF licenses this file to You under the Apache License, Version 2.0") + ps.println(s"# (the 'License'); you may not use this file except in compliance with") + ps.println(s"# the License. 
You may obtain a copy of the License at") + ps.println(s"#") + ps.println(s"# https://www.apache.org/licenses/LICENSE-2.0") + ps.println(s"#") + ps.println(s"# Unless required by applicable law or agreed to in writing, software") + ps.println(s"# distributed under the License is distributed on an 'AS IS' BASIS,") + ps.println(s"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.") + ps.println(s"# See the License for the specific language governing permissions and") + ps.println(s"# limitations under the License.") + ps.println(s"#") + ps.println(s"# Auto-generated on: ${new Date()}") + ps.println(s"# Total lines: ${seq.size}") + ps.println(s"#") + ps.println(s"# +-------------------------+") + ps.println(s"# | DO NOT MODIFY THIS FILE |") + ps.println(s"# +-------------------------+") + ps.println(s"#") + ps.println() + + seq.foreach(ps.println) + + // Make the file as read-only. + file.setWritable(false, false) + } + + // Ack. + println(s"File generated: $path") + + /** + * Reads lines from given file. + * + * @param path Zipped file path to read from. + * @param enc Encoding. + * @param log Logger to use. + */ + @throws[NCException] + def readGzipPath(path: String, enc: String = "UTF-8", log: Logger = logger): List[String] = + readGzipFile(new File(path), enc, log) + + /** + * Reads lines from given file. + * + * @param f Zipped file to read from. + * @param enc Encoding. + * @param log Logger to use. + */ + @throws[NCException] + def readGzipFile(f: File, enc: String, log: Logger = logger): List[String] = + try + Using.resource(Source.fromInputStream(new GZIPInputStream(new FileInputStream(f)), enc)) { src => + getAndLog(src.getLines().map(p => p).toList, f, log) + } + catch + case e: IOException => throw new NCException(s"Failed to read GZIP file: ${f.getAbsolutePath}", e) + + /** + * Reads bytes from given file. + * + * @param f File. + * @param log Logger. + */ + @throws[NCException] + def readFileBytes(f: File, log: Logger = logger): Array[Byte] = + try + val arr = new Array[Byte](f.length().toInt) + + Using.resource(new FileInputStream(f)) { in => + in.read(arr) + } + + getAndLog(arr, f, log) + catch + case e: IOException => throw new NCException(s"Error reading file: $f", e) + + + /** + * Gzip file. + * + * @param path File path. + * @param log Logger. + */ + @throws[NCException] + def gzipPath(path: String, log: Logger = logger): Unit = gzipFile(new File(path), log) + + /** + * Gzip file. + * + * @param f File. + * @param log Logger. + */ + @throws[NCException] + def gzipFile(f: File, log: Logger = logger): Unit = + val gz = s"${f.getAbsolutePath}.gz" + + // Do not user BOS here - it makes files corrupted. + try + Using.resource(new GZIPOutputStream(new FileOutputStream(gz))) { stream => + stream.write(readFileBytes(f)) + + stream.flush() + } + catch + case e: IOException => throw new NCException(s"Error gzip file: $f", e) + + if (!f.delete()) + throw new NCException(s"Error while deleting file: $f") + + logger.trace(s"File gzipped [source=$f, destination=$gz]") + + /** + * + * @param data + * @param f + * @param log + */ + private def getAndLog[T](data: T, f: File, log: Logger = logger): T = + log.trace(s"Loaded file: ${f.getAbsolutePath}") + + data + + /** + * Reads lines from given resource. + * + * @param res Resource path to read from. + * @param enc Encoding. + * @param log Logger to use. 
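+     *
+     * Illustrative call (assuming the resource is on the classpath):
+     *   `readResource("stopwords/stop_words.txt")`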
+ */ + @throws[NCException] + def readResource(res: String, enc: String = "UTF-8", log: Logger = logger): List[String] = readStream(getStream(res), enc, log) + + /** + * Reads lines from given stream. + * + * @param in Stream to read from. + * @param enc Encoding. + * @param log Logger to use. + */ + @throws[NCException] + def readStream(in: InputStream, enc: String = "UTF-8", log: Logger = logger): List[String] = + mapStream(in, enc, log, _.map(p => p).toList) + + /** + * Maps lines from the given stream to an object. + * + * @param in Stream to read from. + * @param enc Encoding. + * @param log Logger to use. + * @param mapper Function to read lines. + */ + @throws[NCException] + def mapStream[T](in: InputStream, enc: String, log: Logger = logger, mapper: Iterator[String] => T): T = + try + Using.resource(Source.fromInputStream(in, enc)) { src => + mapper(src.getLines()) + } + catch + case e: IOException => throw new NCException(s"Failed to read stream.", e) + + /** + * + * @param in + * @return + */ + private def readLcTrimFilter(in: BufferedSource): List[String] = + in.getLines().map(_.toLowerCase.strip).filter(s => s.nonEmpty && s.head!= '#').toList + + /** + * Reads lines from given stream converting to lower case, trimming, and filtering + * out empty lines and comments (starting with '#'). + * + * @param res Zipped resource to read from. + * @param enc Encoding. + * @param log Logger to use. + */ + @throws[NCException] + def readTextGzipResource(res: String, enc: String, log: Logger = logger): List[String] = + try + Using.resource(Source.fromInputStream(new GZIPInputStream(getStream(res)), enc)) { src => + readLcTrimFilter(src) + } + catch + case e: IOException => throw new NCException(s"Failed to read stream.", e) + + /** + * Gets all sequential permutations of tokens in this NLP sentence. + * + * For example, if NLP sentence contains "a, b, c, d" tokens, then + * this function will return the sequence of following token sequences in this order: + * "a b c d" + * "a b c" + * "b c d" + * "a b" + * "b c" + * "c d" + * "a" + * "b" + * "c" + * "d" + * + * NOTE: this method will not return any permutations with a quoted token. + * + * @param tokens Tokens. + * @param stopWords Whether or not include tokens marked as stop words. + * @param maxLen Maximum number of tokens in the sequence. + */ + def tokenMix(tokens: Seq[NCToken], stopWords: Boolean = false, maxLen: Int = Integer.MAX_VALUE): Seq[Seq[NCToken]] = + val toks = tokens.filter(t => stopWords || (!stopWords && !t.isStopWord)) + + (for (n <- toks.length until 0 by -1 if n <= maxLen) yield toks.sliding(n)).flatten + + /** + * Gets all sequential permutations of tokens in this NLP sentence. + * This method is like a 'tokenMix', but with all combinations of stop-words (with and without) + * + * @param tokens Tokens. + * @param maxLen Maximum number of tokens in the sequence. + */ + def tokenMixWithStopWords(tokens: Seq[NCToken], maxLen: Int = Integer.MAX_VALUE): Seq[Seq[NCToken]] = + /** + * Gets all combinations for sequence of mandatory tokens with stop-words and without. + * + * Example: + * 'A (stop), B, C(stop) -> [A, B, C]; [A, B]; [B, C], [B] + * 'A, B(stop), C(stop) -> [A, B, C]; [A, B]; [A, C], [A]. + * + * @param toks Tokens. 
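+         * @return All combinations with each stop word either kept or dropped; for k
+         *         stop words that is up to 2^k sequences (empty ones are filtered out).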
+ */ + def permutations(toks: Seq[NCToken]): Seq[Seq[NCToken]] = + def multiple(seq: Seq[Seq[Option[NCToken]]], t: NCToken): Seq[Seq[Option[NCToken]]] = + if (seq.isEmpty) + if (t.isStopWord) IndexedSeq(IndexedSeq(Some(t)), IndexedSeq(None)) else IndexedSeq(IndexedSeq(Some(t))) + else { + (for (subSeq <- seq) yield subSeq :+ Some(t)) ++ + (if (t.isStopWord) for (subSeq <- seq) yield subSeq :+ None else Seq.empty) + } + + var res: Seq[Seq[Option[NCToken]]] = Seq.empty + + for (t <- toks) + res = multiple(res, t) + + res.map(_.flatten).filter(_.nonEmpty) + + tokenMix(tokens, stopWords = true, maxLen). + flatMap(permutations). + filter(_.nonEmpty). + distinct. + sortBy(seq => (-seq.length, seq.head.getStartCharIndex)) diff --git a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserSpec.scala b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserSpec.scala index 43702f8..2f92878 100644 --- a/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserSpec.scala +++ b/nlpcraft/src/test/java/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserSpec.scala @@ -24,7 +24,7 @@ import java.util class NCOpenNlpTokenParserSpec { @Test - def test(): Unit = { + def test(): Unit = val parser = new NCOpenNlpTokenParser( "opennlp/en-token.bin", @@ -35,7 +35,7 @@ class NCOpenNlpTokenParserSpec { parser.start() val toks = parser.parse( - new NCRequest { + new NCRequest: override def getUserId: String = null override def getRequestId: String = null override def getNormalizedText: String = getOriginalText.toLowerCase @@ -43,7 +43,6 @@ class NCOpenNlpTokenParserSpec { override def getReceiveTimestamp: Long = 0 override def getUserAgent: String = null override def getRequestData: util.Map[String, AnyRef] = null - } ) assert(toks != null) @@ -62,4 +61,3 @@ class NCOpenNlpTokenParserSpec { ) ) } -} diff --git a/nlpcraft/src/test/resources/log4j2.xml b/nlpcraft/src/test/resources/log4j2.xml new file mode 100644 index 0000000..d9a627b --- /dev/null +++ b/nlpcraft/src/test/resources/log4j2.xml @@ -0,0 +1,50 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<Configuration status="INFO"> + <Properties> + <Property name="pattern">%highlight{%d{MMM-dd|HH:mm:ss}|%level{WARN=WRN, DEBUG=DBG, ERROR=ERR, TRACE=TRC, INFO=INF}| %m%n}</Property> + </Properties> + <Appenders> + <Console name="stdout" target="SYSTEM_OUT"> + <PatternLayout pattern="${pattern}"/> + <ThresholdFilter level="WARN" onMatch="DENY" onMismatch="ACCEPT"/> + </Console> + <Console name="stderr" target="SYSTEM_ERR"> + <PatternLayout pattern="${pattern}"/> + <ThresholdFilter level="WARN" onMatch="ACCEPT" onMismatch="DENY"/> + </Console> + </Appenders> + <Loggers> + <Root level="INFO"> + <AppenderRef ref="stdout"/> + <AppenderRef ref="stderr"/> + </Root> + <Logger name="org.apache.nlpcraft" level="INFO" additivity="false"> + <AppenderRef ref="stdout"/> + <AppenderRef ref="stderr"/> + </Logger> + <Logger name="com.mchange" level="WARN" additivity="false"> + <AppenderRef ref="stderr"/> + </Logger> + <Logger name="org.apache.ignite" level="WARN" additivity="false"> + <AppenderRef ref="stderr"/> + </Logger> + </Loggers> +</Configuration> \ No newline at end of file diff --git a/pom.xml b/pom.xml index 223bf55..039dc9d 100644 --- a/pom.xml +++ b/pom.xml @@ -180,6 +180,18 @@ <version>${apache.opennlp.ver}</version> </dependency> + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-slf4j-impl</artifactId> + <version>${log4j.ver}</version> + </dependency> + + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-api</artifactId> + <version>${log4j.ver}</version> + </dependency> + <!-- JLine dependencies. ==================
