This is an automated email from the ASF dual-hosted git repository. sergeykamov pushed a commit to branch NLPCRAFT-520 in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 6d63a451ed2628aa2f3f5718606ed814ac6a52a6 Author: Sergey Kamov <[email protected]> AuthorDate: Fri Dec 9 12:12:49 2022 +0400 WIP. --- .../nlpcraft/examples/time/CalculatorModel.scala | 4 +- .../entity/parser/NCFrSemanticEntityParser.scala | 3 +- .../entity/parser/NCRuSemanticEntityParser.scala | 3 +- .../components/PizzeriaModelPipeline.scala | 5 +- .../org/apache/nlpcraft/NCPipelineBuilder.scala | 54 ++++++++++++++-------- .../NCStemmer.scala} | 11 ++--- ...nricher.scala => NCBracketsTokenEnricher.scala} | 4 +- ...icher.scala => NCDictionaryTokenEnricher.scala} | 14 +++--- .../nlp/enrichers/NCEnStopWordsTokenEnricher.scala | 2 +- .../nlp/enrichers/NCOpenNLPTokenEnricher.scala | 18 ++++---- ...nEnricher.scala => NCQuotesTokenEnricher.scala} | 6 +-- ...icher.scala => NCSwearWordsTokenEnricher.scala} | 17 +++---- .../nlpcraft/nlp/parsers/NCNLPEntityParser.scala | 13 +++--- .../nlp/parsers/NCOpenNLPEntityParser.scala | 16 +++---- .../nlp/parsers/NCOpenNLPTokenParser.scala | 10 ++-- .../nlpcraft/nlp/parsers/NCSemanticElement.scala | 3 +- .../nlp/parsers/NCSemanticEntityParser.scala | 32 ++++++------- .../parsers/impl/NCSemanticSynonymsProcessor.scala | 5 +- .../apache/nlpcraft/nlp/NCTokenEnricherSpec.scala | 2 +- .../enrichers/NCBracketsTokenEnricherSpec.scala | 4 +- .../enrichers/NCDictionaryTokenEnricherSpec.scala | 6 +-- .../nlp/enrichers/NCQuotesTokenEnricherSpec.scala | 2 +- .../enrichers/NCSwearWordsTokenEnricherSpec.scala | 15 ++++-- .../parsers/NCSemanticEntityParserLemmaSpec.scala | 3 +- .../org/apache/nlpcraft/nlp/util/NCTestUtils.scala | 7 +-- 25 files changed, 144 insertions(+), 115 deletions(-) diff --git a/nlpcraft-examples/calculator/src/main/scala/org/apache/nlpcraft/examples/time/CalculatorModel.scala b/nlpcraft-examples/calculator/src/main/scala/org/apache/nlpcraft/examples/time/CalculatorModel.scala index 0aecbc86..e1eb0a9c 100644 --- a/nlpcraft-examples/calculator/src/main/scala/org/apache/nlpcraft/examples/time/CalculatorModel.scala +++ b/nlpcraft-examples/calculator/src/main/scala/org/apache/nlpcraft/examples/time/CalculatorModel.scala @@ -65,7 +65,7 @@ class CalculatorModel extends NCModel(NCModelConfig("nlpcraft.calculator.ex", "C @NCIntent( "intent=calc options={ 'ordered': true }" + " term(x)={# == 'stanford:number'}" + - " term(op)={has(list('+', '-', '*', '/'), meta_ent('nlp:token:text')) == true}" + + " term(op)={has(list('+', '-', '*', '/'), meta_ent('nlp:entity:text')) == true}" + " term(y)={# == 'stanford:number'}" ) @unused def onMatch( @@ -78,7 +78,7 @@ class CalculatorModel extends NCModel(NCModelConfig("nlpcraft.calculator.ex", "C @NCIntent( "intent=calcMem options={ 'ordered': true }" + - " term(op)={has(list('+', '-', '*', '/'), meta_ent('nlp:token:text')) == true}" + + " term(op)={has(list('+', '-', '*', '/'), meta_ent('nlp:entity:text')) == true}" + " term(y)={# == 'stanford:number'}" ) @unused def onMatchMem( diff --git a/nlpcraft-examples/lightswitch-fr/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCFrSemanticEntityParser.scala b/nlpcraft-examples/lightswitch-fr/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCFrSemanticEntityParser.scala index 55350bf1..c13251f3 100644 --- a/nlpcraft-examples/lightswitch-fr/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCFrSemanticEntityParser.scala +++ b/nlpcraft-examples/lightswitch-fr/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCFrSemanticEntityParser.scala @@ -20,6 +20,7 @@ package org.apache.nlpcraft.examples.lightswitch.nlp.entity.parser import opennlp.tools.stemmer.snowball.SnowballStemmer import org.apache.nlpcraft.examples.lightswitch.nlp.token.parser.NCFrTokenParser import org.apache.nlpcraft.* +import org.apache.nlpcraft.nlp.common.NCStemmer import org.apache.nlpcraft.nlp.parsers.* /** @@ -27,7 +28,7 @@ import org.apache.nlpcraft.nlp.parsers.* * @param src */ class NCFrSemanticEntityParser(src: String) extends NCSemanticEntityParser( - new NCSemanticStemmer: + new NCStemmer: private val stemmer = new SnowballStemmer(SnowballStemmer.ALGORITHM.FRENCH) override def stem(txt: String): String = stemmer.synchronized { stemmer.stem(txt.toLowerCase).toString } , diff --git a/nlpcraft-examples/lightswitch-ru/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCRuSemanticEntityParser.scala b/nlpcraft-examples/lightswitch-ru/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCRuSemanticEntityParser.scala index 695a118d..e4c48b94 100644 --- a/nlpcraft-examples/lightswitch-ru/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCRuSemanticEntityParser.scala +++ b/nlpcraft-examples/lightswitch-ru/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCRuSemanticEntityParser.scala @@ -21,13 +21,14 @@ import opennlp.tools.stemmer.snowball.SnowballStemmer import org.apache.nlpcraft.examples.lightswitch.nlp.token.parser.NCRuTokenParser import org.apache.nlpcraft.nlp.parsers.* import org.apache.nlpcraft.* +import org.apache.nlpcraft.nlp.common.NCStemmer /** * * @param src */ class NCRuSemanticEntityParser(src: String) extends NCSemanticEntityParser( - new NCSemanticStemmer: + new NCStemmer: private val stemmer = new SnowballStemmer(SnowballStemmer.ALGORITHM.RUSSIAN) override def stem(txt: String): String = stemmer.synchronized { stemmer.stem(txt.toLowerCase).toString } , diff --git a/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala b/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala index 046cf159..c9e86301 100644 --- a/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala +++ b/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala @@ -6,8 +6,9 @@ import org.apache.nlpcraft.nlp.parsers.* import org.apache.nlpcraft.nlp.entity.parser.stanford.NCStanfordNLPEntityParser import org.apache.nlpcraft.nlp.token.parser.stanford.NCStanfordNLPTokenParser import org.apache.nlpcraft.* +import org.apache.nlpcraft.nlp.common.NCStemmer import org.apache.nlpcraft.nlp.enrichers.NCEnStopWordsTokenEnricher -import org.apache.nlpcraft.nlp.parsers.{NCSemanticEntityParser, NCSemanticStemmer} +import org.apache.nlpcraft.nlp.parsers.NCSemanticEntityParser import java.util.Properties @@ -20,7 +21,7 @@ object PizzeriaModelPipeline: props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner") new StanfordCoreNLP(props) val tokParser = new NCStanfordNLPTokenParser(stanford) - val stemmer = new NCSemanticStemmer(): + val stemmer = new NCStemmer(): private val ps = new PorterStemmer override def stem(txt: String): String = ps.synchronized { ps.stem(txt) } diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala index fb21dcce..371b96e5 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala @@ -19,6 +19,7 @@ package org.apache.nlpcraft import opennlp.tools.stemmer.PorterStemmer import org.apache.nlpcraft.internal.util.NCResourceReader +import org.apache.nlpcraft.nlp.common.NCStemmer import org.apache.nlpcraft.nlp.parsers.* import org.apache.nlpcraft.nlp.enrichers.* @@ -39,8 +40,8 @@ class NCPipelineBuilder: private val entMappers: Buf[NCEntityMapper] = Buf.empty private val varFilters: Buf[NCVariantFilter] = Buf.empty - private def mkEnStemmer: NCSemanticStemmer = - new NCSemanticStemmer: + private def mkEnStemmer: NCStemmer = + new NCStemmer: final private val ps: PorterStemmer = new PorterStemmer override def stem(txt: String): String = ps.stem(txt) @@ -219,10 +220,13 @@ class NCPipelineBuilder: tokParser = mkEnOpenNLPTokenParser.? tokEnrichers += new NCOpenNLPTokenEnricher(NCResourceReader.getPath("opennlp/en-pos-maxent.bin"), NCResourceReader.getPath("opennlp/en-lemmatizer.dict")) tokEnrichers += new NCEnStopWordsTokenEnricher - tokEnrichers += new NCEnSwearWordsTokenEnricher(NCResourceReader.getPath("badfilter/swear_words.txt")) - tokEnrichers += new NCEnQuotesTokenEnricher - tokEnrichers += new NCEnDictionaryTokenEnricher - tokEnrichers += new NCEnBracketsTokenEnricher + tokEnrichers += new NCSwearWordsTokenEnricher( + NCResourceReader.getPath("badfilter/swear_words.txt"), + mkEnStemmer + ) + tokEnrichers += new NCQuotesTokenEnricher + tokEnrichers += new NCDictionaryTokenEnricher("moby/354984si.ngl") + tokEnrichers += new NCBracketsTokenEnricher /** * Shortcut to configure pipeline with [[NCSemanticEntityParser]]. @@ -238,11 +242,15 @@ class NCPipelineBuilder: * [[https://raw.githubusercontent.com/richardwilly98/elasticsearch-opennlp-auto-tagging/master/src/main/resources/models/en-lemmatizer.dict en-lemmatizer.dict]] model for * [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/lemmatizer/DictionaryLemmatizer.html DictionaryLemmatizer]]. * - [[NCEnStopWordsTokenEnricher Stop-word]] token enricher. - * - [[NCEnSwearWordsTokenEnricher Swear-word]] token enricher initialized by + * - [[NCSwearWordsTokenEnricher Swear-word]] token enricher initialized by * [[https://raw.githubusercontent.com/apache/incubator-nlpcraft/external_config/external/badfilter/swear_words.txt swear_words.txt]] dictionary. - * - [[NCEnQuotesTokenEnricher Quotes]] token enricher. - * - [[NCEnDictionaryTokenEnricher Known-word]] token enricher. - * - [[NCEnBracketsTokenEnricher Brackets]] token enricher. + * - [[NCQuotesTokenEnricher Quotes]] token enricher. + * - [[NCDictionaryTokenEnricher Known-word]] token enricher initialized by "moby/354984si.ngl" dictionary, + * look more about [[https://en.wikipedia.org/wiki/Moby_Project Moby Project]]. + * - [[NCBracketsTokenEnricher Brackets]] token enricher. + * + * Also there is used [[https://en.wikipedia.org/wiki/Stemming Porter stemmer]] implementation of [[NCStemmer]], + * based on [[https://opennlp.apache.org/ OpenNLP]] solution. * * @param lang ISO 639-1 language code. Currently, only "en" (English) is supported. * @param macros Macros to use with [[NCSemanticEntityParser]]. @@ -276,11 +284,15 @@ class NCPipelineBuilder: * [[https://raw.githubusercontent.com/richardwilly98/elasticsearch-opennlp-auto-tagging/master/src/main/resources/models/en-lemmatizer.dict en-lemmatizer.dict]] model for * [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/lemmatizer/DictionaryLemmatizer.html DictionaryLemmatizer]]. * - [[NCEnStopWordsTokenEnricher Stop-word]] token enricher. - * - [[NCEnSwearWordsTokenEnricher Swear-word]] token enricher initialized by + * - [[NCSwearWordsTokenEnricher Swear-word]] token enricher initialized by * [[https://raw.githubusercontent.com/apache/incubator-nlpcraft/external_config/external/badfilter/swear_words.txt swear_words.txt]] dictionary. - * - [[NCEnQuotesTokenEnricher Quotes]] token enricher. - * - [[NCEnDictionaryTokenEnricher Known-word]] token enricher. - * - [[NCEnBracketsTokenEnricher Brackets]] token enricher. + * - [[NCQuotesTokenEnricher Quotes]] token enricher. + * - [[NCDictionaryTokenEnricher Known-word]] token enricher initialized by "moby/354984si.ngl" dictionary, + * look more about [[https://en.wikipedia.org/wiki/Moby_Project Moby Project]]. + * - [[NCBracketsTokenEnricher Brackets]] token enricher. + * + * Also there is used [[https://en.wikipedia.org/wiki/Stemming Porter stemmer]] implementation of [[NCStemmer]], + * based on [[https://opennlp.apache.org/ OpenNLP]] solution. * * @param lang ISO 639-1 language code. Currently, only "en" (English) is supported. * @param elms Semantic elements to use with [[NCSemanticEntityParser]]. @@ -301,13 +313,17 @@ class NCPipelineBuilder: * [[https://raw.githubusercontent.com/richardwilly98/elasticsearch-opennlp-auto-tagging/master/src/main/resources/models/en-lemmatizer.dict en-lemmatizer.dict]] model for * [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/lemmatizer/DictionaryLemmatizer.html DictionaryLemmatizer]]. * - [[NCEnStopWordsTokenEnricher Stop-word]] token enricher. - * - [[NCEnSwearWordsTokenEnricher Swear-word]] token enricher initialized by + * - [[NCSwearWordsTokenEnricher Swear-word]] token enricher initialized by * [[https://raw.githubusercontent.com/apache/incubator-nlpcraft/external_config/external/badfilter/swear_words.txt swear_words.txt]] dictionary. - * - [[NCEnQuotesTokenEnricher Quotes]] token enricher. - * - [[NCEnDictionaryTokenEnricher Known-word]] token enricher. - * - [[NCEnBracketsTokenEnricher Brackets]] token enricher. + * - [[NCQuotesTokenEnricher Quotes]] token enricher. + * - [[NCDictionaryTokenEnricher Known-word]] token enricher initialized by "moby/354984si.ngl" dictionary, + * look more about [[https://en.wikipedia.org/wiki/Moby_Project Moby Project]]. + * - [[NCBracketsTokenEnricher Brackets]] token enricher. * - * @param lang ISO 639-1 language code. Currently, only "en" (English) is supported. + * Also there is used [[https://en.wikipedia.org/wiki/Stemming Porter stemmer]] implementation of [[NCStemmer]], + * based on [[https://opennlp.apache.org/ OpenNLP]] solution. + * + * @param lang ISO 639-1 language code. Currently, only "en" (English) is supported. * @param mdlSrc Classpath resource, file path or URL for YAML or JSON semantic model definition file. */ def withSemantic(lang: String, mdlSrc: String): NCPipelineBuilder = diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticStemmer.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/common/NCStemmer.scala similarity index 79% rename from nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticStemmer.scala rename to nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/common/NCStemmer.scala index 27490eda..b68d1986 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticStemmer.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/common/NCStemmer.scala @@ -15,19 +15,18 @@ * limitations under the License. */ -package org.apache.nlpcraft.nlp.parsers +package org.apache.nlpcraft.nlp.common + +import org.apache.nlpcraft.nlp.parsers.* /** * * `Stemmer` trait. Stems are used for finding words by their reduced form. + * `Stemmer` trait implementation depends on language. * Read more about stemming [[https://en.wikipedia.org/wiki/Stemming here]]. * - * See detailed description on the website [[https://nlpcraft.apache.org/built-in-entity-parser.html#parser-semantic Semantic Parser]]. - * - * @see [[NCSemanticEntityParser]] - * @see [[NCSemanticElement]] */ -trait NCSemanticStemmer: +trait NCStemmer: /** * Gets text's stem. * diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnBracketsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala similarity index 94% rename from nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnBracketsTokenEnricher.scala rename to nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala index 29e562e7..cf3563c5 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnBracketsTokenEnricher.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala @@ -24,7 +24,7 @@ import java.io.* import scala.collection.mutable /** - * Brackets [[NCTokenEnricher enricher]] for English language. + * Brackets [[NCTokenEnricher enricher]]. * * This enricher adds `brackets` boolean [[NCPropertyMap metadata]] property to the [[NCToken token]] * instance if the word it represents is enclosed in brackets. Supported brackets are: `()`, `{}`, @@ -33,7 +33,7 @@ import scala.collection.mutable * **NOTE:** invalid enclosed brackets are ignored. */ //noinspection DuplicatedCode,ScalaWeakerAccess -class NCEnBracketsTokenEnricher extends NCTokenEnricher with LazyLogging: +class NCBracketsTokenEnricher extends NCTokenEnricher with LazyLogging: override def enrich(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): Unit = val stack = new java.util.Stack[String]() val map = mutable.HashMap.empty[NCToken, Boolean] diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnDictionaryTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala similarity index 77% rename from nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnDictionaryTokenEnricher.scala rename to nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala index 67615aa1..241adf0c 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnDictionaryTokenEnricher.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala @@ -21,26 +21,26 @@ import org.apache.nlpcraft.* import org.apache.nlpcraft.internal.util.NCUtils /** - * "Known-word" [[NCTokenEnricher enricher]] for English language. + * "Known-word" [[NCTokenEnricher enricher]]. * * This enricher adds `dict` boolean [[NCPropertyMap metadata]] property to the [[NCToken token]] - * instance if word it represents is a known English word, i.e. the English dictionary contains this word's + * instance if word it represents is a known dictionary word, i.e. the configured dictionary contains this word's * lemma. The value `true` of the metadata property indicates that this word's lemma is found in the dictionary, * `false` value indicates otherwise. * - * Implementation uses the [[https://en.wikipedia.org/wiki/Moby_Project Moby Project]] English dictionary. - * * **NOTE:** this implementation requires `lemma` string [[NCPropertyMap metadata]] property that contains - * token's lemma. You can configure [[NCOpenNLPTokenEnricher]] that provides this metadata property before + * token's lemma. You can configure [[NCOpenNLPTokenEnricher]] for required language that provides this metadata property before * this enricher in your [[NCPipeline pipeline]]. + * + * @param dictRes Path to the dictionary. This dictionary should has a simple plain text format with one dictionary word on one line. */ //noinspection DuplicatedCode,ScalaWeakerAccess -class NCEnDictionaryTokenEnricher extends NCTokenEnricher: +class NCDictionaryTokenEnricher(dictRes: String) extends NCTokenEnricher: private var dict: Set[String] = _ init() - private def init(): Unit = dict = NCUtils.readResource("moby/354984si.ngl", "iso-8859-1").toSet + private def init(): Unit = dict = NCUtils.readResource(dictRes, "UTF-8").toSet private def getLemma(t: NCToken): String = t.get("lemma").getOrElse(throw new NCException("Lemma not found in token.")) override def enrich(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): Unit = diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala index f0ffb1a7..b5b0c762 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala @@ -172,7 +172,7 @@ import org.apache.nlpcraft.nlp.enrichers.NCEnStopWordsTokenEnricher.* * Look more about stop-words [[https://en.wikipedia.org/wiki/Stop_word here]]. * * **NOTE:** this implementation requires `lemma` and `pos` string [[NCPropertyMap metadata]] properties that contains - * token's lemma and part of speech. You can configure [[NCOpenNLPTokenEnricher]] that provides this metadata property before + * token's lemma and part of speech. You can configure [[NCOpenNLPTokenEnricher]] for English language that provides this metadata property before * this enricher in your [[NCPipeline pipeline]]. * * @param addStopsSet User defined collection of additional stop-words. diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala index af8d6f10..7ba30164 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala @@ -39,10 +39,10 @@ import scala.concurrent.ExecutionContext * * Some of OpenNLP prepared models can be found [[https://opennlp.sourceforge.net/models-1.5/ here]]. * - * @param posMdlSrc Path to [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/postag/POSTaggerME.html POSTaggerME]] model. - * @param lemmaDicSrc Path to [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/lemmatizer/DictionaryLemmatizer.html DictionaryLemmatizer]] model. + * @param posMdlRes Path to [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/postag/POSTaggerME.html POSTaggerME]] model. + * @param lemmaDicRes Path to [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/lemmatizer/DictionaryLemmatizer.html DictionaryLemmatizer]] model. */ -class NCOpenNLPTokenEnricher(posMdlSrc: String = null, lemmaDicSrc: String = null) extends NCTokenEnricher with LazyLogging: +class NCOpenNLPTokenEnricher(posMdlRes: String = null, lemmaDicRes: String = null) extends NCTokenEnricher with LazyLogging: private var tagger: POSTaggerME = _ private var lemmatizer: DictionaryLemmatizer = _ @@ -52,15 +52,15 @@ class NCOpenNLPTokenEnricher(posMdlSrc: String = null, lemmaDicSrc: String = nul NCUtils.execPar( Seq( () => { - if posMdlSrc != null then - tagger = new POSTaggerME(new POSModel(NCUtils.getStream(posMdlSrc))) - logger.trace(s"Loaded resource: $posMdlSrc") + if posMdlRes != null then + tagger = new POSTaggerME(new POSModel(NCUtils.getStream(posMdlRes))) + logger.trace(s"Loaded resource: $posMdlRes") else logger.warn("POS tagger is not configured.") }, () => { - if lemmaDicSrc != null then - lemmatizer = new DictionaryLemmatizer(NCUtils.getStream(lemmaDicSrc)) - logger.trace(s"Loaded resource: $lemmaDicSrc") + if lemmaDicRes != null then + lemmatizer = new DictionaryLemmatizer(NCUtils.getStream(lemmaDicRes)) + logger.trace(s"Loaded resource: $lemmaDicRes") else logger.warn("Lemmatizer is not configured.") } ) diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnQuotesTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala similarity index 92% rename from nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnQuotesTokenEnricher.scala rename to nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala index ea9bd28a..6f82ca76 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnQuotesTokenEnricher.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala @@ -21,18 +21,18 @@ import com.typesafe.scalalogging.LazyLogging import org.apache.nlpcraft.* /** - * Quotes [[NCTokenEnricher enricher]] for English language. + * Quotes [[NCTokenEnricher enricher]]. * * This enricher adds `quoted` boolean [[NCPropertyMap metadata]] property to the [[NCToken token]] * instance if word it represents is in quotes. The value `true` of the metadata property indicates that this word is in quotes, * `false` value indicates otherwise. * * **NOTE:** this implementation requires `lemma` string [[NCPropertyMap metadata]] property that contains - * token's lemma. You can configure [[NCOpenNLPTokenEnricher]] that provides this metadata property before + * token's lemma. You can configure [[NCOpenNLPTokenEnricher]] for required language that provides this metadata property before * this enricher in your [[NCPipeline pipeline]]. */ //noinspection ScalaWeakerAccess -class NCEnQuotesTokenEnricher extends NCTokenEnricher with LazyLogging: +class NCQuotesTokenEnricher extends NCTokenEnricher with LazyLogging: private final val Q_POS: Set[String] = Set("``", "''") private def getPos(t: NCToken): String = t.get("pos").getOrElse(throw new NCException("POS not found in token.")) private def isQuote(t: NCToken): Boolean = Q_POS.contains(getPos(t)) diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnSwearWordsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala similarity index 71% rename from nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnSwearWordsTokenEnricher.scala rename to nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala index c4fa7d8b..f0d282c7 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnSwearWordsTokenEnricher.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala @@ -18,15 +18,15 @@ package org.apache.nlpcraft.nlp.enrichers import com.typesafe.scalalogging.LazyLogging -import opennlp.tools.stemmer.PorterStemmer import org.apache.nlpcraft.* import org.apache.nlpcraft.internal.util.NCUtils +import org.apache.nlpcraft.nlp.common.NCStemmer import java.io.* import java.util.Objects /** - * "Swear-word" [[NCTokenEnricher enricher]] for English language. + * "Swear-word" [[NCTokenEnricher enricher]]. * * This enricher adds `swear` boolean [[NCPropertyMap metadata]] property to the [[NCToken token]] * instance if word it represents is a swear word dictionary, i.e. the swear dictionary contains this word's @@ -34,22 +34,23 @@ import java.util.Objects * `false` value indicates otherwise. * * Read more about stemming [[https://en.wikipedia.org/wiki/Stemming here]]. + * Stemming is used here because it is too difficult to be based on more accurate `lemma` approach for swear words. * - * @param res Path to English swear dictionary. English swear dictionary has simple plain text format with one word on one line. + * @param dictRes Path to the swear dictionary. This swear dictionary should has a simple plain text format with one dictionary word on one line. + * @param stemmer Stemmer implementation for the dictionary language. */ //noinspection ScalaWeakerAccess -class NCEnSwearWordsTokenEnricher(res: String) extends NCTokenEnricher with LazyLogging: - require(res != null, "Swear words model file cannot be null.") +class NCSwearWordsTokenEnricher(dictRes: String, stemmer: NCStemmer) extends NCTokenEnricher with LazyLogging: + require(dictRes != null, "Swear words model file cannot be null.") - private final val stemmer = new PorterStemmer private var swearWords: Set[String] = _ init() private def init(): Unit = - swearWords = NCUtils.readTextStream(NCUtils.getStream(res), "UTF-8"). + swearWords = NCUtils.readTextStream(NCUtils.getStream(dictRes), "UTF-8"). map(p => stemmer.stem(p.toLowerCase)).toSet - logger.trace(s"Loaded resource: $res") + logger.trace(s"Loaded resource: $dictRes") override def enrich(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): Unit = toks.foreach(t => t.put("swear", swearWords.contains(stemmer.stem(t.getText.toLowerCase)))) diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala index d23d42a0..b84d3c18 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala @@ -26,22 +26,23 @@ import java.util.stream.Collectors * [[NCNLPEntityParser]] helper. */ object NCNLPEntityParser: - private val id: String = "nlp:token" + private val id: String = "nlp:entity" import org.apache.nlpcraft.nlp.parsers.NCNLPEntityParser.* /** * NLP data [[NCEntityParser parser]]. * - * This parser converts list of input [[NCToken]] instances to list of [[NCEntity]] instances with ID `nlp:token`. + * This parser converts list of input [[NCToken]] instances to list of [[NCEntity]] instances with ID `nlp:entity`. * All [[NCEntity]] instances contain following mandatory [[NCPropertyMap metadata]] properties: - * - nlp:token:text - * - nlp:token:index - * - nlp:token:startCharIndex - * - nlp:token:endCharIndex + * - nlp:entity:text + * - nlp:entity:index + * - nlp:entity:startCharIndex + * - nlp:entity:endCharIndex * * Also created [[NCEntity]] instances receive all another [[NCPropertyMap metadata]] properties * which were added by configured in [[NCPipeline pipeline]] token [[org.apache.nlpcraft.NCTokenEnricher enrichers]]. + * These properties identifiers will be prefixed by `nlp:entity:`. * * @param predicate Predicate which allows to filter list of converted [[NCToken]] instances. * By default all [[NCToken]] instances converted. diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala index e40e8ff2..7613e237 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala @@ -39,12 +39,12 @@ object NCOpenNLPEntityParser: /** * Creates [[NCOpenNLPEntityParser]] instance. * - * @param src Path to [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/namefind/TokenNameFinderModel.html model]]. + * @param mdl Path to [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/namefind/TokenNameFinderModel.html model]]. * @return [[NCOpenNLPEntityParser]] instance. */ - def apply(src: String): NCOpenNLPEntityParser = - require(src != null, "Model source cannot be null.") - new NCOpenNLPEntityParser(List(src)) + def apply(mdl: String): NCOpenNLPEntityParser = + require(mdl != null, "Model source cannot be null.") + new NCOpenNLPEntityParser(List(mdl)) /** * [[https://opennlp.apache.org/ OpenNLP]] based language independent [[NCEntityParser parser]] configured by @@ -59,10 +59,10 @@ object NCOpenNLPEntityParser: * * **NOTE:** that each input [[NCToken]] can be included into several output [[NCEntity]] instances. * - * @param srcs Paths to [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/namefind/TokenNameFinderModel.html models]]. + * @param findersMdlsRes Paths to [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/namefind/TokenNameFinderModel.html models]]. */ -class NCOpenNLPEntityParser(srcs: List[String]) extends NCEntityParser with LazyLogging: - require(srcs != null, "Models source cannot be null.") +class NCOpenNLPEntityParser(findersMdlsRes: List[String]) extends NCEntityParser with LazyLogging: + require(findersMdlsRes != null, "Models sources cannot be null.") private var finders: Seq[NameFinderME] = _ private case class Holder(start: Int, end: Int, name: String, probability: Double) @@ -74,7 +74,7 @@ class NCOpenNLPEntityParser(srcs: List[String]) extends NCEntityParser with Lazy private def init(): Unit = val finders = mutable.ArrayBuffer.empty[NameFinderME] NCUtils.execPar( - srcs.map(res => () => { + findersMdlsRes.map(res => () => { val f = new NameFinderME(new TokenNameFinderModel(NCUtils.getStream(res))) logger.trace(s"Loaded resource: $res") finders.synchronized { finders += f } diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParser.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParser.scala index dbc6657e..82c4b120 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParser.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParser.scala @@ -32,19 +32,19 @@ import java.util.Objects * * Some of OpenNLP prepared models can be found [[https://opennlp.sourceforge.net/models-1.5/ here]]. * - * @param tokMdl Path to [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/tokenize/TokenizerModel.html model]]. + * @param tokMdlRes Path to [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/tokenize/TokenizerModel.html model]]. */ -class NCOpenNLPTokenParser(tokMdl: String) extends NCTokenParser with LazyLogging: - require(tokMdl != null, "Tokenizer model path cannot be null.") +class NCOpenNLPTokenParser(tokMdlRes: String) extends NCTokenParser with LazyLogging: + require(tokMdlRes != null, "Tokenizer model path cannot be null.") @volatile private var tokenizer: TokenizerME = _ init() private def init(): Unit = - tokenizer = new TokenizerME(new TokenizerModel(NCUtils.getStream(tokMdl))) + tokenizer = new TokenizerME(new TokenizerModel(NCUtils.getStream(tokMdlRes))) - logger.trace(s"Loaded resource: $tokMdl") + logger.trace(s"Loaded resource: $tokMdlRes") override def tokenize(text: String): List[NCToken] = this.synchronized { diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticElement.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticElement.scala index b9768e59..e8d43aa1 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticElement.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticElement.scala @@ -17,6 +17,8 @@ package org.apache.nlpcraft.nlp.parsers +import org.apache.nlpcraft.nlp.common.NCStemmer + /** * * Configuration element which helps to detect [[org.apache.nlpcraft.NCEntity NCEntity]] for @@ -25,7 +27,6 @@ package org.apache.nlpcraft.nlp.parsers * See detailed description on the website [[https://nlpcraft.apache.org/built-in-entity-parser.html#parser-semantic Semantic Parser]]. * * @see [[NCSemanticEntityParser]] - * @see [[NCSemanticStemmer]] */ trait NCSemanticElement: /** diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala index 3942584e..e96a257e 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala @@ -21,6 +21,7 @@ import com.typesafe.scalalogging.LazyLogging import org.apache.nlpcraft.* import org.apache.nlpcraft.internal.makro.NCMacroParser import org.apache.nlpcraft.internal.util.NCUtils +import org.apache.nlpcraft.nlp.common.NCStemmer import org.apache.nlpcraft.nlp.parsers.* import org.apache.nlpcraft.nlp.parsers.impl.* @@ -38,13 +39,13 @@ object NCSemanticEntityParser: /** * Creates [[NCSemanticEntityParser]] instance. * - * @param stemmer [[NCSemanticStemmer]] implementation. - * @param parser [[NCTokenParser]] implementation. - * @param macros Macros map. Empty by default. + * @param stemmer [[NCStemmer]] implementation for synonyms language. + * @param parser [[NCTokenParser]] implementation. + * @param macros Macros map. Empty by default. * @param elements [[NCSemanticElement]] list. */ def apply( - stemmer: NCSemanticStemmer, + stemmer: NCStemmer, parser: NCTokenParser, macros: Map[String, String], elements: List[NCSemanticElement] @@ -60,12 +61,12 @@ object NCSemanticEntityParser: * * Creates [[NCSemanticEntityParser]] instance. * - * @param stemmer [[NCSemanticStemmer]] implementation. + * @param stemmer [[NCStemmer]] implementation for synonyms language. * @param parser [[NCTokenParser]] implementation. * @param elements [[NCSemanticElement]] list. */ def apply( - stemmer: NCSemanticStemmer, + stemmer: NCStemmer, parser: NCTokenParser, elements: List[NCSemanticElement] ): NCSemanticEntityParser = @@ -79,11 +80,11 @@ object NCSemanticEntityParser: * * Creates [[NCSemanticEntityParser]] instance. * - * @param stemmer [[NCSemanticStemmer]] implementation. - * @param parser [[NCTokenParser]] implementation. - * @param mdlSrc Classpath resource, file path or URL for YAML or JSON semantic model definition file. + * @param stemmer [[NCStemmer]] implementation for synonyms language. + * @param parser [[NCTokenParser]] implementation. + * @param mdlSrc Classpath resource, file path or URL for YAML or JSON semantic model definition file. */ - def apply(stemmer: NCSemanticStemmer, parser: NCTokenParser, mdlSrc: String): NCSemanticEntityParser = + def apply(stemmer: NCStemmer, parser: NCTokenParser, mdlSrc: String): NCSemanticEntityParser = require(stemmer != null, "Stemmer cannot be null.") require(parser != null, "Parser cannot be null.") require(mdlSrc != null, "Model source cannot be null.") @@ -181,18 +182,15 @@ import org.apache.nlpcraft.nlp.parsers.NCSemanticEntityParser.* * * See detailed description on the website [[https://nlpcraft.apache.org/built-in-entity-parser.html#parser-semantic Semantic Parser]]. * - * * @see [[NCSemanticElement]] - * @see [[NCSemanticStemmer]] - * - * @param stemmer [[NCSemanticStemmer]] implementation. - * @param parser [[NCTokenParser]] implementation. - * @param macros Macros map. Empty by default. + * @param stemmer [[NCStemmer]] implementation for synonyms language. + * @param parser [[NCTokenParser]] implementation. + * @param macros Macros map. Empty by default. * @param elements [[NCSemanticElement]] list. * @param mdlSrcOpt Optional classpath resource, file path or URL for YAML or JSON semantic model definition file. */ class NCSemanticEntityParser( - stemmer: NCSemanticStemmer, + stemmer: NCStemmer, parser: NCTokenParser, macros: Map[String, String] = Map.empty, elements: List[NCSemanticElement] = List.empty, diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/impl/NCSemanticSynonymsProcessor.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/impl/NCSemanticSynonymsProcessor.scala index 7c3992e4..e5c0b09d 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/impl/NCSemanticSynonymsProcessor.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/impl/NCSemanticSynonymsProcessor.scala @@ -24,6 +24,7 @@ import com.typesafe.scalalogging.LazyLogging import org.apache.nlpcraft.* import org.apache.nlpcraft.internal.makro.NCMacroParser import org.apache.nlpcraft.internal.util.NCUtils +import org.apache.nlpcraft.nlp.common.NCStemmer import org.apache.nlpcraft.nlp.parsers.* import org.apache.nlpcraft.nlp.parsers.impl.NCSemanticChunkKind.* @@ -144,7 +145,7 @@ private[parsers] object NCSemanticSynonymsProcessor extends LazyLogging: * @param syns */ private def convertSynonyms( - stemmer: NCSemanticStemmer, + stemmer: NCStemmer, tokParser: NCTokenParser, macroParser: NCMacroParser, elemId: String, @@ -205,7 +206,7 @@ private[parsers] object NCSemanticSynonymsProcessor extends LazyLogging: * @param elements */ def prepare( - stemmer: NCSemanticStemmer, + stemmer: NCStemmer, tokParser: NCTokenParser, macros: Map[String, String], elements: Seq[NCSemanticElement] diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/NCTokenEnricherSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/NCTokenEnricherSpec.scala index 4712b55e..2562e317 100644 --- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/NCTokenEnricherSpec.scala +++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/NCTokenEnricherSpec.scala @@ -31,7 +31,7 @@ import scala.util.Using class NCTokenEnricherSpec extends AnyFunSuite: private def test0(pipeline: NCPipeline, ok: Boolean): Unit = val mdl: NCModel = new NCModel(NCModelConfig("test.id", "Test model", "1.0"), pipeline): - @NCIntent("intent=i term(any)={meta_ent('nlp:token:k1') == 'v1'}") + @NCIntent("intent=i term(any)={meta_ent('nlp:entity:k1') == 'v1'}") def onMatch(ctx: NCContext, im: NCIntentMatch): NCResult = TEST_RESULT NCTestUtils.askSomething(mdl, ok) diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricherSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricherSpec.scala index 480edd24..6739a703 100644 --- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricherSpec.scala +++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricherSpec.scala @@ -18,7 +18,7 @@ package org.apache.nlpcraft.nlp.enrichers import org.apache.nlpcraft.* -import nlp.enrichers.NCEnBracketsTokenEnricher +import nlp.enrichers.NCBracketsTokenEnricher import nlp.util.* import org.scalatest.funsuite.AnyFunSuite @@ -26,7 +26,7 @@ import org.scalatest.funsuite.AnyFunSuite * */ class NCBracketsTokenEnricherSpec extends AnyFunSuite: - private val bracketsEnricher = new NCEnBracketsTokenEnricher() + private val bracketsEnricher = new NCBracketsTokenEnricher() /** * diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricherSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricherSpec.scala index f6f945b7..537ec5cb 100644 --- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricherSpec.scala +++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricherSpec.scala @@ -25,14 +25,14 @@ import internal.util.NCResourceReader import org.scalatest.funsuite.AnyFunSuite class NCDictionaryTokenEnricherSpec extends AnyFunSuite: - private val dictEnricher = new NCEnDictionaryTokenEnricher() + private val dictEnricher = new NCDictionaryTokenEnricher("moby/354984si.ngl") test("test") { val txt = "milk XYZ" val toks = EN_TOK_PARSER.tokenize(txt) - require(toks.head.get[Boolean]("dict:en").isEmpty) - require(toks.last.get[Boolean]("dict:en").isEmpty) + require(toks.head.get[Boolean]("dict").isEmpty) + require(toks.last.get[Boolean]("dict").isEmpty) val req = NCTestRequest(txt) diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricherSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricherSpec.scala index 3f87f757..ee3ad403 100644 --- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricherSpec.scala +++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricherSpec.scala @@ -28,7 +28,7 @@ import org.scalatest.funsuite.AnyFunSuite * */ class NCQuotesTokenEnricherSpec extends AnyFunSuite: - private val quoteEnricher = new NCEnQuotesTokenEnricher + private val quoteEnricher = new NCQuotesTokenEnricher /** * diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricherSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricherSpec.scala index 86303dea..fcea197c 100644 --- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricherSpec.scala +++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricherSpec.scala @@ -17,8 +17,10 @@ package org.apache.nlpcraft.nlp.enrichers +import opennlp.tools.stemmer.PorterStemmer import org.apache.nlpcraft.internal.util.NCResourceReader -import org.apache.nlpcraft.nlp.enrichers.NCEnSwearWordsTokenEnricher +import org.apache.nlpcraft.nlp.common.NCStemmer +import org.apache.nlpcraft.nlp.enrichers.NCSwearWordsTokenEnricher import org.apache.nlpcraft.nlp.enrichers.* import org.apache.nlpcraft.nlp.util.* import org.scalatest.funsuite.AnyFunSuite @@ -26,13 +28,18 @@ import org.scalatest.funsuite.AnyFunSuite * */ class NCSwearWordsTokenEnricherSpec extends AnyFunSuite: - private val swEnricher = new NCEnSwearWordsTokenEnricher(NCResourceReader.getPath("badfilter/swear_words.txt")) + private val swEnricher = new NCSwearWordsTokenEnricher( + NCResourceReader.getPath("badfilter/swear_words.txt"), + new NCStemmer: + final private val ps: PorterStemmer = new PorterStemmer + override def stem(txt: String): String = ps.stem(txt) + ) test("test") { val toks = EN_TOK_PARSER.tokenize("english ass") - require(toks.head.get[Boolean]("swear:en").isEmpty) - require(toks.last.get[Boolean]("swear:en").isEmpty) + require(toks.head.get[Boolean]("swear").isEmpty) + require(toks.last.get[Boolean]("swear").isEmpty) swEnricher.enrich(null, null, toks) diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParserLemmaSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParserLemmaSpec.scala index 299a8fdf..cb134ef3 100644 --- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParserLemmaSpec.scala +++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParserLemmaSpec.scala @@ -22,6 +22,7 @@ import annotations.* import nlp.parsers.* import internal.impl.* import nlp.util.* +import org.apache.nlpcraft.nlp.common.NCStemmer import org.scalatest.funsuite.AnyFunSuite import java.util @@ -32,7 +33,7 @@ import scala.collection.mutable */ class NCSemanticEntityParserLemmaSpec extends AnyFunSuite: private val lemmaStemmer = - new NCSemanticStemmer(): + new NCStemmer(): override def stem(txt: String): String = if wrapped(txt) then unwrap(txt) else UUID.randomUUID().toString case class Data(text: String, elemId: String) diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala index fd1e0b07..a23b0f89 100644 --- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala +++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala @@ -21,9 +21,10 @@ import opennlp.tools.stemmer.PorterStemmer import org.apache.nlpcraft.* import org.apache.nlpcraft.internal.ascii.NCAsciiTable import org.apache.nlpcraft.internal.util.NCResourceReader +import org.apache.nlpcraft.nlp.common.NCStemmer import org.apache.nlpcraft.nlp.parsers.* import org.apache.nlpcraft.nlp.parsers -import org.apache.nlpcraft.nlp.parsers.{NCOpenNLPTokenParser, NCSemanticElement, NCSemanticEntityParser, NCSemanticStemmer} +import org.apache.nlpcraft.nlp.parsers.{NCOpenNLPTokenParser, NCSemanticElement, NCSemanticEntityParser} import java.util import scala.util.Using @@ -122,8 +123,8 @@ object NCTestUtils: /** * */ - private def mkSemanticStemmer: NCSemanticStemmer = - new NCSemanticStemmer(): + private def mkSemanticStemmer: NCStemmer = + new NCStemmer(): private val ps = new PorterStemmer override def stem(txt: String): String = ps.synchronized { ps.stem(txt) }
