This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-287
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 46d7f4cb05fca36ab25bfa9ef68f65320c62c345
Author: Sergey Kamov <[email protected]>
AuthorDate: Tue Apr 6 15:30:52 2021 +0300

    WIP.
---
 .../mgrs/nlp/enrichers/model/NCModelEnricher.scala |  58 +++---
 .../mgrs/nlp/enrichers/sort/NCSortEnricher.scala   |   5 +-
 .../probe/mgrs/sentence/NCSentenceManager.scala    |   1 +
 .../nlp/enrichers/sort/NCEnricherSortSpec.scala    | 199 ++++++++++++++++++++-
 4 files changed, 229 insertions(+), 34 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 4464a1b..c766237 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -267,12 +267,11 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
     private def combos[T](toks: Seq[T]): Seq[Seq[T]] =
         (for (n ← toks.size until 0 by -1) yield toks.sliding(n)).flatten.map(p ⇒ p)
 
-    // TODO:
-//    /**
-//      *
-//      * @param toks
-//      * @param elemId
-//      */
+    /**
+      *
+      * @param toks
+      * @param elemId
+      */
     private def alreadyMarked(toks: Seq[NlpToken], elemId: String): Boolean = toks.forall(_.isTypeOf(elemId))
 
     /**
@@ -296,9 +295,17 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
             case (complex, kind) ⇒ if (complex.isLeft) Some(complex.left.get → kind) else None
         }
 
+    /**
+      *
+      */
     private def mkCache(): Cache =
         mutable.HashMap.empty[String, mutable.ArrayBuffer[Seq[Int]]].withDefault(_ ⇒ mutable.ArrayBuffer.empty[Seq[Int]])
 
+    /**
+      *
+      * @param tows
+      * @param ns
+      */
     private def toNlpTokens(tows: Seq[NCDslContent], ns: NCNlpSentence): Seq[NlpToken] =
         (
             tows.filter(_.isRight).map(_.right.get) ++
@@ -306,6 +313,23 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                 flatMap(w ⇒ ns.filter(t ⇒ t.wordIndexes.intersect(w.wordIndexes).nonEmpty))
         ).sortBy(_.startCharIndex)
 
+    /**
+      *
+      * @param toks
+      */
+    private def tokString(toks: Seq[NlpToken]): String = toks.map(t ⇒ (t.origText, t.index)).mkString(" ")
+
+    /**
+      * Gets synonyms sorted in descending order by their weight (already prepared),
+      * i.e. first synonym in the sequence is the most important one.
+      *
+      * @param fastMap {Element ID → {Synonym length → T}}
+      * @param elmId
+      * @param len
+      */
+    private def fastAccess[T](fastMap: Map[String, Map[Int, T]], elmId: String, len: Int): Option[T] =
+        fastMap.getOrElse(elmId, Map.empty[Int, T]).get(len)
+
     @throws[NCE]
     override def enrich(mdl: NCProbeModel, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = {
         require(isStarted)
@@ -325,31 +349,10 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
 
         def addMatch(elm: NCElement, toks: Seq[NlpToken], syn: Synonym, parts: Seq[TokType]): Unit = {
             val toksSet = toks.toSet
-            // TODO:
-            //require(!matches.exists(m ⇒ m.element.getId == elm.getId && toksSet.subsetOf(m.tokensSet)))
-
             if (!matches.exists(m ⇒ m.element.getId == elm.getId && toksSet.subsetOf(m.tokensSet)))
                 matches += ElementMatch(elm, toks, syn, parts)
         }
 
-        /**
-          * Gets synonyms sorted in descending order by their weight (already prepared),
-          * i.e. first synonym in the sequence is the most important one.
-          *
-          * @param fastMap {Element ID → {Synonym length → T}}
-          * @param elmId
-          * @param len
-          */
-        def fastAccess[T](fastMap: Map[String, Map[Int, T]], elmId: String, len: Int): Option[T] =
-            fastMap.getOrElse(elmId, Map.empty[Int, T]).get(len)
-
-        /**
-          *
-          * @param toks
-          * @return
-          */
-        def tokString(toks: Seq[NlpToken]): String = toks.map(t ⇒ (t.origText, t.index)).mkString(" ")
-
         lazy val complexesWords = ns.map(Complex(_))
         lazy val complexes: Seq[ComplexSeq] =
             NCProbeVariants.convert(ns.srvReqId, mdl, NCSentenceManager.collapse(mdl.model, ns.clone())).
@@ -606,7 +609,6 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                             ).getOrElse(throw new AssertionError(s"Custom model parser returned an invalid custom token: $w"))
                         )
 
-                    // TODO:
                     if (!alreadyMarked(matchedToks, elemId))
                         mark(
                             ns,
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index d3853da..95c123e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -177,7 +177,7 @@ object NCSortEnricher extends NCProbeEnricher {
             toks.flatten.
                 filter(!_.isNlp).
                 filter(n ⇒ n.tokenIndexes.head >= min && n.tokenIndexes.last <= max).
-                map(n ⇒ NoteData(n.noteType, n.tokenFrom to n.tokenTo)).
+                map(n ⇒ NoteData(n.noteType, n.tokenIndexes)).
                 sortBy(_.indexes.head).distinct
         }
 
@@ -324,8 +324,7 @@ object NCSortEnricher extends NCProbeEnricher {
                 forall(p ⇒ (p.isStopWord || p.stem == stemAnd) && !maskWords.contains(p.stem))
             ) {
                 // It removes duplicates (`SORT x x ORDER x x x` converts to `SORT x ORDER x`)
-                val mask = toks.map(getKeyWordType).
-                    foldLeft("")((x, y) ⇒ if (x.endsWith(y)) x else s"$x $y").trim
+                val mask = toks.map(getKeyWordType).foldLeft("")((x, y) ⇒ if (x.endsWith(y)) x else s"$x $y").trim
 
                 MASKS.get(mask) match {
                     case Some(typ) ⇒
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index fb676d0..7d011a0 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -23,6 +23,7 @@ import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
 import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken}
 import org.apache.nlpcraft.common.{NCE, NCService, U}
 import org.apache.nlpcraft.model.NCModel
+import org.apache.nlpcraft.model.impl.NCTokenLogger
 
 import java.io.{Serializable => JSerializable}
 import java.util
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
index cc03066..8f24288 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
@@ -17,15 +17,31 @@ package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.sort
 
-import org.apache.nlpcraft.NCTestEnvironment
+import org.apache.nlpcraft.{NCTestElement, NCTestEnvironment}
+import org.apache.nlpcraft.model.NCElement
 import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.NCTestSortTokenType._
-import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.{NCDefaultTestModel, NCEnricherBaseSpec, NCTestNlpToken ⇒ nlp, NCTestSortToken ⇒ srt, NCTestUserToken ⇒ usr}
+import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.{NCDefaultTestModel, NCEnricherBaseSpec, NCTestNlpToken => nlp, NCTestSortToken => srt, NCTestUserToken => usr}
 import org.junit.jupiter.api.Test
 
+import java.util
+import scala.collection.JavaConverters._
+
+class NCDefaultSpecTestModel extends NCDefaultTestModel {
+    override def getElements: util.Set[NCElement] = {
+        (
+            super.getElements.asScala ++
+                Set(NCTestElement("wrapperA", "^^{tok_id() == 'A'}^^ ^^{tok_id() == 'A'}^^ ^^{tok_id() == 'A'}^^"))
+        ).asJava
+    }
+
+    override def isPermutateSynonyms: Boolean = true
+    override def isSparse: Boolean = true
+}
+
 /**
   * Sort enricher test.
   */
-@NCTestEnvironment(model = classOf[NCDefaultTestModel], startClient = true)
+@NCTestEnvironment(model = classOf[NCDefaultSpecTestModel], startClient = true)
 class NCEnricherSortSpec extends NCEnricherBaseSpec {
     /**
       *
@@ -204,6 +220,183 @@ class NCEnricherSortSpec extends NCEnricherBaseSpec {
                 nlp(text = ",", isStop = true),
                 usr(text = "B", id = "B"),
                 nlp(text = ", asc", isStop = true)
+            ),
+            _ ⇒ checkExists(
+                "sort A",
+                srt(text = "sort", typ = SUBJ_ONLY, note = "A", index = 1),
+                usr("A", "A")
+            ),
+            _ ⇒ checkExists(
+                "sort A by A",
+                srt(text = "sort", subjNote = "A", subjIndex = 1, byNote = "A", byIndex = 3),
+                usr(text = "A", id = "A"),
+                nlp(text = "by", isStop = true),
+                usr(text = "A", id = "A")
+            ),
+            _ ⇒ checkExists(
+                "sort A, C by A, C",
+                srt(text = "sort", subjNotes = Seq("A", "C"), subjIndexes = Seq(1, 3), byNotes = Seq("A", "C"), byIndexes = Seq(5, 7)),
+                usr(text = "A", id = "A"),
+                nlp(text = ",", isStop = true),
+                usr(text = "C", id = "C"),
+                nlp(text = "by", isStop = true),
+                usr(text = "A", id = "A"),
+                nlp(text = ",", isStop = true),
+                usr(text = "C", id = "C")
+            ),
+            _ ⇒ checkExists(
+                "sort A C by A C",
+                srt(text = "sort", subjNotes = Seq("A", "C"), subjIndexes = Seq(1, 2), byNotes = Seq("A", "C"), byIndexes = Seq(4, 5)),
+                usr(text = "A", id = "A"),
+                usr(text = "C", id = "C"),
+                nlp(text = "by", isStop = true),
+                usr(text = "A", id = "A"),
+                usr(text = "C", id = "C")
+            ),
+            _ ⇒ checkExists(
+                "sort A B by A B",
+                srt(text = "sort", subjNotes = Seq("A", "B"), subjIndexes = Seq(1, 2), byNotes = Seq("A", "B"), byIndexes = Seq(4, 5)),
+                usr(text = "A", id = "A"),
+                usr(text = "B", id = "B"),
+                nlp(text = "by", isStop = true),
+                usr(text = "A", id = "A"),
+                usr(text = "B", id = "B")
+            ),
+            _ ⇒ checkExists(
+                "sort A B by A B",
+                srt(text = "sort", subjNote = "AB", subjIndex = 1, byNote = "AB", byIndex = 3),
+                usr(text = "A B", id = "AB"),
+                nlp(text = "by", isStop = true),
+                usr(text = "A B", id = "AB")
+            ),
+            _ ⇒ checkExists(
+                "A classify",
+                usr(text = "A", id = "A"),
+                srt(text = "classify", typ = SUBJ_ONLY, note = "A", index = 0)
+            ),
+            _ ⇒ checkExists(
+                "the A the classify",
+                nlp(text = "the", isStop = true),
+                usr(text = "A", id = "A"),
+                nlp(text = "the", isStop = true),
+                srt(text = "classify", typ = SUBJ_ONLY, note = "A", index = 1)
+            ),
+            _ ⇒ checkExists(
+                "segment A by top down",
+                srt(text = "segment", typ = SUBJ_ONLY, note = "A", index = 1, asc = false),
+                usr(text = "A", id = "A"),
+                nlp(text = "by top down", isStop = true)
+            ),
+            _ ⇒ checkExists(
+                "segment A in bottom up order",
+                srt(text = "segment", typ = SUBJ_ONLY, note = "A", index = 1, asc = true),
+                usr(text = "A", id = "A"),
+                nlp(text = "in bottom up order", isStop = true)
+            ),
+            // `by` is redundant word here
+            _ ⇒ checkExists(
+                "segment A by in bottom up order",
+                srt(text = "segment", typ = SUBJ_ONLY, note = "A", index = 1),
+                usr(text = "A", id = "A"),
+                nlp(text = "by"),
+                nlp(text = "in"),
+                nlp(text = "bottom"),
+                nlp(text = "up"),
+                nlp(text = "order")
+            ),
+            _ ⇒ checkExists(
+                "the segment the A the in bottom up the order the",
+                nlp(text = "the", isStop = true),
+                srt(text = "segment", typ = SUBJ_ONLY, note = "A", index = 3, asc = true),
+                nlp(text = "the", isStop = true),
+                usr(text = "A", id = "A"),
+                nlp(text = "the in bottom up the order the", isStop = true)
+            ),
+            _ ⇒ checkExists(
+                "the segment the A the by bottom up the order the",
+                nlp(text = "the", isStop = true),
+                srt(text = "segment", typ = SUBJ_ONLY, note = "A", index = 3, asc = true),
+                nlp(text = "the", isStop = true),
+                usr(text = "A", id = "A"),
+                nlp(text = "the by bottom up the order the", isStop = true)
+            ),
+            _ ⇒ checkExists(
+                "A classify",
+                usr(text = "A", id = "A"),
+                srt(text = "classify", typ = SUBJ_ONLY, note = "A", index = 0)
+            ),
+            _ ⇒ checkAll(
+                "A B classify",
+                Seq(
+                    usr(text = "A B", id = "AB"),
+                    srt(text = "classify", typ = SUBJ_ONLY, note = "AB", index = 0)
+                ),
+                Seq(
+                    usr(text = "A", id = "A"),
+                    usr(text = "B", id = "B"),
+                    srt(text = "classify", subjNotes = Seq("A", "B"), subjIndexes = Seq(0, 1))
+                ),
+                Seq(
+                    usr(text = "A", id = "A"),
+                    usr(text = "B", id = "B"),
+                    srt(text = "classify", subjNotes = Seq("B"), subjIndexes = Seq(1))
+                )
+            ),
+            _ ⇒ checkAll(
+                "D classify",
+                Seq(
+                    usr(text = "D", id = "D1"),
+                    srt(text = "classify", typ = SUBJ_ONLY, note = "D1", index = 0)
+                ),
+                Seq(
+                    usr(text = "D", id = "D2"),
+                    srt(text = "classify", typ = SUBJ_ONLY, note = "D2", index = 0)
+                )
+            ),
+            _ ⇒ checkAll(
+                "sort by A",
+                Seq(
+                    srt(text = "sort by", typ = BY_ONLY, note = "A", index = 1),
+                    usr(text = "A", id = "A")
+                )
+            ),
+            _ ⇒ checkExists(
+                "organize by A, B top down",
+                srt(text = "organize by", byNotes = Seq("A", "B"), byIndexes = Seq(1, 3), asc = Some(false)),
+                usr(text = "A", id = "A"),
+                nlp(text = ",", isStop = true),
+                usr(text = "B", id = "B"),
+                nlp(text = "top down", isStop = true)
+            ),
+            _ ⇒ checkExists(
+                "organize by A, B from bottom up order",
+                srt(text = "organize by", byNotes = Seq("A", "B"), byIndexes = Seq(1, 3), asc = Some(true)),
+                usr(text = "A", id = "A"),
+                nlp(text = ",", isStop = true),
+                usr(text = "B", id = "B"),
+                nlp(text = "from bottom up order", isStop = true)
+            ),
+            _ ⇒ checkExists(
+                "organize by A, B the descending",
+                srt(text = "organize by", byNotes = Seq("A", "B"), byIndexes = Seq(1, 3), asc = Some(false)),
+                usr(text = "A", id = "A"),
+                nlp(text = ",", isStop = true),
+                usr(text = "B", id = "B"),
+                nlp(text = "the descending", isStop = true)
+            ),
+            _ ⇒ checkExists(
+                "organize by A, B, asc",
+                srt(text = "organize by", byNotes = Seq("A", "B"), byIndexes = Seq(1, 3), asc = Some(true)),
+                usr(text = "A", id = "A"),
+                nlp(text = ",", isStop = true),
+                usr(text = "B", id = "B"),
+                nlp(text = ", asc", isStop = true)
+            ),
+            _ ⇒ checkExists(
+                "sort A the A the A",
+                srt(text = "sort", typ = SUBJ_ONLY, note = "wrapperA", index = 1),
+                usr("A A A", "wrapperA"),
+                nlp("the the", isStop = true)
             )
         )
 }
