This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-337
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 1f97e87428795476359933b90fa0dd4125587480
Author: Sergey Kamov <[email protected]>
AuthorDate: Thu Jun 24 12:18:55 2021 +0300

    WIP.
---
 .../nlpcraft/probe/mgrs/NCProbeVariants.scala      |  46 +++----
 .../nlpcraft/probe/mgrs/NCTokenPartKey.scala       | 134 +++++++++++++++++++++
 .../mgrs/nlp/enrichers/model/NCModelEnricher.scala |  26 +---
 .../probe/mgrs/sentence/NCSentenceManager.scala    |  30 ++---
 .../abstract/NCAbstractTokensVariantsSpec.scala    |   3 +-
 5 files changed, 167 insertions(+), 72 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
index f3122b3..bcf2c9c 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
@@ -20,8 +20,8 @@ package org.apache.nlpcraft.probe.mgrs
 import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
 import org.apache.nlpcraft.common.nlp.{NCNlpSentence => NlpSentence, NCNlpSentenceNote => NlpNote, NCNlpSentenceToken => NlpToken}
 import org.apache.nlpcraft.common.{NCE, TOK_META_ALIASES_KEY}
+import org.apache.nlpcraft.model.NCVariant
 import org.apache.nlpcraft.model.impl.{NCTokenImpl, NCTokenLogger, NCVariantImpl}
-import org.apache.nlpcraft.model.{NCToken, NCVariant}
 
 import java.io.{Serializable => JSerializable}
 import java.util
@@ -37,18 +37,6 @@ object NCProbeVariants {
     private final val IDXS: JSerializable = singletonList(IDX).asInstanceOf[JSerializable]
     private final val IDXS2: JSerializable = singletonList(singletonList(IDX)).asInstanceOf[JSerializable]
 
-    case class Key(id: String, from: Int, to: Int)
-
-    object Key {
-        def apply(m: util.HashMap[String, JSerializable]): Key = {
-            def get[T](name: String): T = m.get(name).asInstanceOf[T]
-
-            Key(get("id"), get("startcharindex"), get("endcharindex"))
-        }
-
-        def apply(t: NCToken): Key = Key(t.getId, t.getStartCharIndex, t.getEndCharIndex)
-    }
-
     /**
      *
      * @param t
@@ -77,17 +65,17 @@ object NCProbeVariants {
      *
      * @param key
      * @param delNotes
-     * @param noteTypePred
+     * @param delNoteTypePred
      * @return
      */
     private def findDeletedToken(
-        key: Key,
+        key: NCTokenPartKey,
         delNotes: Map[NlpNote, Seq[NlpToken]],
-        noteTypePred: String => Boolean
+        delNoteTypePred: NlpNote => Boolean
     ): Option[NlpToken] =
         delNotes.to(LazyList).
             flatMap { case (delNote, delNoteToks) =>
-                if (noteTypePred(delNote.noteType)) {
+                if (delNoteTypePred(delNote)) {
                     val toks =
                         delNoteToks.
                             dropWhile(_.startCharIndex != key.from).
@@ -111,7 +99,7 @@ object NCProbeVariants {
                             case _ => // No-op.
                         }
 
-                        artTok.add(delNote.clone(ps.toSeq :_*))
+                        artTok.add(delNote.clone(ps.toSeq: _*))
                     }
 
                     Some(artTok)
@@ -200,18 +188,18 @@ object NCProbeVariants {
         }
 
         val toks = nlpSen.map(mkToken)
-        val keys2Toks = toks.map(t => Key(t) -> t).toMap
+        val keys2Toks = toks.map(t => NCTokenPartKey(t) -> t).toMap
 
         def process(tok: NCTokenImpl, tokNlp: NlpToken): Unit = {
-            val optList: Option[util.List[util.HashMap[String, JSerializable]]] =
+            val optList: Option[util.List[NCTokenPartKey]] =
                 tokNlp.find(_.isUser) match {
                     case Some(u) => u.dataOpt("parts")
                     case None => None
                 }
 
             optList match {
-                case Some(list) =>
-                    val keys = list.asScala.map(Key(_))
+                case Some(keysJava) =>
+                    val keys = keysJava.asScala
 
                     val parts = keys.map(key =>
                         keys2Toks.get(key) match {
@@ -221,7 +209,11 @@ object NCProbeVariants {
                                 val delNotes = nlpSen.getDeletedNotes
 
                                 // Tries to find with same key.
-                                var nlpTokOpt = findDeletedToken(key, delNotes, _ == key.id)
+                                var nlpTokOpt = findDeletedToken(
+                                    key,
+                                    delNotes,
+                                    (delNote: NlpNote) => key.similar(delNote)
+                                )
 
                                 // If couldn't find nlp note, we can try to find any note on the same position.
                                 if (nlpTokOpt.isEmpty && key.id == "nlpcraft:nlp")
@@ -249,10 +241,10 @@ object NCProbeVariants {
                             }
                         )
 
-                    parts.zip(list.asScala).foreach { case (part, map) =>
-                        map.get(TOK_META_ALIASES_KEY) match {
+                    parts.zip(keys).foreach { case (part, key) =>
+                        key.aliases match {
                             case null => // No-op.
-                            case aliases => part.getMetadata.put(TOK_META_ALIASES_KEY, aliases.asInstanceOf[Object])
+                            case aliases => part.getMetadata.put(TOK_META_ALIASES_KEY, aliases)
                         }
                     }
 
@@ -267,7 +259,7 @@ object NCProbeVariants {
                             getOrElse(throw new NCE(s"Token not found for $tok"))
                     )
 
-                    ok = ok && !toks.exists(t => t.getId != "nlpcraft:nlp" && keys.contains(Key(t)))
+                    ok = ok && !toks.exists(t => t.getId != "nlpcraft:nlp" && keys.contains(NCTokenPartKey(t)))
                 case None => // No-op.
             }
         }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCTokenPartKey.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCTokenPartKey.scala
new file mode 100644
index 0000000..c89cae1
--- /dev/null
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCTokenPartKey.scala
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.probe.mgrs
+
+import org.apache.nlpcraft.common.TOK_META_ALIASES_KEY
+import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken}
+import org.apache.nlpcraft.model.NCToken
+import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.{NCSynonymChunkKind, _}
+
+import java.io.{Serializable => JSerializable}
+import java.util
+import java.util.{List => JList}
+import scala.compat.java8.OptionConverters.RichOptionalGeneric
+import scala.jdk.CollectionConverters.{MapHasAsJava, MapHasAsScala}
+import scala.language.implicitConversions
+import scala.collection.mutable
+
+/**
+ *
+ */
+object NCTokenPartKey {
+    def apply(m: util.HashMap[String, JSerializable]): NCTokenPartKey = {
+        def get[T](name: String): T = m.get(name).asInstanceOf[T]
+
+        NCTokenPartKey(get("id"), get("startcharindex"), get("endcharindex"), get("data"))
+    }
+
+    def apply(part: NCToken, kind: NCSynonymChunkKind): NCTokenPartKey = {
+        val id = part.getId
+
+        val m: Map[String, Any] =
+            if (kind != TEXT)
+                id match {
+                    case "nlpcraft:relation" =>
+                        Map(
+                            "type" -> part.meta[String](s"$id:type"),
+                            "note" -> part.meta[String](s"$id:note")
+                        )
+                    case "nlpcraft:limit" =>
+                        Map(
+                            "limit" -> part.meta[Double](s"$id:limit"),
+                            "note" -> part.meta[String](s"$id:note")
+                        )
+                    case "nlpcraft:sort" =>
+                        val m = mutable.HashMap.empty[String, Any]
+
+                        def add(name: String): Unit =
+                            part.metaOpt[JList[String]](s"$id:$name").asScala match {
+                                case Some(list) => m += name -> list
+                                case None => // No-op.
+                            }
+
+                        add("subjnotes")
+                        add("bynotes")
+
+                        m.toMap
+                    case _ => Map.empty
+                }
+            else
+                Map.empty
+
+        val key = new NCTokenPartKey(
+            if (kind == TEXT) "nlpcraft:nlp" else id,
+            part.getStartCharIndex,
+            part.getEndCharIndex,
+            m.asJava
+        )
+
+        key.aliases = part.getMetadata.get(TOK_META_ALIASES_KEY)
+
+        key
+    }
+
+    def apply(t: NCToken): NCTokenPartKey =
+        new NCTokenPartKey(t.getId, t.getStartCharIndex, t.getEndCharIndex, Map.empty[String, Any].asJava)
+
+    def apply(note: NCNlpSentenceNote, sen: NCNlpSentence): NCTokenPartKey =
+        NCTokenPartKey(
+            note.noteType,
+            sen(note.tokenFrom).startCharIndex,
+            sen(note.tokenTo).endCharIndex,
+            Map.empty[String, Any].asJava
+        )
+
+    def apply(note: NCNlpSentenceNote, toks: Seq[NCNlpSentenceToken]): NCTokenPartKey = {
+        val sorted = toks.sortBy(_.index)
+
+        NCTokenPartKey(
+            note.noteType,
+            sorted.head.startCharIndex,
+            sorted.last.endCharIndex,
+            Map.empty[String, Any].asJava
+        )
+    }
+}
+
+/**
+ *
+ * @param id
+ * @param from
+ * @param to
+ * @param data
+ */
+case class NCTokenPartKey(id: String, from: Int, to: Int, data: util.Map[String, Any]) {
+    require(from <= to)
+
+    var aliases: AnyRef = _
+
+    private def in(i: Int): Boolean = i >= from && i <= to
+
+    def intersect(id: String, from: Int, to: Int): Boolean = id == this.id && (in(from) || in(to))
+
+    def similar(note: NCNlpSentenceNote): Boolean =
+        id == note.noteType &&
+        (
+            data.isEmpty ||
+            data.asScala.forall { case (k, v) => note.contains(k) && note.data(k) == v }
+        )
+}
\ No newline at end of file
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 1061ff8..a2deee8 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -22,19 +22,18 @@ import org.apache.nlpcraft.common._
 import org.apache.nlpcraft.common.nlp.{NCNlpSentence => Sentence, NCNlpSentenceNote => NlpNote, NCNlpSentenceToken => NlpToken}
 import org.apache.nlpcraft.model._
 import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCIdlContent
-import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.{NCSynonymChunkKind, _}
+import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.NCSynonymChunkKind
 import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
 import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl
 import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager
-import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeVariants, NCProbeSynonym => Synonym}
+import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeVariants, NCTokenPartKey, NCProbeSynonym => Synonym}
 
 import java.io.Serializable
-import java.util
 import java.util.{List => JList}
-import scala.collection.mutable.ArrayBuffer
 import scala.collection.mutable
-import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsJava, MapHasAsScala, SeqHasAsJava}
+import scala.collection.mutable.ArrayBuffer
 import scala.collection.parallel.CollectionConverters._
+import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsJava, MapHasAsScala, SeqHasAsJava}
 
 /**
  * Model elements enricher.
@@ -185,21 +184,8 @@ object NCModelEnricher extends NCProbeEnricher {
                 case None => // No-op.
             }
 
-            if (parts.nonEmpty) {
-                val partsData: Seq[util.HashMap[String, Any]] =
-                    parts.map { case (part, kind) =>
-                        val m = new util.HashMap[String, Any]()
-
-                        m.put("id", if (kind == TEXT) "nlpcraft:nlp" else part.getId)
-                        m.put("startcharindex", part.getStartCharIndex)
-                        m.put("endcharindex", part.getEndCharIndex)
-                        m.put(TOK_META_ALIASES_KEY, part.getMetadata.get(TOK_META_ALIASES_KEY))
-
-                        m
-                    }
-
-                params += "parts" -> partsData.asJava
-            }
+            if (parts.nonEmpty)
+                params += "parts" -> parts.map { case (p, kind) => NCTokenPartKey(p, kind) }.asJava
 
             val idxs = toks.map(_.index).sorted
 
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index 339bb4c..74ead87 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -21,14 +21,15 @@ import io.opencensus.trace.Span
 import org.apache.nlpcraft.common.nlp.NCNlpSentence.NoteLink
 import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
 import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken}
-import org.apache.nlpcraft.common.{NCE, NCService, U}
+import org.apache.nlpcraft.common.{NCE, NCService, U, _}
 import org.apache.nlpcraft.model.NCModel
+import org.apache.nlpcraft.probe.mgrs.NCTokenPartKey
 
 import java.io.{Serializable => JSerializable}
 import java.util
 import java.util.{List => JList}
 import scala.collection.mutable
-import scala.collection.parallel.CollectionConverters.ImmutableIterableIsParallelizable
+import scala.collection.parallel.CollectionConverters._
 import scala.jdk.CollectionConverters.{ListHasAsScala, SeqHasAsJava, SetHasAsJava}
 import scala.language.implicitConversions
 
@@ -42,23 +43,6 @@ object NCSentenceManager extends NCService {
     type CacheValue = Seq[Seq[NCNlpSentenceNote]]
 
     private val combCache = mutable.HashMap.empty[String, mutable.HashMap[CacheKey, CacheValue]]
 
-    case class PartKey(id: String, start: Int, end: Int) {
-        require(start <= end)
-
-        private def in(i: Int): Boolean = i >= start && i <= end
-        def intersect(id: String, start: Int, end: Int): Boolean = id == this.id && (in(start) || in(end))
-    }
-
-    object PartKey {
-        def apply(m: util.HashMap[String, JSerializable]): PartKey = {
-            def get[T](name: String): T = m.get(name).asInstanceOf[T]
-
-            PartKey(get("id"), get("startcharindex"), get("endcharindex"))
-        }
-
-        def apply(t: NCNlpSentenceNote, sen: NCNlpSentence): PartKey =
-            PartKey(t.noteType, sen(t.tokenFrom).startCharIndex, sen(t.tokenTo).endCharIndex)
-    }
     /**
      *
@@ -95,14 +79,14 @@ object NCSentenceManager extends NCService {
      *
     * @param notes
      */
-    private def getPartKeys(notes: NCNlpSentenceNote*): Seq[PartKey] =
+    private def getPartKeys(notes: NCNlpSentenceNote*): Seq[NCTokenPartKey] =
         notes.
            filter(_.isUser).
            flatMap(n => {
-                val optList: Option[JList[util.HashMap[String, JSerializable]]] = n.dataOpt("parts")
+                val optList: Option[JList[NCTokenPartKey]] = n.dataOpt("parts")
 
                optList
-            }).flatMap(_.asScala).map(m => PartKey(m)).distinct
+            }).flatMap(_.asScala).distinct
 
     /**
      *
@@ -666,7 +650,7 @@ object NCSentenceManager extends NCService {
                 filter(getPartKeys(_).isEmpty).
                 flatMap(note => {
                     val noteWordsIdxs = note.wordIndexes.toSet
-                    val key = PartKey(note, sen)
+                    val key = NCTokenPartKey(note, sen)
 
                     val delCombOthers =
                         delCombs.filter(_ != note).flatMap(n => if (getPartKeys(n).contains(key)) Some(n) else None)
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
index 10a28e8..a83f697 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
@@ -44,8 +44,7 @@ class NCAbstractTokensModelVariants extends NCAbstractTokensModel {
 
                 val limNote = limitPart.getMetadata.get("nlpcraft:limit:note").asInstanceOf[String]
 
-                // TODO: wrapAnyWord? - check it (ticket NLPCRAFT-337)
-                require(limNote == "anyWord", s"Unexpected limit token note: '$limNote', token: $limitPart, meta: ${limitPart.getMetadata}")
+                require(limNote == "wrapAnyWord", s"Unexpected limit token note: '$limNote', token: $limitPart, meta: ${limitPart.getMetadata}")
 
                 val limIdxs = limitPart.getMetadata.get("nlpcraft:limit:indexes").asInstanceOf[util.List[Integer]].asScala
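
For readers following NLPCRAFT-337: the core of this commit is folding the duplicated Key (NCProbeVariants) and PartKey (NCSentenceManager) helpers into the single NCTokenPartKey case class shown in the diff, whose intersect/similar methods drive the deleted-note lookup. The snippet below is only a simplified, standalone sketch of those two checks, not the committed class: PartKeySketch and NoteSketch are hypothetical stand-ins, and plain Scala Maps replace the java.util.Map / NCNlpSentenceNote types used in the real code.

    // Simplified sketch of the NCTokenPartKey matching semantics (assumed names, plain Scala Maps).
    case class NoteSketch(noteType: String, data: Map[String, Any])

    case class PartKeySketch(id: String, from: Int, to: Int, data: Map[String, Any] = Map.empty) {
        require(from <= to)

        private def in(i: Int): Boolean = i >= from && i <= to

        // Same element ID and overlapping character ranges.
        def intersect(id: String, from: Int, to: Int): Boolean =
            id == this.id && (in(from) || in(to))

        // Same note type and every key/value carried by this key is present in the note.
        def similar(note: NoteSketch): Boolean =
            id == note.noteType &&
            (data.isEmpty || data.forall { case (k, v) => note.data.get(k).contains(v) })
    }

    object PartKeySketchDemo extends App {
        val key = PartKeySketch("nlpcraft:limit", 0, 10, Map("note" -> "wrapAnyWord"))

        println(key.intersect("nlpcraft:limit", 5, 20))  // true: ranges [0, 10] and [5, 20] overlap.
        println(key.similar(NoteSketch("nlpcraft:limit", Map("note" -> "wrapAnyWord", "limit" -> 2.0)))) // true: key data is a subset.
        println(key.similar(NoteSketch("nlpcraft:limit", Map("note" -> "anyWord"))))                     // false: "note" value differs.
    }

This also illustrates why findDeletedToken now takes an NlpNote predicate built from key.similar(delNote) instead of the old note-type string comparison: matching can take the key's extra data (e.g. the limit note) into account.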
