This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-472
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-472 by this push:
new 01aa390 WIP.
01aa390 is described below
commit 01aa3907841356db65b4e17ba92ea18b8601fed0
Author: Sergey Kamov <[email protected]>
AuthorDate: Wed Dec 29 23:01:19 2021 +0300
WIP.
---
.../semantic/impl/NCSemanticEntityParserImpl.scala | 14 +++++-----
.../entity/parser/semantic/impl/NCSynonym.scala | 31 +++++++++++++++++-----
.../parser/semantic/impl/NCSynonymChunk.scala | 16 +++++------
3 files changed, 39 insertions(+), 22 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
index d05c85e..ded73cc 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
@@ -55,7 +55,7 @@ object NCSemanticEntityParserImpl:
val ptrn = stripSuffix(REGEX_FIX, chunk)
if (ptrn.nonEmpty) {
try
- NCSynonymChunk(kind = REGEX, origText = chunk, regex = Pattern.compile(ptrn))
+ NCSynonymChunk(kind = REGEX, text = chunk, regex = Pattern.compile(ptrn))
catch {
case e: PatternSyntaxException =>
throw new NCException(s"Invalid regex synonym syntax detected [" +
@@ -68,9 +68,8 @@ object NCSemanticEntityParserImpl:
s"chunk=$chunk" +
s"]")
}
- // IDL-based synonym.
else
- NCSynonymChunk(kind = TEXT, origText = chunk, wordStem = stemmer.stem(chunk))
+ NCSynonymChunk(kind = TEXT, text = chunk, stem = stemmer.stem(chunk))
}
private def getPieces(toks: Seq[NCToken]): Seq[Piece] =
@@ -143,15 +142,15 @@ class NCSemanticEntityParserImpl(stemmer: NCStemmer, macros: Map[String, String]
syns.
flatMap(p.expand).
map(t => cfg.getTokenizer.tokenize(cfg, t).asScala.map(w => mkChunk(stemmer, w.getText)).toSeq).
- toSeq.map(chunks => NCSynonym(false, false, false, null, chunks))
+ // TODO:
+ toSeq.map(chunks => NCSynonym(false, false, null, chunks))
)
})
- // TODO: sort
sortedSyns =
all.groupBy(_.synonyms.size).map {
case (len, hs) =>
- len -> hs.groupBy(_.elemId).map { case (elemId, seq) => elemId -> seq.flatMap(_.synonyms).toSeq }
+ len -> hs.groupBy(_.elemId).map { case (id, seq) => id -> seq.flatMap(_.synonyms).toSeq.sorted }
}
override def stop(): Unit = sortedSyns = null
@@ -180,5 +179,4 @@ class NCSemanticEntityParserImpl(stemmer: NCStemmer, macros: Map[String, String]
for (piece <- value; extra <- Seq(piece.main) ++ piece.extra)
tryMatch(piece.main, extra)
- entities.toSeq.asJava
-
+ entities.toSeq.asJava
\ No newline at end of file
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonym.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonym.scala
index 2beed29..fa3b36e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonym.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonym.scala
@@ -17,18 +17,37 @@
package org.apache.nlpcraft.nlp.entity.parser.semantic.impl
import org.apache.nlpcraft.NCToken
+import org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSynonymChunkKind._
case class NCSynonym(
isElementId: Boolean,
isValueName: Boolean,
- isDirect: Boolean,
value: String = null,
chunks: Seq[NCSynonymChunk]
-) {
- private lazy val stem = ""
+) extends Comparable[NCSynonym]:
+ private final val size = chunks.size
+ private final val regexCount = size - chunks.count(_.kind == TEXT)
+ private final val isText = regexCount == 0
+
+ private lazy val stem = if isText then chunks.map(_.stem).mkString(" ") else null
- // TODO: implement.
def isMatch(toks: Seq[NCToken]): Boolean =
- chunks.size == toks.size && chunks.zip(toks).forall { case (chunk, tok) => chunk.wordStem == tok.getStem}
+ size == toks.size && (
+ if isText then
+ stem == toks.map(_.getStem).mkString(" ")
+ else
+ chunks.zip(toks).forall { case (chunk, tok) =>
+ if chunk.stem != null then
+ chunk.stem == tok.getStem
+ else
+ def match0(tokTxt: String) = chunk.regex.matcher(tokTxt).matches()
+
+ match0(tok.getText) || match0(tok.getText.toLowerCase)
+ }
+ )
+
+ override def compareTo(o: NCSynonym): Int = Integer.compare(regexCount, o.regexCount)
+
+
+
-}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunk.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunk.scala
index 86e2c09..9ac33ed 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunk.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSynonymChunk.scala
@@ -24,20 +24,20 @@ import java.util.regex.Pattern
/**
*
* @param kind Kind of synonym chunk.
- * @param origText Original text.
- * @param wordStem Optional stem for a single word synonyms.
- * @param posTag Optional PoS tag to match on.
+ * @param text Original text.
+ * @param stem Optional stem for a single word synonyms.
+ * @param pos Optional PoS tag to match on.
* @param regex Optional regex expression to match on.
*/
case class NCSynonymChunk(
kind: NCSynonymChunkKind,
- origText: String,
- wordStem: String = null, // Only for kind == TEXT.
- posTag: String = null,
+ text: String,
+ stem: String = null, // Only for kind == TEXT.
+ pos: String = null,
regex: Pattern = null
) {
- require(origText != null)
+ require(text != null)
require(kind != null)
- override def toString = s"($origText|$kind)"
+ override def toString = s"($text|$kind)"
}