This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/main by this push:
new 55832e48 OPENNLP-1767: Fix sentence detection when an abbreviation
overlaps at sentence end (#829)
55832e48 is described below
commit 55832e48f53c9c33eac0248d85e4a9b92bb560a3
Author: Martin Wiesner <[email protected]>
AuthorDate: Sun Aug 3 09:13:58 2025 +0200
OPENNLP-1767: Fix sentence detection when an abbreviation overlaps at
sentence end (#829)
---
.../tools/sentdetect/SentenceDetectorME.java | 32 ++++++++--
.../sentdetect/SentenceDetectorMEGermanTest.java | 72 ++++++++++++++++------
.../test/resources/opennlp/tools/lang/abb_DE.xml | 14 ++++-
3 files changed, 93 insertions(+), 25 deletions(-)
diff --git
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
index e627f3cd..ddcc3388 100644
---
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
+++
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
@@ -340,15 +340,35 @@ public class SentenceDetectorME implements
SentenceDetector, Probabilistic {
return true;
for (StringList abb : abbDict) {
- String token = abb.getToken(0);
+ final String token = abb.getToken(0);
+ final int tokenPosition = s.toString().indexOf(token, fromIndex);
+ if (tokenPosition == -1) {
+ continue; // skip fast
+ }
+ final char prevChar = s.charAt(tokenPosition - 1);
int tokenLength = token.length();
- int tokenPosition = s.toString().indexOf(token, fromIndex);
- if (tokenPosition + tokenLength < candidateIndex || tokenPosition >
candidateIndex)
- continue;
+ if (tokenPosition + tokenLength < candidateIndex || tokenPosition >
candidateIndex ||
+ /*
+ * Note:
+ * Skip the abbreviation candidate if a regular character exists directly
before it,
+ * that is, any character other than a whitespace, an apostrophe,
or an opening round bracket.
+ * This prevents mismatches from overlaps close to an actual sentence
end.
+ */
+ !(Character.isWhitespace(prevChar) || isApostrophe(prevChar) ||
prevChar == '(')) {
- return false;
+ continue;
+ }
+ return false; // in case of a valid abbreviation: the (sentence) break
is not accepted
}
- return true;
+ return true; // no abbreviation(s) at given positions: valid sentence
boundary
+ }
+
+ /**
+ * @param c The character to check.
+ * @return {@code true} if the character represents an apostrophe, {@code
false} otherwise.
+ */
+ private static boolean isApostrophe(char c) {
+ return c == '\'' || c == '`' || c == '´';
}
/**
diff --git
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
index 4f814da9..d95a1eec 100644
---
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
+++
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
@@ -22,12 +22,15 @@ import java.util.Locale;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
import opennlp.tools.dictionary.Dictionary;
import static org.junit.jupiter.api.Assertions.assertAll;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
/**
@@ -49,6 +52,9 @@ public class SentenceDetectorMEGermanTest extends
AbstractSentenceDetectorTest {
private static Dictionary abbreviationDict;
private SentenceModel sentdetectModel;
+ // SUT
+ private SentenceDetectorME sentenceDetector;
+
@BeforeAll
static void loadResources() throws IOException {
abbreviationDict = loadAbbDictionary(Locale.GERMAN);
@@ -60,8 +66,11 @@ public class SentenceDetectorMEGermanTest extends
AbstractSentenceDetectorTest {
"deu", useTokenEnd, abbreviationDict, EOS_CHARS);
sentdetectModel = train(factory, Locale.GERMAN);
- assertAll(() -> assertNotNull(sentdetectModel),
- () -> assertEquals("deu", sentdetectModel.getLanguage()));
+ assertAll(
+ () -> assertNotNull(sentdetectModel),
+ () -> assertEquals("deu", sentdetectModel.getLanguage())
+ );
+ sentenceDetector = new SentenceDetectorME(sentdetectModel);
} catch (IOException ex) {
fail("Couldn't train the SentenceModel using test data. Exception: " +
ex.getMessage());
}
@@ -77,12 +86,12 @@ public class SentenceDetectorMEGermanTest extends
AbstractSentenceDetectorTest {
// Here we have two abbreviations "S. = Seite" and "ff. = folgende
(Plural)"
final String sent2 = "Ich wähle den auf S. 183 ff. mitgeteilten Traum von
der botanischen Monographie.";
- SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
String sampleSentences = sent1 + " " + sent2;
- String[] sents = sentDetect.sentDetect(sampleSentences);
- double[] probs = sentDetect.probs();
+ String[] sents = sentenceDetector.sentDetect(sampleSentences);
+ double[] probs = sentenceDetector.probs();
- assertAll(() -> assertEquals(2, sents.length),
+ assertAll(
+ () -> assertEquals(2, sents.length),
() -> assertEquals(sent1, sents[0]),
() -> assertEquals(sent2, sents[1]),
() -> assertEquals(2, probs.length));
@@ -97,11 +106,11 @@ public class SentenceDetectorMEGermanTest extends
AbstractSentenceDetectorTest {
final String sent1 = "Die farbige Tafel, die ich aufschlage, " +
"geht (vgl. die Analyse S. 185 f.) auf ein neues Thema ein.";
- SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
- String[] sents = sentDetect.sentDetect(sent1);
- double[] probs = sentDetect.probs();
+ String[] sents = sentenceDetector.sentDetect(sent1);
+ double[] probs = sentenceDetector.probs();
- assertAll(() -> assertEquals(1, sents.length),
+ assertAll(
+ () -> assertEquals(1, sents.length),
() -> assertEquals(sent1, sents[0]),
() -> assertEquals(1, probs.length));
}
@@ -115,11 +124,11 @@ public class SentenceDetectorMEGermanTest extends
AbstractSentenceDetectorTest {
final String sent1 = "Die farbige Tafel, die ich aufschlage, " +
"geht (z. B. die Analyse S. 185) auf ein neues Thema ein.";
- SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
- String[] sents = sentDetect.sentDetect(sent1);
- double[] probs = sentDetect.probs();
+ String[] sents = sentenceDetector.sentDetect(sent1);
+ double[] probs = sentenceDetector.probs();
- assertAll(() -> assertEquals(1, sents.length),
+ assertAll(
+ () -> assertEquals(1, sents.length),
() -> assertEquals(sent1, sents[0]),
() -> assertEquals(1, probs.length));
}
@@ -131,14 +140,41 @@ public class SentenceDetectorMEGermanTest extends
AbstractSentenceDetectorTest {
final String sent1 = "Träume sind eine Verbindung von Gedanken.";
final String sent2 = "Verschiedene Gedanken sind während der
Traumformation aktiv.";
- SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
//There is no blank space before start of the second sentence.
- String[] sents = sentDetect.sentDetect(sent1 + sent2);
- double[] probs = sentDetect.probs();
+ String[] sents = sentenceDetector.sentDetect(sent1 + sent2);
+ double[] probs = sentenceDetector.probs();
- assertAll(() -> assertEquals(2, sents.length),
+ assertAll(
+ () -> assertEquals(2, sents.length),
() -> assertEquals(sent1, sents[0]),
() -> assertEquals(sent2, sents[1]),
() -> assertEquals(2, probs.length));
}
+
+ /*
+ * A reproducer and test for OPENNLP-1767.
+ * It checks that sentence detection with common abbreviations works
correctly,
+ * that is, tokens such as "lt.", "f.", "S." (page), "ca.", or "ugs." do not
cause
+ * mismatches when they accidentally overlap at the end of a sentence.
+ */
+ @ParameterizedTest
+ @ValueSource(strings = {
+ "Die Frage wurde gestellt. Sie wurde beantwortet.",
+ "Der Auto stand schief. Wer hat es dort geparkt?",
+ "Es lag am DBMS. Die Performance muss verbessert werden.",
+ "Siehe Buch S. 17f. Dort ist es zu finden.",
+ "Sie trank einen Mocca. Er schmeckte ihr!",
+ "Der Anker hängt zu Beginn des Bugs. Es ist vertaut.",
+ "Das Verfahren testet auf HIV. Es ist präzise."
+ })
+ void testSentDetectWithOverlappingAbbreviationAtSentenceEnd(String input) {
+ prepareResources(true);
+ String[] sents = sentenceDetector.sentDetect(input);
+ assertAll(
+ () -> assertNotNull(sents),
+ () -> assertEquals(2, sents.length),
+ () -> assertTrue(Character.isUpperCase(sents[0].charAt(0))),
+ () -> assertTrue(Character.isUpperCase(sents[1].charAt(0)))
+ );
+ }
}
diff --git
a/opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/lang/abb_DE.xml
b/opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/lang/abb_DE.xml
index ac7f9589..23e09abc 100644
---
a/opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/lang/abb_DE.xml
+++
b/opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/lang/abb_DE.xml
@@ -21,7 +21,7 @@
<dictionary case_sensitive="false">
<entry>
- <token>S.</token>
+ <token>ca.</token>
</entry>
<entry>
<token>f.</token>
@@ -29,6 +29,18 @@
<entry>
<token>ff.</token>
</entry>
+ <entry>
+ <token>lt.</token>
+ </entry>
+ <entry>
+ <token>S.</token>
+ </entry>
+ <entry>
+ <token>V.</token>
+ </entry>
+ <entry>
+ <token>ugs.</token>
+ </entry>
<entry>
<token>z. B.</token>
</entry>