This is an automated email from the ASF dual-hosted git repository. mawiesne pushed a commit to branch OPENNLP-1767-Fix-sentence-detection-when-an-abbreviation-overlaps-at-sentence-end in repository https://gitbox.apache.org/repos/asf/opennlp.git
commit 2681b5a45db72265668b4a74fb9ace17a546c1f8 Author: Martin Wiesner <[email protected]> AuthorDate: Fri Aug 1 05:58:02 2025 +0200 OPENNLP-1767: Fix sentence detection when an abbreviation overlaps at sentence end --- .../tools/sentdetect/SentenceDetectorME.java | 24 ++++++-- .../sentdetect/SentenceDetectorMEGermanTest.java | 72 ++++++++++++++++------ .../test/resources/opennlp/tools/lang/abb_DE.xml | 14 ++++- 3 files changed, 85 insertions(+), 25 deletions(-) diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java index e627f3cd..fe474288 100644 --- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java +++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java @@ -340,15 +340,27 @@ public class SentenceDetectorME implements SentenceDetector, Probabilistic { return true; for (StringList abb : abbDict) { - String token = abb.getToken(0); + final String token = abb.getToken(0); + final int tokenPosition = s.toString().indexOf(token, fromIndex); + if (tokenPosition == -1) { + continue; // skip fast + } + final char prevChar = s.charAt(tokenPosition - 1); int tokenLength = token.length(); - int tokenPosition = s.toString().indexOf(token, fromIndex); - if (tokenPosition + tokenLength < candidateIndex || tokenPosition > candidateIndex) - continue; + if (tokenPosition + tokenLength < candidateIndex || tokenPosition > candidateIndex || + /* + * Note: + * Skip abbreviation candidate if regular characters exist directly before it, + * That is, any letter or digit except: a whitespace, an apostrophe, or an opening round bracket. + * This prevents mismatches from overlaps close to an actual sentence end. 
+ */ + !(prevChar == ' ' || prevChar == '\'' || prevChar == '`' || prevChar == '´' || prevChar == '(')) { - return false; + continue; + } + return false; // in case of a valid abbreviation: the (sentence) break is not accepted } - return true; + return true; // no abbreviation(s) at given positions: valid sentence boundary } /** diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java index 4f814da9..411d3d4f 100644 --- a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java +++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java @@ -22,12 +22,15 @@ import java.util.Locale; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; import opennlp.tools.dictionary.Dictionary; import static org.junit.jupiter.api.Assertions.assertAll; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; /** @@ -49,6 +52,9 @@ public class SentenceDetectorMEGermanTest extends AbstractSentenceDetectorTest { private static Dictionary abbreviationDict; private SentenceModel sentdetectModel; + // SUT + private SentenceDetectorME sentenceDetector; + @BeforeAll static void loadResources() throws IOException { abbreviationDict = loadAbbDictionary(Locale.GERMAN); @@ -60,8 +66,11 @@ public class SentenceDetectorMEGermanTest extends AbstractSentenceDetectorTest { "deu", useTokenEnd, abbreviationDict, EOS_CHARS); sentdetectModel = train(factory, Locale.GERMAN); - assertAll(() -> assertNotNull(sentdetectModel), - () -> assertEquals("deu", 
sentdetectModel.getLanguage())); + assertAll( + () -> assertNotNull(sentdetectModel), + () -> assertEquals("deu", sentdetectModel.getLanguage()) + ); + sentenceDetector = new SentenceDetectorME(sentdetectModel); } catch (IOException ex) { fail("Couldn't train the SentenceModel using test data. Exception: " + ex.getMessage()); } @@ -77,12 +86,12 @@ public class SentenceDetectorMEGermanTest extends AbstractSentenceDetectorTest { // Here we have two abbreviations "S. = Seite" and "ff. = folgende (Plural)" final String sent2 = "Ich wähle den auf S. 183 ff. mitgeteilten Traum von der botanischen Monographie."; - SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel); String sampleSentences = sent1 + " " + sent2; - String[] sents = sentDetect.sentDetect(sampleSentences); - double[] probs = sentDetect.probs(); + String[] sents = sentenceDetector.sentDetect(sampleSentences); + double[] probs = sentenceDetector.probs(); - assertAll(() -> assertEquals(2, sents.length), + assertAll( + () -> assertEquals(2, sents.length), () -> assertEquals(sent1, sents[0]), () -> assertEquals(sent2, sents[1]), () -> assertEquals(2, probs.length)); @@ -97,11 +106,11 @@ public class SentenceDetectorMEGermanTest extends AbstractSentenceDetectorTest { final String sent1 = "Die farbige Tafel, die ich aufschlage, " + "geht (vgl. die Analyse S. 185 f.) 
auf ein neues Thema ein."; - SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel); - String[] sents = sentDetect.sentDetect(sent1); - double[] probs = sentDetect.probs(); + String[] sents = sentenceDetector.sentDetect(sent1); + double[] probs = sentenceDetector.probs(); - assertAll(() -> assertEquals(1, sents.length), + assertAll( + () -> assertEquals(1, sents.length), () -> assertEquals(sent1, sents[0]), () -> assertEquals(1, probs.length)); } @@ -115,11 +124,11 @@ public class SentenceDetectorMEGermanTest extends AbstractSentenceDetectorTest { final String sent1 = "Die farbige Tafel, die ich aufschlage, " + "geht (z. B. die Analyse S. 185) auf ein neues Thema ein."; - SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel); - String[] sents = sentDetect.sentDetect(sent1); - double[] probs = sentDetect.probs(); + String[] sents = sentenceDetector.sentDetect(sent1); + double[] probs = sentenceDetector.probs(); - assertAll(() -> assertEquals(1, sents.length), + assertAll( + () -> assertEquals(1, sents.length), () -> assertEquals(sent1, sents[0]), () -> assertEquals(1, probs.length)); } @@ -131,14 +140,41 @@ public class SentenceDetectorMEGermanTest extends AbstractSentenceDetectorTest { final String sent1 = "Träume sind eine Verbindung von Gedanken."; final String sent2 = "Verschiedene Gedanken sind während der Traumformation aktiv."; - SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel); //There is no blank space before start of the second sentence. - String[] sents = sentDetect.sentDetect(sent1 + sent2); - double[] probs = sentDetect.probs(); + String[] sents = sentenceDetector.sentDetect(sent1 + sent2); + double[] probs = sentenceDetector.probs(); - assertAll(() -> assertEquals(2, sents.length), + assertAll( + () -> assertEquals(2, sents.length), () -> assertEquals(sent1, sents[0]), () -> assertEquals(sent2, sents[1]), () -> assertEquals(2, probs.length)); } + + /* + * A reproducer and test for OPENNLP-1767. 
+ * It checks that sentence detection with common abbreviations works correctly, + * that is, tokens such as "lt.", "f.", "S." (page), "ca.", or "ugs." do not cause + * mismatches when they accidentally overlap at the end of a sentence. + */ + @ParameterizedTest + @ValueSource(strings = { + "Die Frage wurde gestellt. Sie wurde beantwortet.", + "Der Auto stand schief. Wer hat es dort geparkt?", + "Es lag am DBMS. Die Performance muss verbessert werden.", + "Siehe Buch S. 17f. Dort ist es zu finden.", + "Sie trank einen Mocca. Er schmeckte ihr!", + "Der Anker hängt zu Beginn des Bugs. Es ist vertaut.", + "Das Verfahren testet auf HIV. Es ist präzise." + }) + void testSentDetectWithOverlappingAbbreviationAtSentenceEnd(String input) { + prepareResources(true); + String[] sents = sentenceDetector.sentDetect(input); + assertAll( + () -> assertNotNull(sents), + () -> assertEquals(2, sents.length), + () -> assertTrue(Character.isUpperCase(sents[0].charAt(0))), + () -> assertTrue(Character.isUpperCase(sents[1].charAt(0))) + ); + } } diff --git a/opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/lang/abb_DE.xml b/opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/lang/abb_DE.xml index ac7f9589..23e09abc 100644 --- a/opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/lang/abb_DE.xml +++ b/opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/lang/abb_DE.xml @@ -21,7 +21,7 @@ <dictionary case_sensitive="false"> <entry> - <token>S.</token> + <token>ca.</token> </entry> <entry> <token>f.</token> @@ -29,6 +29,18 @@ <entry> <token>ff.</token> </entry> + <entry> + <token>lt.</token> + </entry> + <entry> + <token>S.</token> + </entry> + <entry> + <token>V.</token> + </entry> + <entry> + <token>ugs.</token> + </entry> <entry> <token>z. B.</token> </entry>
