(opennlp) 01/01: OPENNLP-1767: Fix sentence detection when an abbreviation overlaps at sentence end

mawiesne Thu, 31 Jul 2025 20:58:16 -0700

This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch 
OPENNLP-1767-Fix-sentence-detection-when-an-abbreviation-overlaps-at-sentence-end
in repository https://gitbox.apache.org/repos/asf/opennlp.git


commit 2681b5a45db72265668b4a74fb9ace17a546c1f8
Author: Martin Wiesner <[email protected]>
AuthorDate: Fri Aug 1 05:58:02 2025 +0200

    OPENNLP-1767: Fix sentence detection when an abbreviation overlaps at 
sentence end
---
 .../tools/sentdetect/SentenceDetectorME.java       | 24 ++++++--
 .../sentdetect/SentenceDetectorMEGermanTest.java   | 72 ++++++++++++++++------
 .../test/resources/opennlp/tools/lang/abb_DE.xml   | 14 ++++-
 3 files changed, 85 insertions(+), 25 deletions(-)

diff --git 
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
index e627f3cd..fe474288 100644
--- 
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
+++ 
b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
@@ -340,15 +340,27 @@ public class SentenceDetectorME implements 
SentenceDetector, Probabilistic {
       return true;
 
     for (StringList abb : abbDict) {
-      String token = abb.getToken(0);
+      final String token = abb.getToken(0);
+      final int tokenPosition = s.toString().indexOf(token, fromIndex);
+      if (tokenPosition == -1) {
+        continue; // skip fast
+      }
+      final char prevChar = s.charAt(tokenPosition - 1);
       int tokenLength = token.length();
-      int tokenPosition = s.toString().indexOf(token, fromIndex);
-      if (tokenPosition + tokenLength < candidateIndex || tokenPosition > 
candidateIndex)
-        continue;
+      if (tokenPosition + tokenLength < candidateIndex || tokenPosition > 
candidateIndex ||
+        /*
+         * Note:
+         * Skip the abbreviation candidate if a regular character exists directly 
before it,
+         * that is, any character other than a space, an apostrophe (', `, ´), 
or an opening round bracket.
+         * This prevents mismatches from overlaps close to an actual sentence 
end.
+         */
+          !(prevChar == ' ' || prevChar == '\'' || prevChar == '`' || prevChar 
== '´' || prevChar == '(')) {
 
-      return false;
+        continue;
+      }
+      return false; // in case of a valid abbreviation: the (sentence) break 
is not accepted
     }
-    return true;
+    return true; // no abbreviation(s) at given positions: valid sentence 
boundary
   }
 
   /**
diff --git 
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
 
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
index 4f814da9..411d3d4f 100644
--- 
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
+++ 
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
@@ -22,12 +22,15 @@ import java.util.Locale;
 
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
 
 import opennlp.tools.dictionary.Dictionary;
 
 import static org.junit.jupiter.api.Assertions.assertAll;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.junit.jupiter.api.Assertions.fail;
 
 /**
@@ -49,6 +52,9 @@ public class SentenceDetectorMEGermanTest extends 
AbstractSentenceDetectorTest {
   private static Dictionary abbreviationDict;
   private SentenceModel sentdetectModel;
 
+  // SUT
+  private SentenceDetectorME sentenceDetector;
+
   @BeforeAll
   static void loadResources() throws IOException {
     abbreviationDict = loadAbbDictionary(Locale.GERMAN);
@@ -60,8 +66,11 @@ public class SentenceDetectorMEGermanTest extends 
AbstractSentenceDetectorTest {
           "deu", useTokenEnd, abbreviationDict, EOS_CHARS);
       sentdetectModel = train(factory, Locale.GERMAN);
 
-      assertAll(() -> assertNotNull(sentdetectModel),
-          () -> assertEquals("deu", sentdetectModel.getLanguage()));
+      assertAll(
+          () -> assertNotNull(sentdetectModel),
+          () -> assertEquals("deu", sentdetectModel.getLanguage())
+      );
+      sentenceDetector = new SentenceDetectorME(sentdetectModel);
     } catch (IOException ex) {
       fail("Couldn't train the SentenceModel using test data. Exception: " + 
ex.getMessage());
     }
@@ -77,12 +86,12 @@ public class SentenceDetectorMEGermanTest extends 
AbstractSentenceDetectorTest {
     // Here we have two abbreviations "S. = Seite" and "ff. = folgende 
(Plural)"
     final String sent2 = "Ich wähle den auf S. 183 ff. mitgeteilten Traum von 
der botanischen Monographie.";
 
-    SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
     String sampleSentences = sent1 + " " + sent2;
-    String[] sents = sentDetect.sentDetect(sampleSentences);
-    double[] probs = sentDetect.probs();
+    String[] sents = sentenceDetector.sentDetect(sampleSentences);
+    double[] probs = sentenceDetector.probs();
 
-    assertAll(() -> assertEquals(2, sents.length),
+    assertAll(
+        () -> assertEquals(2, sents.length),
         () -> assertEquals(sent1, sents[0]),
         () -> assertEquals(sent2, sents[1]),
         () -> assertEquals(2, probs.length));
@@ -97,11 +106,11 @@ public class SentenceDetectorMEGermanTest extends 
AbstractSentenceDetectorTest {
     final String sent1 = "Die farbige Tafel, die ich aufschlage, " +
         "geht (vgl. die Analyse S. 185 f.) auf ein neues Thema ein.";
 
-    SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
-    String[] sents = sentDetect.sentDetect(sent1);
-    double[] probs = sentDetect.probs();
+    String[] sents = sentenceDetector.sentDetect(sent1);
+    double[] probs = sentenceDetector.probs();
 
-    assertAll(() -> assertEquals(1, sents.length),
+    assertAll(
+        () -> assertEquals(1, sents.length),
         () -> assertEquals(sent1, sents[0]),
         () -> assertEquals(1, probs.length));
   }
@@ -115,11 +124,11 @@ public class SentenceDetectorMEGermanTest extends 
AbstractSentenceDetectorTest {
     final String sent1 = "Die farbige Tafel, die ich aufschlage, " +
         "geht (z. B. die Analyse S. 185) auf ein neues Thema ein.";
 
-    SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
-    String[] sents = sentDetect.sentDetect(sent1);
-    double[] probs = sentDetect.probs();
+    String[] sents = sentenceDetector.sentDetect(sent1);
+    double[] probs = sentenceDetector.probs();
 
-    assertAll(() -> assertEquals(1, sents.length),
+    assertAll(
+        () -> assertEquals(1, sents.length),
         () -> assertEquals(sent1, sents[0]),
         () -> assertEquals(1, probs.length));
   }
@@ -131,14 +140,41 @@ public class SentenceDetectorMEGermanTest extends 
AbstractSentenceDetectorTest {
     final String sent1 = "Träume sind eine Verbindung von Gedanken.";
     final String sent2 = "Verschiedene Gedanken sind während der 
Traumformation aktiv.";
 
-    SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
     //There is no blank space before start of the second sentence.
-    String[] sents = sentDetect.sentDetect(sent1 + sent2);
-    double[] probs = sentDetect.probs();
+    String[] sents = sentenceDetector.sentDetect(sent1 + sent2);
+    double[] probs = sentenceDetector.probs();
 
-    assertAll(() -> assertEquals(2, sents.length),
+    assertAll(
+        () -> assertEquals(2, sents.length),
         () -> assertEquals(sent1, sents[0]),
         () -> assertEquals(sent2, sents[1]),
         () -> assertEquals(2, probs.length));
   }
+
+  /*
+   * A reproducer and test for OPENNLP-1767.
+   * It checks that sentence detection with common abbreviations works 
correctly,
+   * that is, tokens such as "lt.", "f.", "S." (page), "ca.", or "ugs." do 
not cause
+   * mismatches when they accidentally overlap at the end of a sentence.
+   */
+  @ParameterizedTest
+  @ValueSource(strings = {
+      "Die Frage wurde gestellt. Sie wurde beantwortet.",
+      "Der Auto stand schief. Wer hat es dort geparkt?",
+      "Es lag am DBMS. Die Performance muss verbessert werden.",
+      "Siehe Buch S. 17f. Dort ist es zu finden.",
+      "Sie trank einen Mocca. Er schmeckte ihr!",
+      "Der Anker hängt zu Beginn des Bugs. Es ist vertaut.",
+      "Das Verfahren testet auf HIV. Es ist präzise."
+  })
+  void testSentDetectWithOverlappingAbbreviationAtSentenceEnd(String input) {
+    prepareResources(true);
+    String[] sents = sentenceDetector.sentDetect(input);
+    assertAll(
+        () -> assertNotNull(sents),
+        () -> assertEquals(2, sents.length),
+        () -> assertTrue(Character.isUpperCase(sents[0].charAt(0))),
+        () -> assertTrue(Character.isUpperCase(sents[1].charAt(0)))
+    );
+  }
 }
diff --git 
a/opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/lang/abb_DE.xml 
b/opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/lang/abb_DE.xml
index ac7f9589..23e09abc 100644
--- 
a/opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/lang/abb_DE.xml
+++ 
b/opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/lang/abb_DE.xml
@@ -21,7 +21,7 @@
 
 <dictionary case_sensitive="false">
   <entry>
-    <token>S.</token>
+    <token>ca.</token>
   </entry>
   <entry>
     <token>f.</token>
@@ -29,6 +29,18 @@
   <entry>
     <token>ff.</token>
   </entry>
+  <entry>
+    <token>lt.</token>
+  </entry>
+  <entry>
+    <token>S.</token>
+  </entry>
+  <entry>
+    <token>V.</token>
+  </entry>
+  <entry>
+    <token>ugs.</token>
+  </entry>
   <entry>
     <token>z. B.</token>
   </entry>

(opennlp) 01/01: OPENNLP-1767: Fix sentence detection when an abbreviation overlaps at sentence end

Reply via email to