This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch opennlp-2.x
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/opennlp-2.x by this push:
new a70273ce [2.x] OPENNLP-1781: SentenceDetectorME throws StringIndexOutOfBoundsException when sentence starts with an abbreviation (#882)
a70273ce is described below
commit a70273ce3f1343a88f6face3992627785b480e8e
Author: Richard Zowalla <[email protected]>
AuthorDate: Tue Oct 14 13:20:55 2025 +0200
[2.x] OPENNLP-1781: SentenceDetectorME throws StringIndexOutOfBoundsException when sentence starts with an abbreviation (#882)
---
.../opennlp/tools/sentdetect/SentenceDetectorME.java | 3 ++-
.../sentdetect/SentenceDetectorMEGermanTest.java | 19 +++++++++++++++++++
2 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
index 26146c67..f64768df 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
@@ -345,7 +345,8 @@ public class SentenceDetectorME implements SentenceDetector, Probabilistic {
if (tokenPosition == -1) {
continue; // skip fast
}
- final char prevChar = s.charAt(tokenPosition - 1);
+
+ final char prevChar = s.charAt(tokenPosition == 0 ? tokenPosition : tokenPosition - 1);
int tokenLength = token.length();
if (tokenPosition + tokenLength < candidateIndex || tokenPosition > candidateIndex ||
/*
diff --git a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
index b34285dc..33560133 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
@@ -151,6 +151,24 @@ public class SentenceDetectorMEGermanTest extends AbstractSentenceDetectorTest {
() -> assertEquals(2, probs.length));
}
+ /*
+ * A reproducer and test for OPENNLP-1781.
+ */
+ @Test
+ void testSentDetectWithAbbreviationsAtSentenceStart() {
+ prepareResources(true);
+
+ final String sent1 = "S. Träume sind eine Verbindung von Gedanken.";
+
+ final String[] sents = sentenceDetector.sentDetect(sent1);
+ final double[] probs = sentenceDetector.probs();
+
+ assertAll(
+ () -> assertEquals(1, sents.length),
+ () -> assertEquals(sent1, sents[0]),
+ () -> assertEquals(1, probs.length));
+ }
+
/*
* A reproducer and test for OPENNLP-1767.
* It checks that sentence detection with common abbreviations works correctly,
@@ -163,6 +181,7 @@ public class SentenceDetectorMEGermanTest extends AbstractSentenceDetectorTest {
"Der Auto stand schief. Wer hat es dort geparkt?",
"Es lag am DBMS. Die Performance muss verbessert werden.",
"Siehe Buch S. 17f. Dort ist es zu finden.",
+ "S. Buch S. 17f. Dort ist es zu finden.", // OPENNLP-1781
"Sie trank einen Mocca. Er schmeckte ihr!",
"Der Anker hängt zu Beginn des Bugs. Es ist vertaut.",
"Das Verfahren testet auf HIV. Es ist präzise."