This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/main by this push:
new eab70aa0 OPENNLP-1745: SentenceDetector - Add JUnit test for
useTokenEnd = false (#792)
eab70aa0 is described below
commit eab70aa07772076879ef10760c75ef7bbd451e0e
Author: NishantShri4 <[email protected]>
AuthorDate: Tue Jun 24 19:58:59 2025 +0100
OPENNLP-1745: SentenceDetector - Add JUnit test for useTokenEnd = false
(#792)
* OPENNLP-1745: SentenceDetector - Add JUnit test for useTokenEnd = false
* Added useTokenEnd to the list of optional params available for the sentence
detector tool.
---
.../sentdetect/SentenceDetectorTrainerTool.java | 2 +-
.../tools/cmdline/sentdetect/TrainingParams.java | 5 ++
.../sentdetect/SentenceDetectorMEGermanTest.java | 85 ++++++++++++++++------
opennlp-docs/src/docbkx/sentdetect.xml | 5 +-
4 files changed, 72 insertions(+), 25 deletions(-)
diff --git
a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java
b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java
index 933895bf..77d09baf 100644
---
a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java
+++
b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java
@@ -92,7 +92,7 @@ public final class SentenceDetectorTrainerTool
try {
Dictionary dict = loadDict(params.getAbbDict());
SentenceDetectorFactory sdFactory = SentenceDetectorFactory.create(
- params.getFactory(), params.getLang(), true, dict, eos);
+ params.getFactory(), params.getLang(), params.getUseTokenEnd(),
dict, eos);
model = SentenceDetectorME.train(params.getLang(), sampleStream,
sdFactory, mlParams);
} catch (IOException e) {
diff --git
a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java
b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java
index 476f929a..37cb7115 100644
---
a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java
+++
b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java
@@ -44,4 +44,9 @@ interface TrainingParams extends BasicTrainingParams {
description = "A sub-class of SentenceDetectorFactory where to get
implementation and resources.")
@OptionalParameter
String getFactory();
+
+ @ParameterDescription(valueName = "useTokenEnd",
+ description = "A boolean parameter to detect the start index of the next
sentence in the test data.")
+ @OptionalParameter(defaultValue = "true")
+ Boolean getUseTokenEnd();
}
diff --git
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
index a520ed27..7593100a 100644
---
a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
+++
b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
@@ -20,12 +20,16 @@ package opennlp.tools.sentdetect;
import java.io.IOException;
import java.util.Locale;
-import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import opennlp.tools.dictionary.Dictionary;
+import static org.junit.jupiter.api.Assertions.assertAll;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.fail;
+
/**
* Tests for the {@link SentenceDetectorME} class.
* <p>
@@ -42,64 +46,99 @@ import opennlp.tools.dictionary.Dictionary;
public class SentenceDetectorMEGermanTest extends AbstractSentenceDetectorTest
{
private static final char[] EOS_CHARS = {'.', '?', '!'};
-
- private static SentenceModel sentdetectModel;
+ private static Dictionary abbreviationDict;
+ private SentenceModel sentdetectModel;
@BeforeAll
- public static void prepareResources() throws IOException {
- Dictionary abbreviationDict = loadAbbDictionary(Locale.GERMAN);
- SentenceDetectorFactory factory = new SentenceDetectorFactory(
- "deu", true, abbreviationDict, EOS_CHARS);
- sentdetectModel = train(factory, Locale.GERMAN);
- Assertions.assertNotNull(sentdetectModel);
- Assertions.assertEquals("deu", sentdetectModel.getLanguage());
+ static void loadResources() throws IOException {
+ abbreviationDict = loadAbbDictionary(Locale.GERMAN);
+ }
+
+ private void prepareResources(boolean useTokenEnd) {
+ try {
+ SentenceDetectorFactory factory = new SentenceDetectorFactory(
+ "deu", useTokenEnd, abbreviationDict, EOS_CHARS);
+ sentdetectModel = train(factory, Locale.GERMAN);
+
+ assertAll(() -> assertNotNull(sentdetectModel),
+ () -> assertEquals("deu", sentdetectModel.getLanguage()));
+ } catch (IOException ex) {
+ fail("Couldn't train the SentenceModel using test data. Exception: " +
ex.getMessage());
+ }
}
// Example taken from 'Sentences_DE.txt'
@Test
void testSentDetectWithInlineAbbreviationsEx1() {
+ prepareResources(true);
+
final String sent1 = "Ein Traum, zu dessen Bildung eine besonders starke
Verdichtung beigetragen, " +
- "wird für diese Untersuchung das günstigste Material sein.";
+ "wird für diese Untersuchung das günstigste Material sein.";
// Here we have two abbreviations "S. = Seite" and "ff. = folgende
(Plural)"
final String sent2 = "Ich wähle den auf S. 183 ff. mitgeteilten Traum von
der botanischen Monographie.";
SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
String sampleSentences = sent1 + " " + sent2;
String[] sents = sentDetect.sentDetect(sampleSentences);
- Assertions.assertEquals(2, sents.length);
- Assertions.assertEquals(sent1, sents[0]);
- Assertions.assertEquals(sent2, sents[1]);
double[] probs = sentDetect.getSentenceProbabilities();
- Assertions.assertEquals(2, probs.length);
+
+ assertAll(() -> assertEquals(2, sents.length),
+ () -> assertEquals(sent1, sents[0]),
+ () -> assertEquals(sent2, sents[1]),
+ () -> assertEquals(2, probs.length));
}
// Reduced example taken from 'Sentences_DE.txt'
@Test
void testSentDetectWithInlineAbbreviationsEx2() {
+ prepareResources(true);
+
// Here we have three abbreviations: "S. = Seite", "vgl. = vergleiche",
and "f. = folgende (Singular)"
final String sent1 = "Die farbige Tafel, die ich aufschlage, " +
- "geht (vgl. die Analyse S. 185 f.) auf ein neues Thema ein.";
+ "geht (vgl. die Analyse S. 185 f.) auf ein neues Thema ein.";
SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
String[] sents = sentDetect.sentDetect(sent1);
- Assertions.assertEquals(1, sents.length);
- Assertions.assertEquals(sent1, sents[0]);
double[] probs = sentDetect.getSentenceProbabilities();
- Assertions.assertEquals(1, probs.length);
+
+ assertAll(() -> assertEquals(1, sents.length),
+ () -> assertEquals(sent1, sents[0]),
+ () -> assertEquals(1, probs.length));
}
// Modified example deduced from 'Sentences_DE.txt'
@Test
void testSentDetectWithInlineAbbreviationsEx3() {
+ prepareResources(true);
+
// Here we have two abbreviations "z. B. = zum Beispiel" and "S. = Seite"
final String sent1 = "Die farbige Tafel, die ich aufschlage, " +
- "geht (z. B. die Analyse S. 185) auf ein neues Thema ein.";
+ "geht (z. B. die Analyse S. 185) auf ein neues Thema ein.";
SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
String[] sents = sentDetect.sentDetect(sent1);
- Assertions.assertEquals(1, sents.length);
- Assertions.assertEquals(sent1, sents[0]);
double[] probs = sentDetect.getSentenceProbabilities();
- Assertions.assertEquals(1, probs.length);
+
+ assertAll(() -> assertEquals(1, sents.length),
+ () -> assertEquals(sent1, sents[0]),
+ () -> assertEquals(1, probs.length));
+ }
+
+ @Test
+ void testSentDetectWithUseTokenEndFalse() {
+ prepareResources(false);
+
+ final String sent1 = "Träume sind eine Verbindung von Gedanken.";
+ final String sent2 = "Verschiedene Gedanken sind während der
Traumformation aktiv.";
+
+ SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
+ //There is no blank space before start of the second sentence.
+ String[] sents = sentDetect.sentDetect(sent1 + sent2);
+ double[] probs = sentDetect.getSentenceProbabilities();
+
+ assertAll(() -> assertEquals(2, sents.length),
+ () -> assertEquals(sent1, sents[0]),
+ () -> assertEquals(sent2, sents[1]),
+ () -> assertEquals(2, probs.length));
}
}
diff --git a/opennlp-docs/src/docbkx/sentdetect.xml
b/opennlp-docs/src/docbkx/sentdetect.xml
index 11b047d3..f73248ec 100644
--- a/opennlp-docs/src/docbkx/sentdetect.xml
+++ b/opennlp-docs/src/docbkx/sentdetect.xml
@@ -142,7 +142,10 @@ Arguments description:
-data sampleData
data to be used, usually a file name.
-encoding charsetName
- encoding for reading and writing text, if absent the system
default is used.]]>
+ encoding for reading and writing text, if absent the system
default is used.
+ -useTokenEnd boolean flag
+ set to false when the next sentence in the test dataset
doesn't start with a blank space post completion of
+ the previous sentence. If absent, it is defaulted to true.]]>
</screen>
To train an English sentence detector use the following command:
<screen>