(opennlp) branch main updated: OPENNLP-1745: SentenceDetector - Add Junit test for useTokenEnd = false (#792)

mawiesne Wed, 25 Jun 2025 13:49:18 -0700

This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git



The following commit(s) were added to refs/heads/main by this push:
     new eab70aa0 OPENNLP-1745: SentenceDetector - Add Junit test for useTokenEnd = false (#792)
eab70aa0 is described below

commit eab70aa07772076879ef10760c75ef7bbd451e0e
Author: NishantShri4 <[email protected]>
AuthorDate: Tue Jun 24 19:58:59 2025 +0100

    OPENNLP-1745: SentenceDetector - Add Junit test for useTokenEnd = false (#792)
    
    * OPENNLP-1745: SentenceDetector - Add Junit test for useTokenEnd = false
    
    * Added useTokenEnd to the list of optional params available for sentence detector tool.
---
 .../sentdetect/SentenceDetectorTrainerTool.java    |  2 +-
 .../tools/cmdline/sentdetect/TrainingParams.java   |  5 ++
 .../sentdetect/SentenceDetectorMEGermanTest.java   | 85 ++++++++++++++++------
 opennlp-docs/src/docbkx/sentdetect.xml             |  5 +-
 4 files changed, 72 insertions(+), 25 deletions(-)

diff --git a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java
index 933895bf..77d09baf 100644
--- a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java
+++ b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java
@@ -92,7 +92,7 @@ public final class SentenceDetectorTrainerTool
     try {
       Dictionary dict = loadDict(params.getAbbDict());
       SentenceDetectorFactory sdFactory = SentenceDetectorFactory.create(
-          params.getFactory(), params.getLang(), true, dict, eos);
+          params.getFactory(), params.getLang(), params.getUseTokenEnd(), dict, eos);
       model = SentenceDetectorME.train(params.getLang(), sampleStream,
           sdFactory, mlParams);
     } catch (IOException e) {
diff --git a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java
index 476f929a..37cb7115 100644
--- a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java
+++ b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java
@@ -44,4 +44,9 @@ interface TrainingParams extends BasicTrainingParams {
      description = "A sub-class of SentenceDetectorFactory where to get implementation and resources.")
   @OptionalParameter
   String getFactory();
+
+  @ParameterDescription(valueName = "useTokenEnd",
+      description = "A boolean parameter to detect the start index of the next sentence in the test data.")
+  @OptionalParameter(defaultValue = "true")
+  Boolean getUseTokenEnd();
 }
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
index a520ed27..7593100a 100644
--- a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMEGermanTest.java
@@ -20,12 +20,16 @@ package opennlp.tools.sentdetect;
 import java.io.IOException;
 import java.util.Locale;
 
-import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 
 import opennlp.tools.dictionary.Dictionary;
 
+import static org.junit.jupiter.api.Assertions.assertAll;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.fail;
+
 /**
  * Tests for the {@link SentenceDetectorME} class.
  * <p>
@@ -42,64 +46,99 @@ import opennlp.tools.dictionary.Dictionary;
 public class SentenceDetectorMEGermanTest extends AbstractSentenceDetectorTest {
 
   private static final char[] EOS_CHARS = {'.', '?', '!'};
-  
-  private static SentenceModel sentdetectModel;
+  private static Dictionary abbreviationDict;
+  private SentenceModel sentdetectModel;
 
   @BeforeAll
-  public static void prepareResources() throws IOException {
-    Dictionary abbreviationDict = loadAbbDictionary(Locale.GERMAN);
-    SentenceDetectorFactory factory = new SentenceDetectorFactory(
-            "deu", true, abbreviationDict, EOS_CHARS);
-    sentdetectModel = train(factory, Locale.GERMAN);
-    Assertions.assertNotNull(sentdetectModel);
-    Assertions.assertEquals("deu", sentdetectModel.getLanguage());
+  static void loadResources() throws IOException {
+    abbreviationDict = loadAbbDictionary(Locale.GERMAN);
+  }
+
+  private void prepareResources(boolean useTokenEnd) {
+    try {
+      SentenceDetectorFactory factory = new SentenceDetectorFactory(
+          "deu", useTokenEnd, abbreviationDict, EOS_CHARS);
+      sentdetectModel = train(factory, Locale.GERMAN);
+
+      assertAll(() -> assertNotNull(sentdetectModel),
+          () -> assertEquals("deu", sentdetectModel.getLanguage()));
+    } catch (IOException ex) {
+      fail("Couldn't train the SentenceModel using test data. Exception: " + ex.getMessage());
+    }
   }
 
   // Example taken from 'Sentences_DE.txt'
   @Test
   void testSentDetectWithInlineAbbreviationsEx1() {
+    prepareResources(true);
+
     final String sent1 = "Ein Traum, zu dessen Bildung eine besonders starke Verdichtung beigetragen, " +
-            "wird für diese Untersuchung das günstigste Material sein.";
+        "wird für diese Untersuchung das günstigste Material sein.";
     // Here we have two abbreviations "S. = Seite" and "ff. = folgende (Plural)"
     final String sent2 = "Ich wähle den auf S. 183 ff. mitgeteilten Traum von der botanischen Monographie.";
 
     SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
     String sampleSentences = sent1 + " " + sent2;
     String[] sents = sentDetect.sentDetect(sampleSentences);
-    Assertions.assertEquals(2, sents.length);
-    Assertions.assertEquals(sent1, sents[0]);
-    Assertions.assertEquals(sent2, sents[1]);
     double[] probs = sentDetect.getSentenceProbabilities();
-    Assertions.assertEquals(2, probs.length);
+
+    assertAll(() -> assertEquals(2, sents.length),
+        () -> assertEquals(sent1, sents[0]),
+        () -> assertEquals(sent2, sents[1]),
+        () -> assertEquals(2, probs.length));
   }
 
   // Reduced example taken from 'Sentences_DE.txt'
   @Test
   void testSentDetectWithInlineAbbreviationsEx2() {
+    prepareResources(true);
+
     // Here we have three abbreviations: "S. = Seite", "vgl. = vergleiche", and "f. = folgende (Singular)"
     final String sent1 = "Die farbige Tafel, die ich aufschlage, " +
-            "geht (vgl. die Analyse S. 185 f.) auf ein neues Thema ein.";
+        "geht (vgl. die Analyse S. 185 f.) auf ein neues Thema ein.";
 
     SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
     String[] sents = sentDetect.sentDetect(sent1);
-    Assertions.assertEquals(1, sents.length);
-    Assertions.assertEquals(sent1, sents[0]);
     double[] probs = sentDetect.getSentenceProbabilities();
-    Assertions.assertEquals(1, probs.length);
+
+    assertAll(() -> assertEquals(1, sents.length),
+        () -> assertEquals(sent1, sents[0]),
+        () -> assertEquals(1, probs.length));
   }
 
   // Modified example deduced from 'Sentences_DE.txt'
   @Test
   void testSentDetectWithInlineAbbreviationsEx3() {
+    prepareResources(true);
+
     // Here we have two abbreviations "z. B. = zum Beispiel" and "S. = Seite"
     final String sent1 = "Die farbige Tafel, die ich aufschlage, " +
-            "geht (z. B. die Analyse S. 185) auf ein neues Thema ein.";
+        "geht (z. B. die Analyse S. 185) auf ein neues Thema ein.";
 
     SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
     String[] sents = sentDetect.sentDetect(sent1);
-    Assertions.assertEquals(1, sents.length);
-    Assertions.assertEquals(sent1, sents[0]);
     double[] probs = sentDetect.getSentenceProbabilities();
-    Assertions.assertEquals(1, probs.length);
+
+    assertAll(() -> assertEquals(1, sents.length),
+        () -> assertEquals(sent1, sents[0]),
+        () -> assertEquals(1, probs.length));
+  }
+
+  @Test
+  void testSentDetectWithUseTokenEndFalse() {
+    prepareResources(false);
+
+    final String sent1 = "Träume sind eine Verbindung von Gedanken.";
+    final String sent2 = "Verschiedene Gedanken sind während der Traumformation aktiv.";
+
+    SentenceDetectorME sentDetect = new SentenceDetectorME(sentdetectModel);
+    //There is no blank space before start of the second sentence.
+    String[] sents = sentDetect.sentDetect(sent1 + sent2);
+    double[] probs = sentDetect.getSentenceProbabilities();
+
+    assertAll(() -> assertEquals(2, sents.length),
+        () -> assertEquals(sent1, sents[0]),
+        () -> assertEquals(sent2, sents[1]),
+        () -> assertEquals(2, probs.length));
   }
 }
diff --git a/opennlp-docs/src/docbkx/sentdetect.xml b/opennlp-docs/src/docbkx/sentdetect.xml
index 11b047d3..f73248ec 100644
--- a/opennlp-docs/src/docbkx/sentdetect.xml
+++ b/opennlp-docs/src/docbkx/sentdetect.xml
@@ -142,7 +142,10 @@ Arguments description:
         -data sampleData
                 data to be used, usually a file name.
         -encoding charsetName
-                encoding for reading and writing text, if absent the system default is used.]]>
+                encoding for reading and writing text, if absent the system default is used.
+        -useTokenEnd boolean flag
+                set to false when the next sentence in the test dataset doesn't start with a blank space post completion of
+                the previous sentence. If absent, it is defaulted to true.]]>
        </screen>
                To train an English sentence detector use the following command:
         <screen>

(opennlp) branch main updated: OPENNLP-1745: SentenceDetector - Add Junit test for useTokenEnd = false (#792)

Reply via email to