Richard Zowalla created OPENNLP-1811:
----------------------------------------

             Summary: SentenceDetector fails to split multi-letter abbreviation at non-first sentence start without spacing
                 Key: OPENNLP-1811
                 URL: https://issues.apache.org/jira/browse/OPENNLP-1811
             Project: OpenNLP
          Issue Type: Bug
          Components: Sentence Detector
    Affects Versions: 3.0.0-M1, 2.5.7
            Reporter: Richard Zowalla


 
{{/*
   * Edge case: Multi-letter abbreviation at the start of a non-first sentence
   * with \{@code useTokenEnd = false} (no space between sentences).
   *
   * Reproduction case for OPENNLP-1811: two German sentences are concatenated
   * without a separating space, and the second one begins with the
   * multi-letter abbreviation "Bek.". The detector is expected to return
   * exactly the two original sentences.
   */
  @Test
  void testSentDetectWithMultiLetterAbbreviationAtNonFirstSentenceStart() \{
    // Presumably sets up the detector with useTokenEnd = false, as described
    // in the comment above; the fixture is not shown here, so confirm there.
    prepareResources(false);

    // sent1 ends with a regular full stop; sent2 starts with the abbreviation.
    final String sent1 = "Träume sind eine Verbindung von Gedanken.";
    final String sent2 = "Bek. Problem: Schlafmangel.";

    // No space between sentences (useTokenEnd=false supports this)
    String sampleSentences = sent1 + sent2;
    String[] sents = sentenceDetector.sentDetect(sampleSentences);
    // Read after sentDetect(...) so the values belong to the call above.
    double[] probs = sentenceDetector.probs();

    // Grouped so that every failing expectation is reported, not only the first.
    // Expected: the input is split back into sent1 and sent2 unchanged, with
    // one probability per detected sentence.
    assertAll(
        () -> assertEquals(2, sents.length),
        () -> assertEquals(sent1, sents[0]),
        () -> assertEquals(sent2, sents[1]),
        () -> assertEquals(2, probs.length));
  }}}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to