i18npool/qa/cppunit/test_breakiterator.cxx | 45 +++++++++++ i18npool/source/breakiterator/data/dict_word.txt | 21 ++++- i18npool/source/breakiterator/data/dict_word_prepostdash.txt | 9 +- 3 files changed, 71 insertions(+), 4 deletions(-)
New commits: commit f4fe6df6aa92573368c3fa0edb9fd03e64d9d059 Author: Jonathan Clark <[email protected]> AuthorDate: Thu Nov 28 12:47:02 2024 -0700 Commit: Jonathan Clark <[email protected]> CommitDate: Fri Nov 29 18:40:51 2024 +0100 tdf#162514 i18npool: Handle abbreviations in dictionary breakiterator Restores abbreviation handling to spell checking. Regression from commit 44699b3de37f07090ac6fee1cd97aa76036e9700 "tdf#49885 BreakIterator rule upgrades". Change-Id: I2883f984952aa3e54cfe800590a16c0de74ae0e4 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/177506 Reviewed-by: Jonathan Clark <[email protected]> Tested-by: Jenkins diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx index e56089ad0c28..9d9712e54e71 100644 --- a/i18npool/qa/cppunit/test_breakiterator.cxx +++ b/i18npool/qa/cppunit/test_breakiterator.cxx @@ -48,6 +48,7 @@ public: void testLegacyDictWordPrepostDash_nds_DE(); void testLegacyDictWordPrepostDash_nl_NL(); void testLegacyDictWordPrepostDash_sv_SE(); + void testDictWordAbbreviation(); void testHebrewGereshGershaim(); void testLegacySurrogatePairs(); void testWordCount(); @@ -71,6 +72,7 @@ public: CPPUNIT_TEST(testLegacyDictWordPrepostDash_nds_DE); CPPUNIT_TEST(testLegacyDictWordPrepostDash_nl_NL); CPPUNIT_TEST(testLegacyDictWordPrepostDash_sv_SE); + CPPUNIT_TEST(testDictWordAbbreviation); CPPUNIT_TEST(testHebrewGereshGershaim); CPPUNIT_TEST(testLegacySurrogatePairs); CPPUNIT_TEST(testWordCount); @@ -1666,6 +1668,49 @@ void TestBreakIterator::testLegacyDictWordPrepostDash_de_DE() } } +void TestBreakIterator::testDictWordAbbreviation() +{ + std::vector<lang::Locale> aLocale{ + { "en", "US", "" }, // dict_word locale + { "de", "DE", "" } // dict_word_prepostdash locale + }; + + for (const auto& rLocale : aLocale) + { + auto aTest = u"Examples: e.g. i.e. etc. and such"_ustr; + + i18n::Boundary aBounds + = m_xBreak->getWordBoundary(aTest, 3, rLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aTest, 10, rLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aTest, 15, rLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(15), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aTest, 20, rLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(20), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aTest, 26, rLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(28), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aTest, 30, rLocale, i18n::WordType::DICTIONARY_WORD, false); + CPPUNIT_ASSERT_EQUAL(sal_Int32(29), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(33), aBounds.endPos); + } +} + void TestBreakIterator::testLegacyDictWordPrepostDash_nds_DE() { lang::Locale aLocale; diff --git a/i18npool/source/breakiterator/data/dict_word.txt b/i18npool/source/breakiterator/data/dict_word.txt index 4a09af5cf1b2..849b2fe29205 100644 --- a/i18npool/source/breakiterator/data/dict_word.txt +++ b/i18npool/source/breakiterator/data/dict_word.txt @@ -70,6 +70,9 @@ $ExcludedML = [[:name = COLON:] [:name = SMALL COLON:] [:name = FULLWIDTH COLON:]]; +### tdf#162514: For spell checking, abbreviations may end with a period. +$PostPeriod = [:name = FULL STOP:]; + # $MidLetter = [\p{Word_Break = MidLetter}]; $MidLetter = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML]; @@ -140,10 +143,24 @@ $Ideographic $ExFm* {400}; # # rule 5 # Do not break between most letters. # -($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); + +### BEGIN CUSTOMIZATION +### tdf#162514: For spell checking, abbreviations may end with a period. + +# ($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); +($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter) ($PostPeriod)?; + +### END CUSTOMIZATION # rule 6 and 7 -($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; + +### BEGIN CUSTOMIZATION +### tdf#162514: For spell checking, abbreviations may end with a period. + +# ($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; +($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) ($PostPeriod)? {200}; + +### END CUSTOMIZATION # rule 7a $Hebrew_Letter $ExFm* $Single_Quote {200}; diff --git a/i18npool/source/breakiterator/data/dict_word_prepostdash.txt b/i18npool/source/breakiterator/data/dict_word_prepostdash.txt index b39503d1b405..6051c149d23f 100644 --- a/i18npool/source/breakiterator/data/dict_word_prepostdash.txt +++ b/i18npool/source/breakiterator/data/dict_word_prepostdash.txt @@ -82,6 +82,9 @@ $MidLetter = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML]; $PrePostHyphen = [:name = HYPHEN-MINUS:]; +### tdf#162514: For spell checking, abbreviations may end with a period. +$PostPeriod = [:name = FULL STOP:]; + ### END CUSTOMIZATION $Hiragana = [:Hiragana:]; @@ -148,9 +151,10 @@ $Ideographic $ExFm* {400}; # ### BEGIN CUSTOMIZATION ### Unknown issue number: Allow leading and trailing hyphens in certain languages +### tdf#162514: For spell checking, abbreviations may end with a period. # ($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter); -($PrePostHyphen) ? ($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter) ($PrePostHyphen)?; +($PrePostHyphen) ? ($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter) ($PrePostHyphen | $PostPeriod)?; ### END CUSTOMIZATION @@ -158,9 +162,10 @@ $Ideographic $ExFm* {400}; # ### BEGIN CUSTOMIZATION ### Unknown issue number: Allow leading and trailing hyphens in certain languages +### tdf#162514: For spell checking, abbreviations may end with a period. # ($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200}; -($PrePostHyphen)? ($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) ($PrePostHyphen)? {200}; +($PrePostHyphen)? ($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) ($PrePostHyphen | $PostPeriod)? {200}; ### END CUSTOMIZATION
