i18npool/qa/cppunit/test_breakiterator.cxx              |  249 +++++++++++++++-
 i18npool/source/breakiterator/breakiterator_unicode.cxx |   12 
 2 files changed, 249 insertions(+), 12 deletions(-)

New commits:
commit 9a14a0fd8b4227b5d08b3154cddca46f82ec2a03
Author:     Jonathan Clark <[email protected]>
AuthorDate: Mon Dec 2 16:03:43 2024 -0700
Commit:     Jonathan Clark <[email protected]>
CommitDate: Tue Dec 3 01:29:20 2024 +0100

    tdf#162912 i18npool: Updated CJK BreakIterator to use custom rules
    
    Regression from commit 14c6cde779d64596eab0f4d3f32f181ce2243929:
      "tdf#49885 Updated CJK BreakIterator to use ICU"
    
    Previously, languages requiring dictionary-based break iterators were
    handled by instantiating a stock ICU break iterator as a special case.
    tdf#49885 upgraded our custom rules to support passthrough for
    dictionary-based breaking, so this special case is no longer necessary.
    
    Change-Id: Iebb06de82eb511946e5b220e5dc414440838b03c
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/177713
    Tested-by: Jenkins
    Reviewed-by: Jonathan Clark <[email protected]>

diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx
index 24666ca4ac80..80bdeb15c7be 100644
--- a/i18npool/qa/cppunit/test_breakiterator.cxx
+++ b/i18npool/qa/cppunit/test_breakiterator.cxx
@@ -49,6 +49,7 @@ public:
     void testHebrewGereshGershaim();
     void testLegacySurrogatePairs();
     void testWordCount();
+    void testDictionaryIteratorLanguages();
 
     CPPUNIT_TEST_SUITE(TestBreakIterator);
     CPPUNIT_TEST(testLineBreaking);
@@ -70,6 +71,7 @@ public:
     CPPUNIT_TEST(testHebrewGereshGershaim);
     CPPUNIT_TEST(testLegacySurrogatePairs);
     CPPUNIT_TEST(testWordCount);
+    CPPUNIT_TEST(testDictionaryIteratorLanguages);
     CPPUNIT_TEST_SUITE_END();
 
 private:
@@ -1612,6 +1614,25 @@ void TestBreakIterator::doTestJapanese(uno::Reference< i18n::XBreakIterator > co
         CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
         CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
     }
+
+    {
+        // tdf#162912: Double-clicking should only select one Basic identifier
+        static constexpr OUString aTest = 
u"ThisComponent.CurrentSelection"_ustr;
+
+        aBounds = xBreak->getWordBoundary(aTest, 5, aLocale, 
i18n::WordType::DICTIONARY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
+
+        aBounds = xBreak->getWordBoundary(aTest, 5, aLocale,
+                                          
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.endPos);
+
+        aBounds = xBreak->getWordBoundary(aTest, 15, aLocale,
+                                          
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
+    }
 }
 
 void TestBreakIterator::testJapanese()
@@ -1914,7 +1935,7 @@ void TestBreakIterator::testWordCount()
 
         const OUString aStr = u"Wordの様にワード数をするのにTest
植松町"_ustr;
 
-        CPPUNIT_ASSERT_EQUAL(7, fnCountWords(aStr, aLocale));
+        CPPUNIT_ASSERT_EQUAL(8, fnCountWords(aStr, aLocale));
     }
 
     // tdf#150621 Korean words should be counted individually, rather than by syllable.
@@ -1941,6 +1962,232 @@ void TestBreakIterator::testWordCount()
     }
 }
 
+void TestBreakIterator::testDictionaryIteratorLanguages()
+{
+    // Thai
+    {
+        lang::Locale aLocale{ "th", "TH", "" };
+
+        const OUString aStr = u"รอนานหรือเปล่า"_ustr;
+
+        i18n::Boundary aBounds;
+
+        aBounds
+            = m_xBreak->getWordBoundary(aStr, 1, aLocale, 
i18n::WordType::DICTIONARY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
+
+        aBounds
+            = m_xBreak->getWordBoundary(aStr, 3, aLocale, 
i18n::WordType::DICTIONARY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
+
+        aBounds
+            = m_xBreak->getWordBoundary(aStr, 6, aLocale, 
i18n::WordType::DICTIONARY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
+
+        aBounds
+            = m_xBreak->getWordBoundary(aStr, 10, aLocale, 
i18n::WordType::DICTIONARY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, 
i18n::WordType::ANY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale, 
i18n::WordType::ANY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale, 
i18n::WordType::ANY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 10, aLocale, 
i18n::WordType::ANY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale,
+                                            
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale,
+                                            
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale,
+                                            
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 10, aLocale,
+                                            
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos);
+    }
+
+    // Japanese
+    {
+        lang::Locale aLocale{ "ja", "JP", "" };
+
+        const OUString aStr = u"通産省工業技術院北海道"_ustr;
+
+        i18n::Boundary aBounds;
+
+        aBounds
+            = m_xBreak->getWordBoundary(aStr, 1, aLocale, 
i18n::WordType::DICTIONARY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
+
+        aBounds
+            = m_xBreak->getWordBoundary(aStr, 2, aLocale, 
i18n::WordType::DICTIONARY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
+
+        aBounds
+            = m_xBreak->getWordBoundary(aStr, 4, aLocale, 
i18n::WordType::DICTIONARY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
+
+        aBounds
+            = m_xBreak->getWordBoundary(aStr, 6, aLocale, 
i18n::WordType::DICTIONARY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos);
+
+        aBounds
+            = m_xBreak->getWordBoundary(aStr, 7, aLocale, 
i18n::WordType::DICTIONARY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos);
+
+        aBounds
+            = m_xBreak->getWordBoundary(aStr, 9, aLocale, 
i18n::WordType::DICTIONARY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, 
i18n::WordType::ANY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 2, aLocale, 
i18n::WordType::ANY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 4, aLocale, 
i18n::WordType::ANY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale, 
i18n::WordType::ANY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 7, aLocale, 
i18n::WordType::ANY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 9, aLocale, 
i18n::WordType::ANY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale,
+                                            
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 2, aLocale,
+                                            
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 4, aLocale,
+                                            
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale,
+                                            
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 7, aLocale,
+                                            
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 9, aLocale,
+                                            
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos);
+    }
+
+    // Chinese
+    {
+        lang::Locale aLocale{ "zh", "CN", "" };
+
+        const OUString aStr = u"很高兴认识你"_ustr;
+
+        i18n::Boundary aBounds;
+
+        aBounds
+            = m_xBreak->getWordBoundary(aStr, 0, aLocale, 
i18n::WordType::DICTIONARY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos);
+
+        aBounds
+            = m_xBreak->getWordBoundary(aStr, 1, aLocale, 
i18n::WordType::DICTIONARY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
+
+        aBounds
+            = m_xBreak->getWordBoundary(aStr, 3, aLocale, 
i18n::WordType::DICTIONARY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
+
+        aBounds
+            = m_xBreak->getWordBoundary(aStr, 5, aLocale, 
i18n::WordType::DICTIONARY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 0, aLocale, 
i18n::WordType::ANY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, 
i18n::WordType::ANY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale, 
i18n::WordType::ANY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 5, aLocale, 
i18n::WordType::ANY_WORD, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 0, aLocale,
+                                            
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale,
+                                            
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale,
+                                            
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
+
+        aBounds = m_xBreak->getWordBoundary(aStr, 5, aLocale,
+                                            
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
+    }
+}
+
 void TestBreakIterator::setUp()
 {
     BootstrapFixtureBase::setUp();
diff --git a/i18npool/source/breakiterator/breakiterator_unicode.cxx b/i18npool/source/breakiterator/breakiterator_unicode.cxx
index 5992b6144b0b..4e5df75d2701 100644
--- a/i18npool/source/breakiterator/breakiterator_unicode.cxx
+++ b/i18npool/source/breakiterator/breakiterator_unicode.cxx
@@ -74,16 +74,6 @@ class OOoRuleBasedBreakIterator : public icu::RuleBasedBreakIterator
 
 };
 
-bool locale_requires_dictionary_iterator(const css::lang::Locale& rLocale)
-{
-    return rLocale.Language == "bo" || // Tibetan
-           rLocale.Language == "dz" || // Dzongkha
-           rLocale.Language == "ja" || // Japanese
-           rLocale.Language == "km" || // Khmer
-           rLocale.Language == "lo" || // Lao
-           rLocale.Language == "th" || // Thai
-           rLocale.Language == "zh"; // Chinese
-}
 }
 
 // loading ICU breakiterator on demand.
@@ -189,7 +179,7 @@ void BreakIterator_Unicode::loadICUBreakIterator(const css::lang::Locale& rLocal
                         rbi.reset();
                     }
                 }
-                else if(!locale_requires_dictionary_iterator(rLocale))
+                else
                 {
                     // language;rule (not langtag, unless we'd actually load such)
                     OString aLanguage( LanguageTag( 
rLocale).getLanguage().toUtf8());

Reply via email to