i18npool/qa/cppunit/test_breakiterator.cxx          |   22 +++++++
 i18npool/source/breakiterator/breakiterator_cjk.cxx |   61 ++++++++++++++++----
 2 files changed, 73 insertions(+), 10 deletions(-)

New commits:
commit 02fb87b389f0a4db74b149c790ea1b83af177dd4
Author:     Saburo Yoshida <[email protected]>
AuthorDate: Sat Jan 31 06:42:12 2026 +0900
Commit:     Xisco Fauli <[email protected]>
CommitDate: Thu Feb 5 12:30:51 2026 +0100

    tdf#169590 i18npool: Fix incorrect line breaking in CJK
    
    Since commit 5a03d511f46ecc05aab35bb29e714b46f5638b1b,
    ICU line breaking rules are honored and the Asian typography options
    for paragraph formats are ignored.
    Respect the cases where the user applies a forbidden character list
    that differs from the default one, or applies no forbidden character
    list at all.
    
    Change-Id: I7d021c1dcb1e9722eb5f7ae6295b949641dcfe01
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/198442
    Tested-by: Jenkins
    Reviewed-by: Jonathan Clark <[email protected]>
    (cherry picked from commit b643c5c0ace3af80e2126a05f4df366d4a3166b3)
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/198740
    Reviewed-by: Xisco Fauli <[email protected]>

diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx 
b/i18npool/qa/cppunit/test_breakiterator.cxx
index c91532d5ff3a..e0c333f0ca70 100644
--- a/i18npool/qa/cppunit/test_breakiterator.cxx
+++ b/i18npool/qa/cppunit/test_breakiterator.cxx
@@ -401,9 +401,15 @@ void TestBreakIterator::testLineBreaking()
 
         {
             const OUString str = u"word word、word word"_ustr;
+            aUserOptions.applyForbiddenRules = true; // tdf#169590
             i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
                 str, strlen("word wordXwor"), aLocale, 0, aHyphOptions, 
aUserOptions);
             CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(10), 
aResult.breakIndex);
+
+            aUserOptions.applyForbiddenRules = false; // tdf#169590
+            i18n::LineBreakResults aResult2 = m_xBreak->getLineBreak(
+                str, strlen("word wordXwor"), aLocale, 0, aHyphOptions, 
aUserOptions);
+            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(13), 
aResult2.breakIndex);
         }
     }
 
@@ -1682,10 +1688,19 @@ void TestBreakIterator::testChinese()
 
         // Comma normally not allowed at start of line, quote normally not 
allowed at end
         auto aTest = u"水水水、水水水「水水水水水水水水水"_ustr;
+        stUserOptions.applyForbiddenRules = true; // tdf#169590
+        stUserOptions.forbiddenBeginCharacters = u"、"_ustr;
+        stUserOptions.forbiddenEndCharacters = u"「"_ustr;
         auto stBreak1 = m_xBreak->getLineBreak(aTest, 3, stLocale, 0, 
stHyphOptions, stUserOptions);
         CPPUNIT_ASSERT_EQUAL(sal_Int32(2), stBreak1.breakIndex);
         auto stBreak2 = m_xBreak->getLineBreak(aTest, 8, stLocale, 0, 
stHyphOptions, stUserOptions);
         CPPUNIT_ASSERT_EQUAL(sal_Int32(7), stBreak2.breakIndex);
+
+        stUserOptions.applyForbiddenRules = false; // tdf#169590
+        auto stBreak3 = m_xBreak->getLineBreak(aTest, 3, stLocale, 0, 
stHyphOptions, stUserOptions);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(3), stBreak3.breakIndex);
+        auto stBreak4 = m_xBreak->getLineBreak(aTest, 8, stLocale, 0, 
stHyphOptions, stUserOptions);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(8), stBreak4.breakIndex);
     }
 
     // tdf#117554 Do not break at ZWNBSP
@@ -1742,6 +1757,9 @@ void TestBreakIterator::testChinese()
 
         auto aTest = u"例例\u201C例例例例\u201D例例"_ustr;
 
+        stUserOptions.applyForbiddenRules = true; // tdf#169590
+        stUserOptions.forbiddenBeginCharacters = u"\u201D"_ustr;
+        stUserOptions.forbiddenEndCharacters = u"\u201C"_ustr;
         // Break opportunities should be outside of the quotation marks.
         // U+201C is a left quotation mark, so the break opportunity should be 
to the left:
         auto stBreak1 = m_xBreak->getLineBreak(aTest, 3, stLocale, 0, 
stHyphOptions, stUserOptions);
@@ -1750,6 +1768,10 @@ void TestBreakIterator::testChinese()
         // U+201D is a right quotation mark, so the break opportunity should 
be to the right:
         auto stBreak2 = m_xBreak->getLineBreak(aTest, 8, stLocale, 0, 
stHyphOptions, stUserOptions);
         CPPUNIT_ASSERT_EQUAL(sal_Int32(8), stBreak2.breakIndex);
+
+        stUserOptions.applyForbiddenRules = false; // tdf#169590
+        auto stBreak3 = m_xBreak->getLineBreak(aTest, 3, stLocale, 0, 
stHyphOptions, stUserOptions);
+        CPPUNIT_ASSERT_EQUAL(sal_Int32(3), stBreak3.breakIndex);
     }
 }
 
diff --git a/i18npool/source/breakiterator/breakiterator_cjk.cxx 
b/i18npool/source/breakiterator/breakiterator_cjk.cxx
index f9b1b7e46199..9f4a89b7fe2d 100644
--- a/i18npool/source/breakiterator/breakiterator_cjk.cxx
+++ b/i18npool/source/breakiterator/breakiterator_cjk.cxx
@@ -26,8 +26,8 @@
 using namespace ::com::sun::star::i18n;
 using namespace ::com::sun::star::lang;
 
-namespace i18npool {
-
+namespace i18npool
+{
 //      ----------------------------------------------------
 //      class BreakIterator_CJK
 //      ----------------------------------------------------;
@@ -37,12 +37,13 @@ BreakIterator_CJK::BreakIterator_CJK()
     cBreakIterator = u"com.sun.star.i18n.BreakIterator_CJK"_ustr;
 }
 
-namespace {
-bool isHangul( sal_Unicode cCh )
+namespace
+{
+bool isHangul(sal_Unicode cCh)
 {
-    return (cCh >= 0xAC00 && cCh <= 0xD7AF) || (cCh >= 0x1100 && cCh <= 
0x11FF) ||
-           (cCh >= 0xA960 && cCh <= 0xA97F) || (cCh >= 0xD7B0 && cCh <= 
0xD7FF) ||
-           (cCh >= 0x3130 && cCh <= 0x318F);
+    return (cCh >= 0xAC00 && cCh <= 0xD7AF) || (cCh >= 0x1100 && cCh <= 0x11FF)
+           || (cCh >= 0xA960 && cCh <= 0xA97F) || (cCh >= 0xD7B0 && cCh <= 
0xD7FF)
+           || (cCh >= 0x3130 && cCh <= 0x318F);
 }
 }
 
@@ -51,13 +52,26 @@ LineBreakResults SAL_CALL BreakIterator_CJK::getLineBreak(
     sal_Int32 nMinBreakPos, const LineBreakHyphenationOptions& hOptions,
     const LineBreakUserOptions& bOptions)
 {
-    auto fnIsForbiddenBreak = [&](sal_Int32 nBreakPos)
-    {
+    auto fnIsForbiddenBreak = [&](sal_Int32 nBreakPos) {
         return nBreakPos > 0
                && (bOptions.forbiddenBeginCharacters.indexOf(Text[nBreakPos]) 
!= -1
                    || bOptions.forbiddenEndCharacters.indexOf(Text[nBreakPos - 
1]) != -1);
     };
 
+    auto fnIsNonBreak = [](sal_Unicode cChB) {
+        switch (cChB)
+        {
+            case 0x00A0: // No-break Space
+            case 0x2011: // Non-breaking Hyphen
+            case 0x202F: // Narrow No-break Space
+            case 0x2060: // Word Joiner
+            case 0xFEFF: // Zero Width No-Break Space (ZWNBSP)
+                return true;
+            default:
+                return false;
+        }
+    };
+
     while (nStartPos > 0 && nStartPos < Text.getLength())
     {
         // Apply hanging punctuation
@@ -105,6 +119,13 @@ LineBreakResults SAL_CALL BreakIterator_CJK::getLineBreak(
                 nStartPos = nOldStartPos;
         }
 
+        // Formatting Marks
+        while (nStartPos > 0
+               && (fnIsNonBreak(Text[nStartPos]) || 
fnIsNonBreak(Text[nStartPos - 1])))
+        {
+            Text.iterateCodePoints(&nStartPos, -1);
+        }
+
         // tdf#130592: Fall back to the ICU breakiterator after applying 
CJK-specific rules
         auto stBreak = BreakIterator_Unicode::getLineBreak(Text, nStartPos, 
rLocale, nMinBreakPos,
                                                            hOptions, bOptions);
@@ -113,6 +134,27 @@ LineBreakResults SAL_CALL BreakIterator_CJK::getLineBreak(
             // Located break is valid under both iterators
             return stBreak;
         }
+        if (nStartPos > stBreak.breakIndex)
+        {
+            // tdf#169590 Clarify the case where bOptions.applyForbiddenRules 
is False
+            if (!bOptions.applyForbiddenRules)
+            {
+                stBreak.breakIndex = nStartPos;
+                stBreak.breakType = BreakType::WORDBOUNDARY;
+                return stBreak;
+            }
+
+            // Respect user changes to forbidden character list
+            sal_Int32 nTempPos = stBreak.breakIndex;
+            Text.iterateCodePoints(&nTempPos);
+            if (nTempPos == nStartPos && !fnIsForbiddenBreak(nStartPos)
+                && !fnIsNonBreak(Text[nStartPos]))
+            {
+                stBreak.breakIndex = nStartPos;
+                stBreak.breakType = BreakType::WORDBOUNDARY;
+                return stBreak;
+            }
+        }
 
         // CJK break is not valid; restart search from the next candidate
         sal_Int32 nNextCandidate = stBreak.breakIndex;
@@ -174,7 +216,6 @@ BreakIterator_ko::BreakIterator_ko()
     hangingCharacters = 
LocaleDataImpl::get()->getHangingCharacters(LOCALE(u"ko"_ustr, u"KR"_ustr));
     cBreakIterator = u"com.sun.star.i18n.BreakIterator_ko"_ustr;
 }
-
 }
 
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */

Reply via email to