i18npool/qa/cppunit/test_breakiterator.cxx | 22 +++++++ i18npool/source/breakiterator/breakiterator_cjk.cxx | 61 ++++++++++++++++---- 2 files changed, 73 insertions(+), 10 deletions(-)
New commits: commit 2af7e0aa417bbb83a2180d581f153f09dc95ac82 Author: Saburo Yoshida <[email protected]> AuthorDate: Sat Jan 31 06:42:12 2026 +0900 Commit: Xisco Fauli <[email protected]> CommitDate: Thu Feb 5 12:36:38 2026 +0100 tdf#169590 i18npool: Fix incorrect line breaking in CJK Since 5a03d511f46ecc05aab35bb29e714b46f5638b1b commit, ICU line breaking rules are now honored and Asian typography options for paragraph formats are now ignored. Respect cases where users apply a list that is different from the default forbidden character list, or cases where no forbidden character list is applied. Change-Id: I7d021c1dcb1e9722eb5f7ae6295b949641dcfe01 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/198442 Tested-by: Jenkins Reviewed-by: Jonathan Clark <[email protected]> (cherry picked from commit b643c5c0ace3af80e2126a05f4df366d4a3166b3) Reviewed-on: https://gerrit.libreoffice.org/c/core/+/198739 Reviewed-by: Xisco Fauli <[email protected]> diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx index c91532d5ff3a..e0c333f0ca70 100644 --- a/i18npool/qa/cppunit/test_breakiterator.cxx +++ b/i18npool/qa/cppunit/test_breakiterator.cxx @@ -401,9 +401,15 @@ void TestBreakIterator::testLineBreaking() { const OUString str = u"word word、word word"_ustr; + aUserOptions.applyForbiddenRules = true; // tdf#169590 i18n::LineBreakResults aResult = m_xBreak->getLineBreak( str, strlen("word wordXwor"), aLocale, 0, aHyphOptions, aUserOptions); CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(10), aResult.breakIndex); + + aUserOptions.applyForbiddenRules = false; // tdf#169590 + i18n::LineBreakResults aResult2 = m_xBreak->getLineBreak( + str, strlen("word wordXwor"), aLocale, 0, aHyphOptions, aUserOptions); + CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(13), aResult2.breakIndex); } } @@ -1682,10 +1688,19 @@ void TestBreakIterator::testChinese() // Comma normally not allowed at start of line, quote normally not allowed at end auto aTest = 
u"水水水、水水水「水水水水水水水水水"_ustr; + stUserOptions.applyForbiddenRules = true; // tdf#169590 + stUserOptions.forbiddenBeginCharacters = u"、"_ustr; + stUserOptions.forbiddenEndCharacters = u"「"_ustr; auto stBreak1 = m_xBreak->getLineBreak(aTest, 3, stLocale, 0, stHyphOptions, stUserOptions); CPPUNIT_ASSERT_EQUAL(sal_Int32(2), stBreak1.breakIndex); auto stBreak2 = m_xBreak->getLineBreak(aTest, 8, stLocale, 0, stHyphOptions, stUserOptions); CPPUNIT_ASSERT_EQUAL(sal_Int32(7), stBreak2.breakIndex); + + stUserOptions.applyForbiddenRules = false; // tdf#169590 + auto stBreak3 = m_xBreak->getLineBreak(aTest, 3, stLocale, 0, stHyphOptions, stUserOptions); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), stBreak3.breakIndex); + auto stBreak4 = m_xBreak->getLineBreak(aTest, 8, stLocale, 0, stHyphOptions, stUserOptions); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), stBreak4.breakIndex); } // tdf#117554 Do not break at ZWNBSP @@ -1742,6 +1757,9 @@ void TestBreakIterator::testChinese() auto aTest = u"例例\u201C例例例例\u201D例例"_ustr; + stUserOptions.applyForbiddenRules = true; // tdf#169590 + stUserOptions.forbiddenBeginCharacters = u"\u201D"_ustr; + stUserOptions.forbiddenEndCharacters = u"\u201C"_ustr; // Break opportunities should be outside of the quotation marks. 
// U+201C is a left quotation mark, so the break opportunity should be to the left: auto stBreak1 = m_xBreak->getLineBreak(aTest, 3, stLocale, 0, stHyphOptions, stUserOptions); @@ -1750,6 +1768,10 @@ void TestBreakIterator::testChinese() // U+201D is a right quotation mark, so the break opportunity should be to the right: auto stBreak2 = m_xBreak->getLineBreak(aTest, 8, stLocale, 0, stHyphOptions, stUserOptions); CPPUNIT_ASSERT_EQUAL(sal_Int32(8), stBreak2.breakIndex); + + stUserOptions.applyForbiddenRules = false; // tdf#169590 + auto stBreak3 = m_xBreak->getLineBreak(aTest, 3, stLocale, 0, stHyphOptions, stUserOptions); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), stBreak3.breakIndex); } } diff --git a/i18npool/source/breakiterator/breakiterator_cjk.cxx b/i18npool/source/breakiterator/breakiterator_cjk.cxx index f9b1b7e46199..9f4a89b7fe2d 100644 --- a/i18npool/source/breakiterator/breakiterator_cjk.cxx +++ b/i18npool/source/breakiterator/breakiterator_cjk.cxx @@ -26,8 +26,8 @@ using namespace ::com::sun::star::i18n; using namespace ::com::sun::star::lang; -namespace i18npool { - +namespace i18npool +{ // ---------------------------------------------------- // class BreakIterator_CJK // ----------------------------------------------------; @@ -37,12 +37,13 @@ BreakIterator_CJK::BreakIterator_CJK() cBreakIterator = u"com.sun.star.i18n.BreakIterator_CJK"_ustr; } -namespace { -bool isHangul( sal_Unicode cCh ) +namespace +{ +bool isHangul(sal_Unicode cCh) { - return (cCh >= 0xAC00 && cCh <= 0xD7AF) || (cCh >= 0x1100 && cCh <= 0x11FF) || - (cCh >= 0xA960 && cCh <= 0xA97F) || (cCh >= 0xD7B0 && cCh <= 0xD7FF) || - (cCh >= 0x3130 && cCh <= 0x318F); + return (cCh >= 0xAC00 && cCh <= 0xD7AF) || (cCh >= 0x1100 && cCh <= 0x11FF) + || (cCh >= 0xA960 && cCh <= 0xA97F) || (cCh >= 0xD7B0 && cCh <= 0xD7FF) + || (cCh >= 0x3130 && cCh <= 0x318F); } } @@ -51,13 +52,26 @@ LineBreakResults SAL_CALL BreakIterator_CJK::getLineBreak( sal_Int32 nMinBreakPos, const LineBreakHyphenationOptions& 
hOptions, const LineBreakUserOptions& bOptions) { - auto fnIsForbiddenBreak = [&](sal_Int32 nBreakPos) - { + auto fnIsForbiddenBreak = [&](sal_Int32 nBreakPos) { return nBreakPos > 0 && (bOptions.forbiddenBeginCharacters.indexOf(Text[nBreakPos]) != -1 || bOptions.forbiddenEndCharacters.indexOf(Text[nBreakPos - 1]) != -1); }; + auto fnIsNonBreak = [](sal_Unicode cChB) { + switch (cChB) + { + case 0x00A0: // No-break Space + case 0x2011: // Non-breaking Hyphen + case 0x202F: // Narrow No-break Space + case 0x2060: // Word Joiner + case 0xFEFF: // ZWNBSP + return true; + default: + return false; + } + }; + while (nStartPos > 0 && nStartPos < Text.getLength()) { // Apply hanging punctuation @@ -105,6 +119,13 @@ LineBreakResults SAL_CALL BreakIterator_CJK::getLineBreak( nStartPos = nOldStartPos; } + // Formatting Marks + while (nStartPos > 0 + && (fnIsNonBreak(Text[nStartPos]) || fnIsNonBreak(Text[nStartPos - 1]))) + { + Text.iterateCodePoints(&nStartPos, -1); + } + // tdf#130592: Fall back to the ICU breakiterator after applying CJK-specific rules auto stBreak = BreakIterator_Unicode::getLineBreak(Text, nStartPos, rLocale, nMinBreakPos, hOptions, bOptions); @@ -113,6 +134,27 @@ LineBreakResults SAL_CALL BreakIterator_CJK::getLineBreak( // Located break is valid under both iterators return stBreak; } + if (nStartPos > stBreak.breakIndex) + { + // tdf#169590 Clarify the case where bOptions.applyForbiddenRules is False + if (!bOptions.applyForbiddenRules) + { + stBreak.breakIndex = nStartPos; + stBreak.breakType = BreakType::WORDBOUNDARY; + return stBreak; + } + + // Respect user changes to forbidden character list + sal_Int32 nTempPos = stBreak.breakIndex; + Text.iterateCodePoints(&nTempPos); + if (nTempPos == nStartPos && !fnIsForbiddenBreak(nStartPos) + && !fnIsNonBreak(Text[nStartPos])) + { + stBreak.breakIndex = nStartPos; + stBreak.breakType = BreakType::WORDBOUNDARY; + return stBreak; + } + } // CJK break is not valid; restart search from the next candidate 
sal_Int32 nNextCandidate = stBreak.breakIndex; @@ -174,7 +216,6 @@ BreakIterator_ko::BreakIterator_ko() hangingCharacters = LocaleDataImpl::get()->getHangingCharacters(LOCALE(u"ko"_ustr, u"KR"_ustr)); cBreakIterator = u"com.sun.star.i18n.BreakIterator_ko"_ustr; } - } /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
