editeng/source/editeng/impedit2.cxx              |   59 ----
i18nutil/qa/cppunit/test_scriptchangescanner.cxx |  315 ++++++++++++++++++++++-
i18nutil/source/utility/scriptchangescanner.cxx  |  153 ++++++++++-
include/i18nutil/scriptchangescanner.hxx         |   26 +
sw/source/core/inc/scriptinfo.hxx                |    1
sw/source/core/text/porlay.cxx                   |  122 --------
6 files changed, 489 insertions(+), 187 deletions(-)
New commits: commit de29bec27e90a7d24a90c6f071e7899abefe683e Author: Jonathan Clark <[email protected]> AuthorDate: Thu Dec 19 03:39:20 2024 -0700 Commit: Jonathan Clark <[email protected]> CommitDate: Thu Dec 19 18:18:30 2024 +0100 tdf#163660 sw: Treat strong CJK inside RTL runs as Asian script Previously, Asian-script characters following an explicit direction mark or inside an RTL embedding would be rendered using the Complex font, rather than the Asian font. This change updates script detection to treat these characters as Asian regardless of text direction. This change also consolidates more duplicated script detection code from Writer and Edit Engine. Change-Id: I3931c3ec0cb447f3c504fdbfb3ee62185fd14422 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/178817 Tested-by: Jenkins Reviewed-by: Jonathan Clark <[email protected]> diff --git a/editeng/source/editeng/impedit2.cxx b/editeng/source/editeng/impedit2.cxx index 44f9ae6a33e1..2cd3d66de41b 100644 --- a/editeng/source/editeng/impedit2.cxx +++ b/editeng/source/editeng/impedit2.cxx @@ -1650,20 +1650,6 @@ bool ImpEditEngine::IsInputSequenceCheckingRequired( sal_Unicode nChar, const Ed return bIsSequenceChecking; } -static bool lcl_HasStrongLTR ( std::u16string_view rTxt, sal_Int32 nStart, sal_Int32 nEnd ) - { - for( sal_Int32 nCharIdx = nStart; nCharIdx < nEnd; ++nCharIdx ) - { - const UCharDirection nCharDir = u_charDirection ( rTxt[ nCharIdx ] ); - if ( nCharDir == U_LEFT_TO_RIGHT || - nCharDir == U_LEFT_TO_RIGHT_EMBEDDING || - nCharDir == U_LEFT_TO_RIGHT_OVERRIDE ) - return true; - } - return false; - } - - void ImpEditEngine::InitScriptTypes( sal_Int32 nPara ) { ParaPortion* pParaPortion = GetParaPortions().SafeGetObject( nPara ); @@ -1716,8 +1702,10 @@ void ImpEditEngine::InitScriptTypes( sal_Int32 nPara ) pField = pField->GetEnd() ? pNode->GetCharAttribs().FindNextAttrib( EE_FEATURE_FIELD, pField->GetEnd() ) : nullptr; } + const UBiDiLevel nInitialBidiLevel = IsRightToLeft(nPara) ? 
1 /*RTL*/ : 0 /*LTR*/; + auto pDirScanner = i18nutil::MakeDirectionChangeScanner(aText, nInitialBidiLevel); auto pScriptScanner = i18nutil::MakeScriptChangeScanner( - aText, SvtLanguageOptions::GetI18NScriptTypeOfLanguage(GetDefaultLanguage())); + aText, SvtLanguageOptions::GetI18NScriptTypeOfLanguage(GetDefaultLanguage()), *pDirScanner); while (!pScriptScanner->AtEnd() || rTypes.empty()) { auto stChange = pScriptScanner->Peek(); @@ -1730,47 +1718,6 @@ void ImpEditEngine::InitScriptTypes( sal_Int32 nPara ) WritingDirectionInfos& rDirInfos = pParaPortion->getWritingDirectionInfos(); if (rDirInfos.empty()) InitWritingDirections( nPara ); - - // i89825: Use CTL font for numbers embedded into an RTL run: - for (const WritingDirectionInfo & rDirInfo : rDirInfos) - { - const sal_Int32 nStart = rDirInfo.nStartPos; - const sal_Int32 nEnd = rDirInfo.nEndPos; - const sal_uInt8 nCurrDirType = rDirInfo.nType; - - if ( nCurrDirType % 2 == UBIDI_RTL || // text in RTL run - ( nCurrDirType > UBIDI_LTR && !lcl_HasStrongLTR( aText, nStart, nEnd ) ) ) // non-strong text in embedded LTR run - { - size_t nIdx = 0; - - // Skip entries in ScriptArray which are not inside the RTL run: - while ( nIdx < rTypes.size() && rTypes[nIdx].nStartPos < nStart ) - ++nIdx; - - // Remove any entries *inside* the current run: - while (nIdx < rTypes.size() && rTypes[nIdx].nEndPos <= nEnd) - { - // coverity[use_iterator] - we're protected from a bad iterator by the above condition - rTypes.erase(rTypes.begin() + nIdx); - } - - // special case: - if(nIdx < rTypes.size() && rTypes[nIdx].nStartPos < nStart && rTypes[nIdx].nEndPos > nEnd) - { - rTypes.insert( rTypes.begin()+nIdx, ScriptTypePosInfo( rTypes[nIdx].nScriptType, nEnd, rTypes[nIdx].nEndPos ) ); - rTypes[nIdx].nEndPos = nStart; - } - - if( nIdx ) - rTypes[nIdx - 1].nEndPos = nStart; - - rTypes.insert( rTypes.begin()+nIdx, ScriptTypePosInfo( i18n::ScriptType::COMPLEX, nStart, nEnd) ); - ++nIdx; - - if( nIdx < rTypes.size() ) - rTypes[nIdx].nStartPos 
= nEnd; - } - } } namespace { diff --git a/i18nutil/qa/cppunit/test_scriptchangescanner.cxx b/i18nutil/qa/cppunit/test_scriptchangescanner.cxx index 99685bb7be21..e0726a45d922 100644 --- a/i18nutil/qa/cppunit/test_scriptchangescanner.cxx +++ b/i18nutil/qa/cppunit/test_scriptchangescanner.cxx @@ -33,6 +33,11 @@ public: void testSmartQuoteCompatibilityCJ(); void testSmartQuoteCompatibilityComplexAndCJ(); void testSmartQuoteCJAtStart(); + void testRtlRunTrivial(); + void testRtlRunEmbeddedComplex(); + void testRtlRunEmbeddedLtrStrong(); + void testRtlRunEmbeddedLtrWeakComplex(); + void testRtlRunOverrideCJKAsian(); CPPUNIT_TEST_SUITE(ScriptChangeScannerTest); CPPUNIT_TEST(testEmpty); @@ -45,13 +50,19 @@ public: CPPUNIT_TEST(testSmartQuoteCompatibilityCJ); CPPUNIT_TEST(testSmartQuoteCompatibilityComplexAndCJ); CPPUNIT_TEST(testSmartQuoteCJAtStart); + CPPUNIT_TEST(testRtlRunTrivial); + CPPUNIT_TEST(testRtlRunEmbeddedComplex); + CPPUNIT_TEST(testRtlRunEmbeddedLtrStrong); + CPPUNIT_TEST(testRtlRunEmbeddedLtrWeakComplex); + CPPUNIT_TEST(testRtlRunOverrideCJKAsian); CPPUNIT_TEST_SUITE_END(); }; void ScriptChangeScannerTest::testEmpty() { auto aText = u""_ustr; - auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN); + auto pDirScanner = MakeDirectionChangeScanner(aText, 0); + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN, *pDirScanner); CPPUNIT_ASSERT(pScanner->AtEnd()); CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex); @@ -61,7 +72,8 @@ void ScriptChangeScannerTest::testEmpty() void ScriptChangeScannerTest::testTrivial() { auto aText = u"Trivial case with a single span of a script"_ustr; - auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN); + auto pDirScanner = MakeDirectionChangeScanner(aText, 0); + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN, *pDirScanner); 
CPPUNIT_ASSERT(!pScanner->AtEnd()); CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); @@ -75,7 +87,8 @@ void ScriptChangeScannerTest::testTrivial() void ScriptChangeScannerTest::testTrivialAppLang() { auto aText = u"Trivial case with a single span of a script"_ustr; - auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::ASIAN); + auto pDirScanner = MakeDirectionChangeScanner(aText, 0); + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::ASIAN, *pDirScanner); CPPUNIT_ASSERT(!pScanner->AtEnd()); CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); @@ -89,7 +102,8 @@ void ScriptChangeScannerTest::testTrivialAppLang() void ScriptChangeScannerTest::testWeakAtStart() { auto aText = u"“x”"_ustr; - auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::COMPLEX); + auto pDirScanner = MakeDirectionChangeScanner(aText, 0); + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::COMPLEX, *pDirScanner); CPPUNIT_ASSERT(!pScanner->AtEnd()); CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::COMPLEX, pScanner->Peek().m_nScriptType); @@ -111,7 +125,8 @@ void ScriptChangeScannerTest::testWeakAtStart() void ScriptChangeScannerTest::testStrongChange() { auto aText = u"wide 廣 vast"_ustr; - auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN); + auto pDirScanner = MakeDirectionChangeScanner(aText, 0); + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN, *pDirScanner); CPPUNIT_ASSERT(!pScanner->AtEnd()); CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); @@ -141,7 +156,8 @@ void ScriptChangeScannerTest::testMongolianAfterNNBSP() { // NNBSP before Mongolian text should be part of the Mongolian run auto aText = u"Before\u202f\u1822\u1822After"_ustr; - auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN); + auto pDirScanner = MakeDirectionChangeScanner(aText, 
0); + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN, *pDirScanner); CPPUNIT_ASSERT(!pScanner->AtEnd()); CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); @@ -172,7 +188,8 @@ void ScriptChangeScannerTest::testNonspacingMark() // A preceding weak character should be included in the run // of a following non-spacing mark auto aText = u"Before \u0944\u0911\u0911 After"_ustr; - auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN); + auto pDirScanner = MakeDirectionChangeScanner(aText, 0); + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN, *pDirScanner); CPPUNIT_ASSERT(!pScanner->AtEnd()); CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); @@ -204,7 +221,8 @@ void ScriptChangeScannerTest::testSmartQuoteCompatibilityCJ() // containing CJ characters should be treated as Asian script auto aText = u"Before \u201c水\u201d After"_ustr; - auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN); + auto pDirScanner = MakeDirectionChangeScanner(aText, 0); + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN, *pDirScanner); CPPUNIT_ASSERT(!pScanner->AtEnd()); CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); @@ -236,7 +254,8 @@ void ScriptChangeScannerTest::testSmartQuoteCompatibilityComplexAndCJ() // quotes are assigned in the usual greedy way. 
auto aText = u"Before \u201c水\u201d After \u05d0"_ustr; - auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN); + auto pDirScanner = MakeDirectionChangeScanner(aText, 0); + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN, *pDirScanner); CPPUNIT_ASSERT(!pScanner->AtEnd()); CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); @@ -272,7 +291,8 @@ void ScriptChangeScannerTest::testSmartQuoteCompatibilityComplexAndCJ() void ScriptChangeScannerTest::testSmartQuoteCJAtStart() { auto aText = u"“廣”"_ustr; - auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN); + auto pDirScanner = MakeDirectionChangeScanner(aText, 0); + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN, *pDirScanner); CPPUNIT_ASSERT(!pScanner->AtEnd()); CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::ASIAN, pScanner->Peek().m_nScriptType); @@ -284,6 +304,281 @@ void ScriptChangeScannerTest::testSmartQuoteCJAtStart() CPPUNIT_ASSERT(pScanner->AtEnd()); } +void ScriptChangeScannerTest::testRtlRunTrivial() +{ + auto aText = u"Before אאאאאא after"_ustr; + auto pDirScanner = MakeDirectionChangeScanner(aText, 0); + + CPPUNIT_ASSERT(!pDirScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(sal_uInt8(0), pDirScanner->Peek().m_nLevel); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pDirScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), pDirScanner->Peek().m_nEndIndex); + CPPUNIT_ASSERT(!pDirScanner->Peek().m_bHasEmbeddedStrongLTR); + + pDirScanner->Advance(); + + CPPUNIT_ASSERT(!pDirScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(sal_uInt8(1), pDirScanner->Peek().m_nLevel); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), pDirScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(13), pDirScanner->Peek().m_nEndIndex); + CPPUNIT_ASSERT(!pDirScanner->Peek().m_bHasEmbeddedStrongLTR); + + pDirScanner->Advance(); + + CPPUNIT_ASSERT(!pDirScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(sal_uInt8(0), 
pDirScanner->Peek().m_nLevel); + CPPUNIT_ASSERT_EQUAL(sal_Int32(13), pDirScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(19), pDirScanner->Peek().m_nEndIndex); + CPPUNIT_ASSERT(!pDirScanner->Peek().m_bHasEmbeddedStrongLTR); + + pDirScanner->Advance(); + + CPPUNIT_ASSERT(pDirScanner->AtEnd()); + + pDirScanner->Reset(); + + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN, *pDirScanner); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::COMPLEX, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(14), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(14), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(19), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(pScanner->AtEnd()); +} + +void ScriptChangeScannerTest::testRtlRunEmbeddedComplex() +{ + auto aText = u"Before אא(א\"א)אא after"_ustr; + auto pDirScanner = MakeDirectionChangeScanner(aText, 0); + + CPPUNIT_ASSERT(!pDirScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(sal_uInt8(0), pDirScanner->Peek().m_nLevel); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pDirScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), pDirScanner->Peek().m_nEndIndex); + CPPUNIT_ASSERT(!pDirScanner->Peek().m_bHasEmbeddedStrongLTR); + + pDirScanner->Advance(); + + CPPUNIT_ASSERT(!pDirScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(sal_uInt8(1), pDirScanner->Peek().m_nLevel); + 
CPPUNIT_ASSERT_EQUAL(sal_Int32(7), pDirScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(16), pDirScanner->Peek().m_nEndIndex); + CPPUNIT_ASSERT(!pDirScanner->Peek().m_bHasEmbeddedStrongLTR); + + pDirScanner->Advance(); + + CPPUNIT_ASSERT(!pDirScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(sal_uInt8(0), pDirScanner->Peek().m_nLevel); + CPPUNIT_ASSERT_EQUAL(sal_Int32(16), pDirScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(22), pDirScanner->Peek().m_nEndIndex); + CPPUNIT_ASSERT(!pDirScanner->Peek().m_bHasEmbeddedStrongLTR); + + pDirScanner->Advance(); + + CPPUNIT_ASSERT(pDirScanner->AtEnd()); + + pDirScanner->Reset(); + + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN, *pDirScanner); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::COMPLEX, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(17), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(17), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(22), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(pScanner->AtEnd()); +} + +void ScriptChangeScannerTest::testRtlRunEmbeddedLtrStrong() +{ + auto aText = u"אאא Inside אאא"_ustr; + auto pDirScanner = MakeDirectionChangeScanner(aText, 1); + + CPPUNIT_ASSERT(!pDirScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(sal_uInt8(1), pDirScanner->Peek().m_nLevel); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), 
pDirScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), pDirScanner->Peek().m_nEndIndex); + CPPUNIT_ASSERT(!pDirScanner->Peek().m_bHasEmbeddedStrongLTR); + + pDirScanner->Advance(); + + CPPUNIT_ASSERT(!pDirScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(sal_uInt8(2), pDirScanner->Peek().m_nLevel); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), pDirScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(10), pDirScanner->Peek().m_nEndIndex); + CPPUNIT_ASSERT(pDirScanner->Peek().m_bHasEmbeddedStrongLTR); + + pDirScanner->Advance(); + + CPPUNIT_ASSERT(!pDirScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(sal_uInt8(1), pDirScanner->Peek().m_nLevel); + CPPUNIT_ASSERT_EQUAL(sal_Int32(10), pDirScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(14), pDirScanner->Peek().m_nEndIndex); + CPPUNIT_ASSERT(!pDirScanner->Peek().m_bHasEmbeddedStrongLTR); + + pDirScanner->Advance(); + + CPPUNIT_ASSERT(pDirScanner->AtEnd()); + + pDirScanner->Reset(); + + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::COMPLEX, *pDirScanner); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::COMPLEX, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::LATIN, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(10), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::COMPLEX, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(10), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(14), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(pScanner->AtEnd()); +} + +void 
ScriptChangeScannerTest::testRtlRunEmbeddedLtrWeakComplex() +{ + auto aText = u"אאא 123 אאא"_ustr; + auto pDirScanner = MakeDirectionChangeScanner(aText, 1); + + CPPUNIT_ASSERT(!pDirScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(sal_uInt8(1), pDirScanner->Peek().m_nLevel); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pDirScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), pDirScanner->Peek().m_nEndIndex); + CPPUNIT_ASSERT(!pDirScanner->Peek().m_bHasEmbeddedStrongLTR); + + pDirScanner->Advance(); + + CPPUNIT_ASSERT(!pDirScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(sal_uInt8(2), pDirScanner->Peek().m_nLevel); + CPPUNIT_ASSERT_EQUAL(sal_Int32(4), pDirScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), pDirScanner->Peek().m_nEndIndex); + CPPUNIT_ASSERT(!pDirScanner->Peek().m_bHasEmbeddedStrongLTR); + + pDirScanner->Advance(); + + CPPUNIT_ASSERT(!pDirScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(sal_uInt8(1), pDirScanner->Peek().m_nLevel); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), pDirScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(11), pDirScanner->Peek().m_nEndIndex); + CPPUNIT_ASSERT(!pDirScanner->Peek().m_bHasEmbeddedStrongLTR); + + pDirScanner->Advance(); + + CPPUNIT_ASSERT(pDirScanner->AtEnd()); + + pDirScanner->Reset(); + + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN, *pDirScanner); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::COMPLEX, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(11), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(pScanner->AtEnd()); +} + +void ScriptChangeScannerTest::testRtlRunOverrideCJKAsian() +{ + // tdf#163660: Asian-script characters following an RTL override should + // still be treated as Asian script, rather than Complex script + auto aText = u"一二\u202e三四五"_ustr; + auto pDirScanner = MakeDirectionChangeScanner(aText, 0); + 
+ CPPUNIT_ASSERT(!pDirScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(sal_uInt8(0), pDirScanner->Peek().m_nLevel); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pDirScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), pDirScanner->Peek().m_nEndIndex); + CPPUNIT_ASSERT(!pDirScanner->Peek().m_bHasEmbeddedStrongLTR); + + pDirScanner->Advance(); + + CPPUNIT_ASSERT(!pDirScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(sal_uInt8(1), pDirScanner->Peek().m_nLevel); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), pDirScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), pDirScanner->Peek().m_nEndIndex); + CPPUNIT_ASSERT(!pDirScanner->Peek().m_bHasEmbeddedStrongLTR); + + pDirScanner->Advance(); + + CPPUNIT_ASSERT(pDirScanner->AtEnd()); + + pDirScanner->Reset(); + + auto pScanner = MakeScriptChangeScanner(aText, css::i18n::ScriptType::LATIN, *pDirScanner); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::ASIAN, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::COMPLEX, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(!pScanner->AtEnd()); + CPPUNIT_ASSERT_EQUAL(css::i18n::ScriptType::ASIAN, pScanner->Peek().m_nScriptType); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), pScanner->Peek().m_nStartIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), pScanner->Peek().m_nEndIndex); + + pScanner->Advance(); + + CPPUNIT_ASSERT(pScanner->AtEnd()); +} + CPPUNIT_TEST_SUITE_REGISTRATION(ScriptChangeScannerTest); } diff --git a/i18nutil/source/utility/scriptchangescanner.cxx b/i18nutil/source/utility/scriptchangescanner.cxx index 8a62f4de1660..2cb8d9d96819 100644 --- 
a/i18nutil/source/utility/scriptchangescanner.cxx +++ b/i18nutil/source/utility/scriptchangescanner.cxx @@ -11,6 +11,7 @@ #include <i18nutil/unicode.hxx> #include <i18nutil/scriptclass.hxx> #include <unicode/uchar.h> +#include <unicode/ubidi.h> #include <sal/log.hxx> #include <com/sun/star/i18n/ScriptType.hpp> #include <com/sun/star/i18n/CharType.hpp> @@ -24,10 +25,112 @@ namespace { constexpr sal_uInt32 CHAR_NNBSP = 0x202f; +class IcuDirectionChangeScanner : public DirectionChangeScanner +{ +private: + const OUString& m_rText; + UBiDi* m_pBidi; + DirectionChange m_stCurr; + UBiDiLevel m_nInitialDirection; + int32_t m_nCurrIndex = 0; + int m_nCount = 0; + int m_nCurr = 0; + bool m_bAtEnd = false; + + bool RangeHasStrongLTR(sal_Int32 nStart, sal_Int32 nEnd) + { + for (sal_Int32 nCharIdx = nStart; nCharIdx < nEnd; ++nCharIdx) + { + auto nCharDir = u_charDirection(m_rText[nCharIdx]); + if (nCharDir == U_LEFT_TO_RIGHT || nCharDir == U_LEFT_TO_RIGHT_EMBEDDING + || nCharDir == U_LEFT_TO_RIGHT_OVERRIDE) + { + return true; + } + } + + return false; + } + + void PopulateCurr() + { + int32_t nEndIndex = 0; + UBiDiLevel nCurrLevel = 0; + ubidi_getLogicalRun(m_pBidi, m_nCurrIndex, &nEndIndex, &nCurrLevel); + + bool bHasEmbeddedStrongLTR = false; + if ((nCurrLevel % 2) == UBIDI_LTR && nCurrLevel > UBIDI_RTL) + { + bHasEmbeddedStrongLTR = RangeHasStrongLTR(m_nCurrIndex, nEndIndex); + } + + m_stCurr = { m_nCurrIndex, nEndIndex, nCurrLevel, bHasEmbeddedStrongLTR }; + + m_nCurrIndex = nEndIndex; + ++m_nCurr; + + m_bAtEnd = false; + } + +public: + IcuDirectionChangeScanner(const OUString& rText, UBiDiLevel nInitialDirection) + : m_rText(rText) + , m_nInitialDirection(nInitialDirection) + { + UErrorCode nError = U_ZERO_ERROR; + m_pBidi = ubidi_openSized(rText.getLength(), 0, &nError); + nError = U_ZERO_ERROR; + + ubidi_setPara(m_pBidi, reinterpret_cast<const UChar*>(rText.getStr()), rText.getLength(), + nInitialDirection, nullptr, &nError); + nError = U_ZERO_ERROR; + + m_nCount = 
ubidi_countRuns(m_pBidi, &nError); + Reset(); + } + + ~IcuDirectionChangeScanner() override { ubidi_close(m_pBidi); } + + void Reset() override + { + m_nCurrIndex = 0; + m_nCurr = 0; + m_stCurr = { /*start*/ 0, /*end*/ 0, /*level*/ m_nInitialDirection, + /*has embedded strong LTR*/ false }; + m_bAtEnd = true; + + if (m_nCurr < m_nCount) + { + PopulateCurr(); + } + } + + bool AtEnd() const override { return m_bAtEnd; } + + void Advance() override + { + if (m_nCurr >= m_nCount) + { + m_bAtEnd = true; + return; + } + + PopulateCurr(); + } + + DirectionChange Peek() const override { return m_stCurr; } + + UBiDiLevel GetLevelAt(sal_Int32 nIndex) const override + { + return ubidi_getLevelAt(m_pBidi, nIndex); + } +}; + class GreedyScriptChangeScanner : public ScriptChangeScanner { private: ScriptChange m_stCurr; + DirectionChangeScanner* m_pDirScanner; const OUString& m_rText; sal_Int16 m_nPrevScript; sal_Int32 m_nIndex = 0; @@ -35,8 +138,10 @@ private: bool m_bApplyAsianToWeakQuotes = false; public: - GreedyScriptChangeScanner(const OUString& rText, sal_Int16 nDefaultScriptType) - : m_rText(rText) + GreedyScriptChangeScanner(const OUString& rText, sal_Int16 nDefaultScriptType, + DirectionChangeScanner* pDirScanner) + : m_pDirScanner(pDirScanner) + , m_rText(rText) , m_nPrevScript(nDefaultScriptType) { // tdf#66791: For compatibility with other programs, the Asian script is @@ -95,9 +200,40 @@ public: while (m_nIndex < m_rText.getLength()) { auto nPrevIndex = m_nIndex; + auto nBidiLevel = m_pDirScanner->GetLevelAt(m_nIndex); + + bool bCharIsRtl = (nBidiLevel % 2 == UBIDI_RTL); + bool bCharIsRtlOrEmbedded = (nBidiLevel > UBIDI_LTR); + bool bRunHasStrongEmbeddedLTR = false; + + while (bCharIsRtlOrEmbedded && !m_pDirScanner->AtEnd()) + { + const auto stDirRun = m_pDirScanner->Peek(); + if (m_nIndex >= stDirRun.m_nStartIndex && m_nIndex < stDirRun.m_nEndIndex) + { + bRunHasStrongEmbeddedLTR = stDirRun.m_bHasEmbeddedStrongLTR; + break; + } + + m_pDirScanner->Advance(); + } + 
auto nChar = m_rText.iterateCodePoints(&m_nIndex); nScript = GetScriptClass(nChar); - if (nScript == css::i18n::ScriptType::WEAK) + + // #i16354# Change script type for RTL text to CTL: + // 1. All text in RTL runs will use the CTL font + // #i89825# change the script type also to CTL (hennerdrewes) + // 2. Text in embedded LTR runs that does not have any strong LTR characters (numbers!) + // tdf#163660 Asian-script characters inside RTL runs should still use Asian font + if (bCharIsRtl || (bCharIsRtlOrEmbedded && !bRunHasStrongEmbeddedLTR)) + { + if (nScript != css::i18n::ScriptType::ASIAN) + { + nScript = css::i18n::ScriptType::COMPLEX; + } + } + else if (nScript == css::i18n::ScriptType::WEAK) { nScript = m_nPrevScript; if (m_bApplyAsianToWeakQuotes) @@ -151,10 +287,17 @@ public: } } +std::unique_ptr<i18nutil::DirectionChangeScanner> +i18nutil::MakeDirectionChangeScanner(const OUString& rText, sal_uInt8 nInitialDirection) +{ + return std::make_unique<IcuDirectionChangeScanner>(rText, nInitialDirection); +} + std::unique_ptr<i18nutil::ScriptChangeScanner> -i18nutil::MakeScriptChangeScanner(const OUString& rText, sal_Int16 nDefaultScriptType) +i18nutil::MakeScriptChangeScanner(const OUString& rText, sal_Int16 nDefaultScriptType, + DirectionChangeScanner& rDirScanner) { - return std::make_unique<GreedyScriptChangeScanner>(rText, nDefaultScriptType); + return std::make_unique<GreedyScriptChangeScanner>(rText, nDefaultScriptType, &rDirScanner); } /* vim:set shiftwidth=4 softtabstop=4 expandtab cinoptions=b1,g0,N-s cinkeys+=0=break: */ diff --git a/include/i18nutil/scriptchangescanner.hxx b/include/i18nutil/scriptchangescanner.hxx index cc7630cb83a5..04bab9640b90 100644 --- a/include/i18nutil/scriptchangescanner.hxx +++ b/include/i18nutil/scriptchangescanner.hxx @@ -14,6 +14,14 @@ namespace i18nutil { +struct DirectionChange +{ + sal_Int32 m_nStartIndex = 0; + sal_Int32 m_nEndIndex = 0; + sal_uInt8 m_nLevel = 0; + bool m_bHasEmbeddedStrongLTR = false; +}; + struct 
ScriptChange { sal_Int32 m_nStartIndex = 0; @@ -21,6 +29,18 @@ struct ScriptChange sal_Int16 m_nScriptType = 0; }; +class I18NUTIL_DLLPUBLIC DirectionChangeScanner +{ +public: + virtual ~DirectionChangeScanner() = default; + + virtual bool AtEnd() const = 0; + virtual void Advance() = 0; + virtual DirectionChange Peek() const = 0; + virtual sal_uInt8 GetLevelAt(sal_Int32 nIndex) const = 0; + virtual void Reset() = 0; +}; + class I18NUTIL_DLLPUBLIC ScriptChangeScanner { public: @@ -31,8 +51,12 @@ public: virtual ScriptChange Peek() const = 0; }; +I18NUTIL_DLLPUBLIC std::unique_ptr<DirectionChangeScanner> +MakeDirectionChangeScanner(const OUString& rWord, sal_uInt8 nInitialDirection); + I18NUTIL_DLLPUBLIC std::unique_ptr<ScriptChangeScanner> -MakeScriptChangeScanner(const OUString& rWord, sal_Int16 nDefaultScriptType); +MakeScriptChangeScanner(const OUString& rWord, sal_Int16 nDefaultScriptType, + DirectionChangeScanner& rDirScanner); } /* vim:set shiftwidth=4 softtabstop=4 expandtab cinoptions=b1,g0,N-s cinkeys+=0=break: */ diff --git a/sw/source/core/inc/scriptinfo.hxx b/sw/source/core/inc/scriptinfo.hxx index 65c34f407a3e..c60e8d0959a8 100644 --- a/sw/source/core/inc/scriptinfo.hxx +++ b/sw/source/core/inc/scriptinfo.hxx @@ -89,7 +89,6 @@ private: TextFrameIndex m_nInvalidityPos; sal_uInt8 m_nDefaultDir; - void UpdateBidiInfo( const OUString& rText ); bool IsKashidaValid(size_t nKashPos) const; // returns true if nKashPos is newly marked invalid bool MarkKashidaInvalid(size_t nKashPos); diff --git a/sw/source/core/text/porlay.cxx b/sw/source/core/text/porlay.cxx index 15958ab1b70f..b414fae04c6b 100644 --- a/sw/source/core/text/porlay.cxx +++ b/sw/source/core/text/porlay.cxx @@ -86,19 +86,6 @@ using namespace ::com::sun::star; using namespace i18n::ScriptType; -static bool lcl_HasStrongLTR ( std::u16string_view rText, sal_Int32 nStart, sal_Int32 nEnd ) - { - for( sal_Int32 nCharIdx = nStart; nCharIdx < nEnd; ++nCharIdx ) - { - const UCharDirection nCharDir = 
u_charDirection ( rText[ nCharIdx ] ); - if ( nCharDir == U_LEFT_TO_RIGHT || - nCharDir == U_LEFT_TO_RIGHT_EMBEDDING || - nCharDir == U_LEFT_TO_RIGHT_OVERRIDE ) - return true; - } - return false; - } - // This is (meant to be) functionally equivalent to 'delete m_pNext' where // deleting a SwLineLayout recursively deletes the owned m_pNext SwLineLayout. // @@ -1412,8 +1399,9 @@ void SwScriptInfo::InitScriptInfo(const SwTextNode& rNode, m_Kashida.erase(m_Kashida.begin() + nCntKash, m_Kashida.end()); // Construct the script change scanner and advance it to the change range + auto pDirScanner = i18nutil::MakeDirectionChangeScanner(rText, m_nDefaultDir); auto pScriptScanner = i18nutil::MakeScriptChangeScanner( - rText, SvtLanguageOptions::GetI18NScriptTypeOfLanguage(GetAppLanguage())); + rText, SvtLanguageOptions::GetI18NScriptTypeOfLanguage(GetAppLanguage()), *pDirScanner); while (!pScriptScanner->AtEnd()) { if (pScriptScanner->Peek().m_nStartIndex <= static_cast<sal_Int32>(nChg)) @@ -1581,109 +1569,15 @@ void SwScriptInfo::InitScriptInfo(const SwTextNode& rNode, m_DirectionChanges.clear(); // Perform Unicode Bidi Algorithm for text direction information + pDirScanner->Reset(); + while (!pDirScanner->AtEnd()) { - UpdateBidiInfo( rText ); - - // #i16354# Change script type for RTL text to CTL: - // 1. All text in RTL runs will use the CTL font - // #i89825# change the script type also to CTL (hennerdrewes) - // 2. Text in embedded LTR runs that does not have any strong LTR characters (numbers!) - for (size_t nDirIdx = 0; nDirIdx < m_DirectionChanges.size(); ++nDirIdx) - { - const sal_uInt8 nCurrDirType = GetDirType( nDirIdx ); - // nStart is start of RTL run: - const TextFrameIndex nStart = nDirIdx > 0 ? 
GetDirChg(nDirIdx - 1) : TextFrameIndex(0); - // nEnd is end of RTL run: - const TextFrameIndex nEnd = GetDirChg( nDirIdx ); - - if ( nCurrDirType % 2 == UBIDI_RTL || // text in RTL run - (nCurrDirType > UBIDI_LTR && // non-strong text in embedded LTR run - !lcl_HasStrongLTR(rText, sal_Int32(nStart), sal_Int32(nEnd)))) - { - // nScriptIdx points into the ScriptArrays: - size_t nScriptIdx = 0; - - // Skip entries in ScriptArray which are not inside the RTL run: - // Make nScriptIdx become the index of the script group with - // 1. nStartPosOfGroup <= nStart and - // 2. nEndPosOfGroup > nStart - while ( GetScriptChg( nScriptIdx ) <= nStart ) - ++nScriptIdx; - - const TextFrameIndex nStartPosOfGroup = nScriptIdx - ? GetScriptChg(nScriptIdx - 1) - : TextFrameIndex(0); - const sal_uInt8 nScriptTypeOfGroup = GetScriptType( nScriptIdx ); - - SAL_WARN_IF( nStartPosOfGroup > nStart || GetScriptChg( nScriptIdx ) <= nStart, - "sw.core", "Script override with CTL font trouble" ); - - // Check if we have to insert a new script change at - // position nStart. If nStartPosOfGroup < nStart, - // we have to insert a new script change: - if (nStart > TextFrameIndex(0) && nStartPosOfGroup < nStart) - { - m_ScriptChanges.insert(m_ScriptChanges.begin() + nScriptIdx, - ScriptChangeInfo(nStart, nScriptTypeOfGroup) ); - ++nScriptIdx; - } - - // Remove entries in ScriptArray which end inside the RTL run: - while (nScriptIdx < m_ScriptChanges.size() - && GetScriptChg(nScriptIdx) <= nEnd) - { - m_ScriptChanges.erase(m_ScriptChanges.begin() + nScriptIdx); - } - - // Insert a new entry in ScriptArray for the end of the RTL run: - m_ScriptChanges.insert(m_ScriptChanges.begin() + nScriptIdx, - ScriptChangeInfo(nEnd, i18n::ScriptType::COMPLEX) ); - -#if OSL_DEBUG_LEVEL > 1 - // Check that ScriptChangeInfos are in increasing order of - // position and that we don't have "empty" changes. 
- sal_uInt8 nLastTyp = i18n::ScriptType::WEAK; - TextFrameIndex nLastPos = TextFrameIndex(0); - for (const auto& rScriptChange : m_ScriptChanges) - { - SAL_WARN_IF( nLastTyp == rScriptChange.type || - nLastPos >= rScriptChange.position, - "sw.core", "Heavy InitScriptType() confusion" ); - nLastPos = rScriptChange.position; - nLastTyp = rScriptChange.type; - } -#endif - } - } - } -} + auto stDirChange = pDirScanner->Peek(); + m_DirectionChanges.emplace_back(TextFrameIndex{ stDirChange.m_nEndIndex }, + stDirChange.m_nLevel); -void SwScriptInfo::UpdateBidiInfo( const OUString& rText ) -{ - // remove invalid entries from direction information arrays - m_DirectionChanges.clear(); - - // Bidi functions from icu 2.0 - - UErrorCode nError = U_ZERO_ERROR; - UBiDi* pBidi = ubidi_openSized( rText.getLength(), 0, &nError ); - nError = U_ZERO_ERROR; - - ubidi_setPara( pBidi, reinterpret_cast<const UChar *>(rText.getStr()), rText.getLength(), - m_nDefaultDir, nullptr, &nError ); - nError = U_ZERO_ERROR; - int nCount = ubidi_countRuns( pBidi, &nError ); - int32_t nStart = 0; - int32_t nEnd; - UBiDiLevel nCurrDir; - for ( int nIdx = 0; nIdx < nCount; ++nIdx ) - { - ubidi_getLogicalRun( pBidi, nStart, &nEnd, &nCurrDir ); - m_DirectionChanges.emplace_back(TextFrameIndex(nEnd), nCurrDir); - nStart = nEnd; + pDirScanner->Advance(); } - - ubidi_close( pBidi ); } // returns the position of the next character which belongs to another script
