editeng/source/editeng/impedit3.cxx |  248 ++++++++++++++++++++++++++++--------
 1 file changed, 199 insertions(+), 49 deletions(-)

New commits:
commit 640a2bc1cd093875b701bb958b6156d5752c0fac
Author:     Khaled Hosny <kha...@aliftype.com>
AuthorDate: Thu Aug 11 14:53:19 2022 +0200
Commit:     Caolán McNamara <caol...@redhat.com>
CommitDate: Sun Aug 14 21:10:43 2022 +0200

    tdf#137528: Sync Kashida algorithm in editeng with sw
    
    It seems that editeng is using an old copy of the code in sw and it was
    never updated.
    
    With this change, the text in the test document from tdf#137528 looks
    essentially identical in both cases.
    
    Fixes also tdf#146199.
    
    Change-Id: I38541bfb35eae36681af84e73a7965d36152448f
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/138145
    Tested-by: Jenkins
    Reviewed-by: Caolán McNamara <caol...@redhat.com>

diff --git a/editeng/source/editeng/impedit3.cxx b/editeng/source/editeng/impedit3.cxx
index 9d71560c108b..d7129999dfa5 100644
--- a/editeng/source/editeng/impedit3.cxx
+++ b/editeng/source/editeng/impedit3.cxx
@@ -75,6 +75,8 @@
 
 #include <vcl/outdev/ScopedStates.hxx>
 
+#include <unicode/uchar.h>
+
 using namespace ::com::sun::star;
 using namespace ::com::sun::star::uno;
 using namespace ::com::sun::star::beans;
@@ -254,19 +256,93 @@ static Point lcl_ImplCalcRotatedPos( Point rPos, Point rOrigin, double nSin, dou
     return aTranslatedPos;
 }
 
-static bool lcl_IsLigature( sal_Unicode cCh, sal_Unicode cNextCh ) // For 
Kashidas from sw/source/core/text/porlay.txt
+// For Kashidas from sw/source/core/text/porlay.cxx: IS_JOINING_GROUP(c, g) compares the ICU UCHAR_JOINING_GROUP property of c against U_JG_<g>; the is*Char macros below apply it per joining group.
+
+#define IS_JOINING_GROUP(c, g) ( u_getIntPropertyValue( (c), 
UCHAR_JOINING_GROUP ) == U_JG_##g )
+#define isAinChar(c)        IS_JOINING_GROUP((c), AIN)
+#define isAlefChar(c)       IS_JOINING_GROUP((c), ALEF)
+#define isDalChar(c)        IS_JOINING_GROUP((c), DAL)
+#if U_ICU_VERSION_MAJOR_NUM >= 58 // this branch additionally matches U_JG_AFRICAN_FEH
+#define isFehChar(c)       (IS_JOINING_GROUP((c), FEH) || 
IS_JOINING_GROUP((c), AFRICAN_FEH))
+#else // otherwise only the FEH joining group is tested
+#define isFehChar(c)        IS_JOINING_GROUP((c), FEH)
+#endif
+#define isGafChar(c)        IS_JOINING_GROUP((c), GAF)
+#define isHehChar(c)        IS_JOINING_GROUP((c), HEH)
+#define isKafChar(c)        IS_JOINING_GROUP((c), KAF)
+#define isLamChar(c)        IS_JOINING_GROUP((c), LAM)
+#if U_ICU_VERSION_MAJOR_NUM >= 58 // this branch additionally matches U_JG_AFRICAN_QAF
+#define isQafChar(c)       (IS_JOINING_GROUP((c), QAF) || 
IS_JOINING_GROUP((c), AFRICAN_QAF))
+#else // otherwise only the QAF joining group is tested
+#define isQafChar(c)        IS_JOINING_GROUP((c), QAF)
+#endif
+#define isRehChar(c)        IS_JOINING_GROUP((c), REH)
+#define isTahChar(c)        IS_JOINING_GROUP((c), TAH)
+#define isTehMarbutaChar(c) IS_JOINING_GROUP((c), TEH_MARBUTA)
+#define isWawChar(c)        IS_JOINING_GROUP((c), WAW)
+#define isSeenOrSadChar(c)  (IS_JOINING_GROUP((c), SAD) || 
IS_JOINING_GROUP((c), SEEN))
+
+// Beh and characters that behave like Beh in medial form: returns true when the ICU joining group of cCh is one of the cases listed in the switch, false otherwise.
+static bool isBehChar(sal_Unicode cCh)
+{
+    bool bRet = false;
+    switch (u_getIntPropertyValue(cCh, UCHAR_JOINING_GROUP)) // classify by ICU joining group
+    {
+    case U_JG_BEH:
+    case U_JG_NOON:
+#if U_ICU_VERSION_MAJOR_NUM >= 58 // U_JG_AFRICAN_NOON is only referenced for ICU major version >= 58
+    case U_JG_AFRICAN_NOON:
+#endif
+    case U_JG_NYA:
+    case U_JG_YEH:
+    case U_JG_FARSI_YEH:
+    case U_JG_BURUSHASKI_YEH_BARREE:
+        bRet = true; // all cases above fall through to here
+        break;
+    default: // any other joining group
+        bRet = false; // same value as the initializer, stated explicitly
+        break;
+    }
+
+    return bRet;
+}
+
+// Yeh and characters that behave like Yeh in final form: returns true when the ICU joining group of cCh is one of the cases listed in the switch, false otherwise.
+static bool isYehChar(sal_Unicode cCh)
+{
+    bool bRet = false;
+    switch (u_getIntPropertyValue(cCh, UCHAR_JOINING_GROUP)) // classify by ICU joining group
+    {
+    case U_JG_YEH:
+    case U_JG_FARSI_YEH:
+    case U_JG_YEH_BARREE:
+    case U_JG_BURUSHASKI_YEH_BARREE:
+    case U_JG_YEH_WITH_TAIL:
+        bRet = true; // all cases above fall through to here
+        break;
+    default: // any other joining group
+        bRet = false; // same value as the initializer, stated explicitly
+        break;
+    }
+
+    return bRet;
+}
+
+static bool isTransparentChar ( sal_Unicode cCh ) // true when ICU reports joining type U_JT_TRANSPARENT for cCh
+{
+    return u_getIntPropertyValue( cCh, UCHAR_JOINING_TYPE ) == 
U_JT_TRANSPARENT;
+}
+
+static bool lcl_IsLigature( sal_Unicode cCh, sal_Unicode cNextCh ) // true when cCh followed by cNextCh is treated as a ligature pair
 {
             // Lam + Alef
-    return ( 0x644 == cCh && 0x627 == cNextCh ) ||
-            // Beh + Reh
-           ( 0x628 == cCh && 0x631 == cNextCh );
+    return ( isLamChar ( cCh ) && isAlefChar ( cNextCh )); // joining-group test replaces the fixed code points; the Beh + Reh pair is no longer reported
 }
 
-static bool lcl_ConnectToPrev( sal_Unicode cCh, sal_Unicode cPrevCh )  // For 
Kashidas from sw/source/core/text/porlay.txt
+static bool lcl_ConnectToPrev( sal_Unicode cCh, sal_Unicode cPrevCh )
 {
-    // Alef, Dal, Thal, Reh, Zain, and Waw do not connect to the left
-    bool bRet = 0x627 != cPrevCh && 0x62F != cPrevCh && 0x630 != cPrevCh &&
-                0x631 != cPrevCh && 0x632 != cPrevCh && 0x648 != cPrevCh;
+    const int32_t nJoiningType = u_getIntPropertyValue( cPrevCh, 
UCHAR_JOINING_TYPE );
+    bool bRet = nJoiningType != U_JT_RIGHT_JOINING && nJoiningType != 
U_JT_NON_JOINING;
 
     // check for ligatures cPrevChar + cChar
     if ( bRet )
@@ -2236,8 +2312,15 @@ void ImpEditEngine::ImpAdjustBlocks( ParaPortion* pParaPortion, EditLine* pLine,
     pLine->SetTextWidth( pLine->GetTextWidth() + nRemainingSpace );
 }
 
+// For Kashidas from sw/source/core/text/porlay.cxx
 void ImpEditEngine::ImpFindKashidas( ContentNode* pNode, sal_Int32 nStart, 
sal_Int32 nEnd, std::vector<sal_Int32>& rArray )
 {
+    // Kashida glyph looks suspicious, skip Kashida justification
+    if (GetRefDevice()->GetMinKashida() < 0)
+        return;
+
+    std::vector<sal_Int32> aKashidaArray;
+
     // the search has to be performed on a per word base
 
     EditSelection aWordSel( EditPaM( pNode, nStart ) );
@@ -2261,7 +2344,16 @@ void ImpEditEngine::ImpFindKashidas( ContentNode* pNode, sal_Int32 nStart, sal_I
         sal_Unicode cCh;
         sal_Unicode cPrevCh = 0;
 
-        while ( nIdx < aWord.getLength() )
+        int nPriorityLevel = 7;    // 0..6 = level found
+                                   // 7 not found
+
+        sal_Int32 nWordLen = aWord.getLength();
+
+        // ignore trailing vowel chars
+        while( nWordLen && isTransparentChar( aWord[ nWordLen - 1 ] ))
+            --nWordLen;
+
+        while ( nIdx < nWordLen )
         {
             cCh = aWord[ nIdx ];
 
@@ -2270,87 +2362,145 @@ void ImpEditEngine::ImpFindKashidas( ContentNode* pNode, sal_Int32 nStart, sal_I
             if ( 0x640 == cCh )
             {
                 nKashidaPos = aWordSel.Min().GetIndex() + nIdx;
-                break;
+                nPriorityLevel = 0;
             }
 
             // 2. Priority:
             // after a Seen or Sad
-            if ( nIdx + 1 < aWord.getLength() &&
-                 ( 0x633 == cCh || 0x635 == cCh ) )
+            if (nPriorityLevel >= 1 && nIdx < nWordLen - 1)
             {
-                nKashidaPos = aWordSel.Min().GetIndex() + nIdx;
-                break;
+                if( isSeenOrSadChar( cCh )
+                 && (aWord[ nIdx+1 ] != 0x200C) ) // #i98410#: prevent ZWNJ 
expansion
+                {
+                    nKashidaPos = aWordSel.Min().GetIndex() + nIdx;
+                    nPriorityLevel = 1;
+                }
             }
 
             // 3. Priority:
-            // before final form of the Marbuta, Hah, Dal
-            // 4. Priority:
-            // before final form of Alef, Lam or Kaf
-            if ( nIdx && nIdx + 1 == aWord.getLength() &&
-                 ( 0x629 == cCh || 0x62D == cCh || 0x62F == cCh ||
-                   0x627 == cCh || 0x644 == cCh || 0x643 == cCh ) )
+            // before final form of Teh Marbuta, Heh, Dal
+            if ( nPriorityLevel >= 2 && nIdx > 0 )
             {
-                DBG_ASSERT( 0 != cPrevCh, "No previous character" );
+                if ( isTehMarbutaChar ( cCh ) || // Teh Marbuta (right joining)
+                     isDalChar ( cCh ) ||        // Dal (right joining) final 
form may appear in the middle of word
+                     ( isHehChar ( cCh ) && nIdx == nWordLen - 1))  // Heh 
(dual joining) only at end of word
+                {
 
-                // check if character is connectable to previous character,
-                if ( lcl_ConnectToPrev( cCh, cPrevCh ) )
+                    SAL_WARN_IF( 0 == cPrevCh, "editeng", "No previous 
character" );
+                    // check if character is connectable to previous character,
+                    if ( lcl_ConnectToPrev( cCh, cPrevCh ) )
+                    {
+                        nKashidaPos = aWordSel.Min().GetIndex() + nIdx - 1;
+                        nPriorityLevel = 2;
+                    }
+                }
+            }
+
+            // 4. Priority:
+            // before final form of Alef, Tah, Lam, Kaf or Gaf
+            if ( nPriorityLevel >= 3 && nIdx > 0 )
+            {
+                if ( isAlefChar ( cCh ) ||   // Alef (right joining) final 
form may appear in the middle of word
+                     (( isLamChar ( cCh ) || // Lam,
+                      isTahChar ( cCh )   || // Tah,
+                      isKafChar ( cCh )   || // Kaf (all dual joining)
+                      isGafChar ( cCh ) )
+                      && nIdx == nWordLen - 1))  // only at end of word
                 {
-                    nKashidaPos = aWordSel.Min().GetIndex() + nIdx - 1;
-                    break;
+                    SAL_WARN_IF( 0 == cPrevCh, "editeng", "No previous 
character" );
+                    // check if character is connectable to previous character,
+                    if ( lcl_ConnectToPrev( cCh, cPrevCh ) )
+                    {
+                        nKashidaPos = aWordSel.Min().GetIndex() + nIdx - 1;
+                        nPriorityLevel = 3;
+                    }
                 }
             }
 
             // 5. Priority:
-            // before media Bah
-            if ( nIdx && nIdx + 1 < aWord.getLength() && 0x628 == cCh )
+            // before medial Beh-like
+            if ( nPriorityLevel >= 4 && nIdx > 0 && nIdx < nWordLen - 1 )
             {
-                DBG_ASSERT( 0 != cPrevCh, "No previous character" );
-
-                // check if next character is Reh, Yeh or Alef Maksura
-                sal_Unicode cNextCh = aWord[ nIdx + 1 ];
+                if ( isBehChar ( cCh ) )
+                {
+                    // check if next character is Reh or Yeh-like
+                    sal_Unicode cNextCh = aWord[ nIdx + 1 ];
+                    if ( isRehChar ( cNextCh ) || isYehChar ( cNextCh ))
+                    {
+                        SAL_WARN_IF( 0 == cPrevCh, "editeng", "No previous 
character" );
+                        // check if character is connectable to previous 
character,
+                        if ( lcl_ConnectToPrev( cCh, cPrevCh ) )
+                        {
+                            nKashidaPos = aWordSel.Min().GetIndex() + nIdx - 1;
+                            nPriorityLevel = 4;
+                        }
+                    }
+                }
+            }
 
-                if ( 0x631 == cNextCh || 0x64A == cNextCh ||
-                     0x649 == cNextCh )
+            // 6. Priority:
+            // before the final form of Waw, Ain, Qaf and Feh
+            if ( nPriorityLevel >= 5 && nIdx > 0 )
+            {
+                if ( isWawChar ( cCh )   || // Waw (right joining)
+                                            // final form may appear in the 
middle of word
+                     (( isAinChar ( cCh ) ||  // Ain (dual joining)
+                        isQafChar ( cCh ) ||  // Qaf (dual joining)
+                        isFehChar ( cCh ) )   // Feh (dual joining)
+                        && nIdx == nWordLen - 1))  // only at end of word
                 {
+                    SAL_WARN_IF( 0 == cPrevCh, "editeng", "No previous 
character" );
                     // check if character is connectable to previous character,
                     if ( lcl_ConnectToPrev( cCh, cPrevCh ) )
+                    {
                         nKashidaPos = aWordSel.Min().GetIndex() + nIdx - 1;
+                        nPriorityLevel = 5;
+                    }
                 }
             }
 
-            // 6. Priority:
             // other connecting possibilities
-            if ( nIdx && nIdx + 1 == aWord.getLength() &&
-                 0x60C <= cCh && 0x6FE >= cCh )
+            if ( nPriorityLevel >= 6 && nIdx > 0 )
             {
-                DBG_ASSERT( 0 != cPrevCh, "No previous character" );
-
-                // check if character is connectable to previous character,
-                if ( lcl_ConnectToPrev( cCh, cPrevCh ) )
+                // Reh, Zain
+                if ( isRehChar ( cCh ) )
                 {
-                    // only choose this position if we did not find
-                    // a better one:
-                    if ( nKashidaPos<0 )
+                    SAL_WARN_IF( 0 == cPrevCh, "editeng", "No previous 
character" );
+                    // check if character is connectable to previous character,
+                    if ( lcl_ConnectToPrev( cCh, cPrevCh ) )
+                    {
                         nKashidaPos = aWordSel.Min().GetIndex() + nIdx - 1;
-                    break;
+                        nPriorityLevel = 6;
+                    }
                 }
             }
 
-            // Do not consider Fathatan, Dammatan, Kasratan, Fatha,
-            // Damma, Kasra, Shadda and Sukun when checking if
-            // a character can be connected to previous character.
-            if ( cCh < 0x64B || cCh > 0x652 )
+            // Do not consider vowel marks when checking if a character
+            // can be connected to previous character.
+            if ( !isTransparentChar ( cCh) )
                 cPrevCh = cCh;
 
             ++nIdx;
         } // end of current word
 
         if ( nKashidaPos>=0 )
-            rArray.push_back( nKashidaPos );
+            aKashidaArray.push_back( nKashidaPos );
 
         aWordSel = WordRight( aWordSel.Max(), 
css::i18n::WordType::DICTIONARY_WORD );
         aWordSel = SelectWord( aWordSel, css::i18n::WordType::DICTIONARY_WORD 
);
     }
+
+    // Validate
+    std::vector<sal_Int32> aDropped(aKashidaArray.size());
+    auto nOldLayout = GetRefDevice()->GetLayoutMode();
+    GetRefDevice()->SetLayoutMode(nOldLayout | 
vcl::text::ComplexTextLayoutFlags::BiDiRtl);
+    GetRefDevice()->ValidateKashidas(pNode->GetString(), nStart, nEnd - nStart,
+            aKashidaArray.size(), aKashidaArray.data(), aDropped.data());
+    GetRefDevice()->SetLayoutMode(nOldLayout);
+
+    for (auto const& pos : aKashidaArray)
+        if (std::find(aDropped.begin(), aDropped.end(), pos) == aDropped.end())
+             rArray.push_back(pos);
 }
 
 sal_Int32 ImpEditEngine::SplitTextPortion( ParaPortion* pPortion, sal_Int32 
nPos, EditLine* pCurLine )

Reply via email to