sc/source/filter/oox/richstring.cxx |  112 +++++++++++++++++++++++++++++++++++-
 1 file changed, 111 insertions(+), 1 deletion(-)

New commits:
commit ee7c921ca6ccfe8fa1b223cafc3d8a8452a5d316
Author:     Dennis Francis <dennis.fran...@collabora.com>
AuthorDate: Tue Aug 17 14:38:21 2021 +0530
Commit:     Andras Timar <andras.ti...@collabora.com>
CommitDate: Wed Aug 18 13:54:48 2021 +0200

    sc oox: recover escaped unicode chars in strings import
    
    according to OOX open spec 2.1.1742 Part 1 Section 22.9.2.19, ST_Xstring
    (Escaped String). In this implementation, some restrictions mentioned in
    this spec are not kept for simplicity.
    
    Change-Id: If27797a9625d49be54c600c8a864965f1101ceb1
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/120570
    Tested-by: Jenkins CollaboraOffice <jenkinscollaboraoff...@gmail.com>
    Reviewed-by: Andras Timar <andras.ti...@collabora.com>

diff --git a/sc/source/filter/oox/richstring.cxx b/sc/source/filter/oox/richstring.cxx
index 4f2e937ffc3e..1b8126f933f6 100644
--- a/sc/source/filter/oox/richstring.cxx
+++ b/sc/source/filter/oox/richstring.cxx
@@ -49,6 +49,116 @@ bool lclNeedsRichTextFormat( const oox::xls::Font* pFont )
     return pFont && pFont->needsRichTextFormat();
 }
 
+/** Converts one hexadecimal digit character to its numeric value.
+
+    @param nCode  UTF-16 code unit to interpret.
+    @return  0..15 for '0'-'9', 'A'-'F' or 'a'-'f'; -1 for anything else.
+ */
+sal_Int32 lcl_getHexLetterValue(sal_Unicode nCode)
+{
+    const bool bDecimalDigit = '0' <= nCode && nCode <= '9';
+    if (bDecimalDigit)
+        return nCode - '0';
+
+    const bool bUpperHex = 'A' <= nCode && nCode <= 'F';
+    const bool bLowerHex = 'a' <= nCode && nCode <= 'f';
+    if (!bUpperHex && !bLowerHex)
+        return -1;
+
+    // Letters map to 10..15, counted from the 'A' or 'a' of the matching case.
+    return nCode - (bUpperHex ? 'A' : 'a') + 10;
+}
+
+/** Tells whether a code unit decoded from an _xHHHH_ sequence is accepted as
+    an escaped character.
+
+    Follows ST_Xstring (Escaped String), OOX open spec 2.1.1742 Part 1
+    Section 22.9.2.19, while ignoring the restrictions stated there.
+
+    @param nCode  Code unit decoded from the hex digits of an escape sequence.
+    @return  true for CR, LF, TAB and '_', and for every code unit outside the
+             ranges 0x0020-0xD7FF and 0xE000-0xFFFD; false otherwise.
+ */
+bool lcl_validEscape(sal_Unicode nCode)
+{
+    // Valid XML chars that can be escaped (ignoring the restrictions) as in the OOX open spec
+    // 2.1.1742 Part 1 Section 22.9.2.19, ST_Xstring (Escaped String)
+    if (nCode == 0x000D || nCode == 0x000A || nCode == 0x0009 || nCode == 0x005F)
+        return true;
+
+    // Other valid XML chars in basic multilingual plane that cannot be escaped.
+    if ((nCode >= 0x0020 && nCode <= 0xD7FF) || (nCode >= 0xE000 && nCode <= 0xFFFD))
+        return false;
+
+    // What is left: code units below 0x0020 other than TAB/LF/CR,
+    // 0xD800-0xDFFF, 0xFFFE and 0xFFFF.
+    return true;
+}
+
+/** Replaces escape sequences of the form _xHHHH_ in rSrc by the character
+    they encode.
+
+    Example: Escaped representation of unicode char 0x000D is _x000D_
+
+    A sequence is "_x", followed by one to four hex digits (either case) and a
+    closing '_'. It is replaced only when lcl_validEscape() accepts the decoded
+    value; all other text is copied unchanged.
+
+    @param rSrc  String to unescape.
+    @return  rSrc itself when nothing was replaced, otherwise the unescaped
+             string.
+ */
+OUString lcl_unEscapeUnicodeChars(const OUString& rSrc)
+{
+    sal_Int32 nLen = rSrc.getLength();
+    if (!nLen)
+        return rSrc;
+
+    sal_Int32 nStart = 0;
+    // Becomes true once at least one escape sequence has been replaced, so an
+    // untouched input can be handed back without building a new string.
+    bool bFound = false;
+    const OUString aPrefix = "_x";
+    sal_Int32 nPrefixStart = rSrc.indexOf(aPrefix, nStart);
+
+    if (nPrefixStart == -1)
+        return rSrc;
+
+    // All searching and index arithmetic is done on rSrc; aBuf receives the
+    // replacements and therefore gets shorter than rSrc as they are applied.
+    OUStringBuffer aBuf(rSrc);
+    sal_Int32 nOffset = 0; // index offset in aBuf w.r.t rSrc.
+
+    do
+    {
+        sal_Int32 nEnd = -1;
+        sal_Unicode nCode = 0;
+        bool bFoundThis = false;
+        // Look at up to five code units after "_x": hex digits and then the
+        // closing '_', which can come no later than the fifth position.
+        for (sal_Int32 nIdx = 0; nIdx < 5; ++nIdx)
+        {
+            sal_Int32 nThisIdx = nPrefixStart + nIdx + 2;
+            if (nThisIdx >= nLen)
+                break;
+
+            sal_Unicode nThisCode = rSrc[nThisIdx];
+            sal_Int32 nLetter = lcl_getHexLetterValue(nThisCode);
+
+            // At least one hex digit is required directly after "_x".
+            if (!nIdx && nLetter < 0)
+                break;
+
+            if (nLetter >= 0)
+            {
+                nCode = (nCode << 4) + static_cast<sal_Unicode>(nLetter);
+            }
+            else if (nThisCode == '_')
+            {
+                nEnd = nThisIdx + 1;
+                bFoundThis = true;
+                break;
+            }
+            else
+            {
+                break;
+            }
+        }
+
+        if (bFoundThis)
+        {
+            // nEnd is already set inside the inner loop in this case.
+            // If the decoded value is rejected the text stays as it is and the
+            // search still resumes after the closing '_'.
+            if (lcl_validEscape(nCode))
+            {
+                bFound = true;
+                sal_Int32 nEscStrLen = nEnd - nPrefixStart;
+                aBuf.remove(nPrefixStart - nOffset, nEscStrLen);
+                aBuf.insert(nPrefixStart - nOffset, nCode);
+
+                // The whole sequence shrank to a single code unit.
+                nOffset += nEscStrLen - 1;
+            }
+        }
+        else
+        {
+            // Start the next search just after last "_x"
+            nEnd = nPrefixStart + 2;
+        }
+
+        nStart = nEnd;
+        nPrefixStart = rSrc.indexOf(aPrefix, nStart);
+    }
+    while (nPrefixStart != -1);
+
+    if (bFound)
+        return aBuf.makeStringAndClear();
+
+    return rSrc;
+}
+
 } // namespace
 
 RichStringPortion::RichStringPortion( const WorkbookHelper& rHelper ) :
@@ -60,7 +170,7 @@ RichStringPortion::RichStringPortion( const WorkbookHelper& rHelper ) :
 
 void RichStringPortion::setText( const OUString& rText )
 {
-    maText = rText;
+    maText = lcl_unEscapeUnicodeChars(rText);
 }
 
 FontRef const & RichStringPortion::createFont()

Reply via email to