include/svtools/htmlkywd.hxx       |    1 
 include/svtools/htmltokn.h         |    2 +
 svtools/qa/unit/testHtmlReader.cxx |   23 +++++++++++++++++++++
 svtools/source/svhtml/htmlkywd.cxx |    4 +++
 svtools/source/svhtml/parhtml.cxx  |   39 +++++++++++++++++++++++++++++++++++++
 sw/source/filter/html/swhtml.cxx   |    1 
 6 files changed, 70 insertions(+)

New commits:
commit b38730ae0ae92ca49b84a45853c2ed098ee9064f
Author:     Miklos Vajna <vmik...@collabora.com>
AuthorDate: Tue Oct 25 15:55:34 2022 +0200
Commit:     Miklos Vajna <vmik...@collabora.com>
CommitDate: Tue Oct 25 18:15:47 2022 +0200

    sw html import: fix handling of CDATA
    
    In case the HTML contained markup like <![CDATA[...]]>, we simply
    ignored it during import, even though e.g. the ODT import handles that
    correctly.
    
    The reason for this is that the svtools/ HTMLParser had code to parse
    <!-- ... --> style comments, but none for CDATA sections.
    
    Fix the problem by introducing a new HtmlTokenId::CDATA, producing a
    matching token content in HTMLParser::GetNextToken_(), and finally map
    it to normal text on the Writer side.
    
    Note that HtmlTokenId doesn't allow non-on-off tokens past ONOFF_START,
    nor does it allow inserting a single token before ONOFF_START (that
    breaks getOnToken()), so for now just add a second, dummy token to avoid
    breakage.
    
    Change-Id: I605c3c21dc11986fda5d93d36148788a638e97b4
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/141813
    Reviewed-by: Miklos Vajna <vmik...@collabora.com>
    Tested-by: Jenkins

diff --git a/include/svtools/htmlkywd.hxx b/include/svtools/htmlkywd.hxx
index 5d6b7e629fe7..9a84cddd37bf 100644
--- a/include/svtools/htmlkywd.hxx
+++ b/include/svtools/htmlkywd.hxx
@@ -32,6 +32,7 @@
 #define OOO_STRING_SVTOOLS_HTML_base "base"
 #define OOO_STRING_SVTOOLS_HTML_comment "!--"
 #define OOO_STRING_SVTOOLS_HTML_doctype "!DOCTYPE"
+#define OOO_STRING_SVTOOLS_HTML_cdata "![cdata["
 #define OOO_STRING_SVTOOLS_HTML_embed "embed"
 #define OOO_STRING_SVTOOLS_HTML_horzrule "hr"
 #define OOO_STRING_SVTOOLS_HTML_image "img"
diff --git a/include/svtools/htmltokn.h b/include/svtools/htmltokn.h
index bfa1f14d6812..9dca8a8f3ea7 100644
--- a/include/svtools/htmltokn.h
+++ b/include/svtools/htmltokn.h
@@ -58,6 +58,8 @@ enum class HtmlTokenId : sal_Int16
     AREA, // Netscape 2.0
     BASE, // HTML 3.0
     COMMENT,
+    CDATA,
+    DUMMY, // so ONOFF_START is even
     DOCTYPE,
     EMBED, // Netscape 2.0            ignore </EMBED>
     HORZRULE,                      // ignore </HR>
diff --git a/svtools/qa/unit/testHtmlReader.cxx b/svtools/qa/unit/testHtmlReader.cxx
index 146458a200eb..37f74e903bcc 100644
--- a/svtools/qa/unit/testHtmlReader.cxx
+++ b/svtools/qa/unit/testHtmlReader.cxx
@@ -27,6 +27,7 @@ public:
 
     OUString m_aDocument;
     int m_nLineBreakCount = 0;
+    OUString m_aCdata;
 };
 
 TestHTMLParser::TestHTMLParser(SvStream& rStream)
@@ -40,6 +41,8 @@ void TestHTMLParser::NextToken(HtmlTokenId nToken)
         m_aDocument += aToken;
     else if (nToken == HtmlTokenId::LINEBREAK)
         ++m_nLineBreakCount;
+    else if (nToken == HtmlTokenId::CDATA)
+        m_aCdata = aToken;
 }
 
 /// Tests HTMLParser.
@@ -76,6 +79,26 @@ CPPUNIT_TEST_FIXTURE(Test, testLineBreak)
     // This was 2, <br></br> was interpreted as 2 line breaks in XHTML mode.
     CPPUNIT_ASSERT_EQUAL(1, xParser->m_nLineBreakCount);
 }
+
+CPPUNIT_TEST_FIXTURE(Test, testCdata)
+{
+    // Given a document with a CDATA section between "A" and "C" whose content looks like entities:
+    SvMemoryStream aStream;
+    OString aDocument("A<![CDATA[B &uuml; &lt;]]>C");
+    aStream.WriteBytes(aDocument.getStr(), aDocument.getLength());
+    aStream.Seek(0);
+
+    // When parsing that HTML (TestHTMLParser::NextToken() stores the CDATA token text in m_aCdata):
+    tools::SvRef<TestHTMLParser> xParser = new TestHTMLParser(aStream);
+    xParser->CallParser();
+
+    // Then make sure that we get a cdata token with the verbatim (undecoded) content:
+    // Without the accompanying fix in place, this test would have failed with:
+    // - Expected: B &uuml; &lt;
+    // - Actual  :
+    // i.e. the content inside CDATA was lost.
+    CPPUNIT_ASSERT_EQUAL(OUString("B &uuml; &lt;"), xParser->m_aCdata);
+}
 }
 
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/svtools/source/svhtml/htmlkywd.cxx b/svtools/source/svhtml/htmlkywd.cxx
index 2d51910d85e9..584322fac8bc 100644
--- a/svtools/source/svhtml/htmlkywd.cxx
+++ b/svtools/source/svhtml/htmlkywd.cxx
@@ -27,6 +27,9 @@
 #include <svtools/htmltokn.h>
 #include <svtools/htmlkywd.hxx>
 
+// If this is odd, then getOnToken() breaks.
+static_assert(static_cast<sal_Int16>(HtmlTokenId::ABBREVIATION_ON) % 2 == 0);
+
 namespace {
 
 template<typename T>
@@ -64,6 +67,7 @@ using HTML_TokenEntry = TokenEntry<HtmlTokenId>;
 HTML_TokenEntry const aHTMLTokenTab[] = {
     {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_comment),         
HtmlTokenId::COMMENT},
     {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_doctype),         
HtmlTokenId::DOCTYPE},
+    {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_cdata),           
HtmlTokenId::CDATA},
     {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_anchor),          
HtmlTokenId::ANCHOR_ON},
     {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_abbreviation),    
HtmlTokenId::ABBREVIATION_ON},  // HTML 3.0
     {std::u16string_view(u"" OOO_STRING_SVTOOLS_HTML_acronym),         
HtmlTokenId::ACRONYM_ON},   // HTML 3.0
diff --git a/svtools/source/svhtml/parhtml.cxx b/svtools/source/svhtml/parhtml.cxx
index 0ac10578981c..e705c98013e4 100644
--- a/svtools/source/svhtml/parhtml.cxx
+++ b/svtools/source/svhtml/parhtml.cxx
@@ -1053,6 +1053,10 @@ HtmlTokenId HTMLParser::GetNextToken_()
                     do {
                         sTmpBuffer.appendUtf32( nNextCh );
                         nNextCh = GetNextChar();
+                        if (std::u16string_view(sTmpBuffer) == u"![CDATA[")
+                        {
+                            break;
+                        }
                     } while( '>' != nNextCh && '/' != nNextCh && 
!rtl::isAsciiWhiteSpace( nNextCh ) &&
                             !linguistic::IsControlChar(nNextCh) &&
                              IsParserWorking() && !rInput.eof() );
@@ -1152,6 +1156,41 @@ HtmlTokenId HTMLParser::GetNextToken_()
                             nNextCh = '>';
                         }
                     }
+                    else if (nRet == HtmlTokenId::CDATA)
+                    {
+                        // Read until the closing ]]>.
+                        bool bDone = false;
+                        while (!bDone && !rInput.eof() && IsParserWorking())
+                        {
+                            if (nNextCh == '>')
+                            {
+                                if (sTmpBuffer.getLength() >= 2)
+                                {
+                                    bDone = sTmpBuffer[sTmpBuffer.getLength() 
- 2] == ']'
+                                            && 
sTmpBuffer[sTmpBuffer.getLength() - 1] == ']';
+                                    if (bDone)
+                                    {
+                                        // Ignore ]] at the end.
+                                        
sTmpBuffer.setLength(sTmpBuffer.getLength() - 2);
+                                    }
+                                }
+                                if (!bDone)
+                                {
+                                    sTmpBuffer.appendUtf32(nNextCh);
+                                }
+                            }
+                            else if (!linguistic::IsControlChar(nNextCh))
+                            {
+                                sTmpBuffer.appendUtf32(nNextCh);
+                            }
+                            if (!bDone)
+                            {
+                                nNextCh = GetNextChar();
+                            }
+                        }
+                        aToken = sTmpBuffer;
+                        sTmpBuffer.setLength(0);
+                    }
                     else
                     {
                         // TokenString not needed anymore
diff --git a/sw/source/filter/html/swhtml.cxx b/sw/source/filter/html/swhtml.cxx
index 804fbb743e9e..f9f0e7f59bd7 100644
--- a/sw/source/filter/html/swhtml.cxx
+++ b/sw/source/filter/html/swhtml.cxx
@@ -1522,6 +1522,7 @@ void SwHTMLParser::NextToken( HtmlTokenId nToken )
         break;
 
     case HtmlTokenId::TEXTTOKEN:
+    case HtmlTokenId::CDATA:
         // insert string without spanning attributes at the end.
         if( !aToken.isEmpty() && ' '==aToken[0] && !IsReadPRE() )
         {

Reply via email to