include/svtools/htmlkywd.hxx | 1 + include/svtools/htmltokn.h | 1 + sc/qa/unit/bugfix-test.cxx | 25 +++++++++++++++++++++++++ sc/qa/unit/data/html/tdf88821-2.html | 19 +++++++++++++++++++ sc/qa/unit/data/html/tdf88821.html | 23 +++++++++++++++++++++++ svtools/source/svhtml/htmlkywd.cxx | 1 + svtools/source/svhtml/parhtml.cxx | 5 ++++- svtools/source/svrtf/svparser.cxx | 7 +++---- 8 files changed, 77 insertions(+), 5 deletions(-)
New commits: commit 84400eae86d7ae8e66f8247f4c4f3a717d90f8c0 Author: Jan Holesovsky <ke...@collabora.com> Date: Thu Nov 3 22:27:12 2016 +0100 tdf#88821: Implement support for <meta charset="..."> for HTML import. The editengine HTML import was not handling it at all, and consequently not setting the right encoding when importing HTML in Calc. Change-Id: I3ca3dd20f36cfb579fb7ae4cd3da63a69d97601e diff --git a/include/svtools/htmlkywd.hxx b/include/svtools/htmlkywd.hxx index 54309a7..4cc2494 100644 --- a/include/svtools/htmlkywd.hxx +++ b/include/svtools/htmlkywd.hxx @@ -422,6 +422,7 @@ #define OOO_STRING_SVTOOLS_HTML_O_alt "alt" #define OOO_STRING_SVTOOLS_HTML_O_axis "axis" #define OOO_STRING_SVTOOLS_HTML_O_char "char" +#define OOO_STRING_SVTOOLS_HTML_O_charset "charset" #define OOO_STRING_SVTOOLS_HTML_O_class "class" #define OOO_STRING_SVTOOLS_HTML_O_code "code" #define OOO_STRING_SVTOOLS_HTML_O_codetype "codetype" diff --git a/include/svtools/htmltokn.h b/include/svtools/htmltokn.h index eeea777..37ca30e 100644 --- a/include/svtools/htmltokn.h +++ b/include/svtools/htmltokn.h @@ -308,6 +308,7 @@ HTML_OPTION_STRING_START = HTML_OPTION_BOOL_END, HTML_O_ALT, HTML_O_AXIS, HTML_O_CHAR, // HTML3 Table Model Draft + HTML_O_CHARSET, HTML_O_CLASS, HTML_O_CODE, // HotJava HTML_O_CODETYPE, diff --git a/sc/qa/unit/bugfix-test.cxx b/sc/qa/unit/bugfix-test.cxx index 3968d38..2cf5b12 100644 --- a/sc/qa/unit/bugfix-test.cxx +++ b/sc/qa/unit/bugfix-test.cxx @@ -88,6 +88,7 @@ public: // void testTdf40110(); void testTdf98657(); void testTdf88821(); + void testTdf88821_2(); CPPUNIT_TEST_SUITE(ScFiltersTest); CPPUNIT_TEST(testTdf64229); @@ -98,6 +99,7 @@ public: // CPPUNIT_TEST(testTdf40110); CPPUNIT_TEST(testTdf98657); CPPUNIT_TEST(testTdf88821); + CPPUNIT_TEST(testTdf88821_2); CPPUNIT_TEST_SUITE_END(); private: uno::Reference<uno::XInterface> m_xCalcComponent; @@ -256,6 +258,17 @@ void ScFiltersTest::testTdf88821() xDocSh->DoClose(); } +void ScFiltersTest::testTdf88821_2() +{ + 
ScDocShellRef xDocSh = loadDoc("tdf88821-2.", FORMAT_HTML); + ScDocument& rDoc = xDocSh->GetDocument(); + + // A2 should be 'ABCabcČŠŽčšž', not 'ABCabcÃÅà  à ½Ãï¾Ã ¡à ¾' + CPPUNIT_ASSERT_EQUAL(OStringToOUString("ABCabc\xC4\x8C\xC5\xA0\xC5\xBD\xC4\x8D\xC5\xA1\xC5\xBE", RTL_TEXTENCODING_UTF8), rDoc.GetString(0, 1, 0)); + + xDocSh->DoClose(); +} + ScFiltersTest::ScFiltersTest() : ScBootstrapFixture( "/sc/qa/unit/data" ) { diff --git a/sc/qa/unit/data/html/tdf88821-2.html b/sc/qa/unit/data/html/tdf88821-2.html new file mode 100644 index 0000000..e71094a --- /dev/null +++ b/sc/qa/unit/data/html/tdf88821-2.html @@ -0,0 +1,19 @@ +<html lang="en"> +<head> +<meta charset="UTF-8"> +</head> +<body> + <table border="1"> + <tr> + <td>Text</td> + <td>Decimal</td> + <td>Date</td> + </tr> + <tr> + <td>ABCabcČŠŽčšž</td> + <td>10,50</td> + <td>30.1.2015</td> + </tr> + </table> +</body> +</html> diff --git a/svtools/source/svhtml/htmlkywd.cxx b/svtools/source/svhtml/htmlkywd.cxx index 6034082..2cc0a9e 100644 --- a/svtools/source/svhtml/htmlkywd.cxx +++ b/svtools/source/svhtml/htmlkywd.cxx @@ -576,6 +576,7 @@ static HTML_TokenEntry aHTMLOptionTab[] = { {{OOO_STRING_SVTOOLS_HTML_O_alt}, HTML_O_ALT}, {{OOO_STRING_SVTOOLS_HTML_O_axis}, HTML_O_AXIS}, {{OOO_STRING_SVTOOLS_HTML_O_char}, HTML_O_CHAR}, // HTML 3 Table Model Draft + {{OOO_STRING_SVTOOLS_HTML_O_charset}, HTML_O_CHARSET}, {{OOO_STRING_SVTOOLS_HTML_O_class}, HTML_O_CLASS}, {{OOO_STRING_SVTOOLS_HTML_O_code}, HTML_O_CODE}, // HotJava {{OOO_STRING_SVTOOLS_HTML_O_codetype}, HTML_O_CODETYPE}, diff --git a/svtools/source/svhtml/parhtml.cxx b/svtools/source/svhtml/parhtml.cxx index a47b4e4..c09ecc5 100644 --- a/svtools/source/svhtml/parhtml.cxx +++ b/svtools/source/svhtml/parhtml.cxx @@ -1928,6 +1928,10 @@ bool HTMLParser::ParseMetaOptionsImpl( case HTML_O_CONTENT: aContent = aOption.GetString(); break; + case HTML_O_CHARSET: + OString sValue(OUStringToOString(aOption.GetString(), RTL_TEXTENCODING_ASCII_US)); + o_rEnc = 
GetExtendedCompatibilityTextEncoding(rtl_getTextEncodingFromMimeCharset(sValue.getStr())); + break; } } @@ -1942,7 +1946,6 @@ bool HTMLParser::ParseMetaOptionsImpl( aContent = convertLineEnd(aContent, GetSystemLineEnd()); } - if ( bHTTPEquiv && i_pHTTPHeader ) { // Netscape seems to just ignore a closing ", so we do too commit b297f7bbfed83f87398231740e910afe6ebfbb97 Author: Jan Holesovsky <ke...@collabora.com> Date: Thu Nov 3 17:14:01 2016 +0100 tdf#88821: Set the encoding correctly for HTML files with a BOM. BOM (Byte Order Mark) in the HTML file changed the underlying eSrcEnc encoding, but did not actually update the rtl_TextToUnicodeConverter hConv. Subsequent changes of eSrcEnc in SetSrcEncoding() (triggered by 'content="application/xhtml+xml; charset=UTF-8"' in the HTML file) were then ignored (eSrcEnc was already set to UTF-8), and the parser was happily using the old (Windows-1250) hConv. Change-Id: If432d59891d51c6abe3517e325ed73057d0f8610 diff --git a/sc/qa/unit/bugfix-test.cxx b/sc/qa/unit/bugfix-test.cxx index 6213593..3968d38 100644 --- a/sc/qa/unit/bugfix-test.cxx +++ b/sc/qa/unit/bugfix-test.cxx @@ -87,6 +87,7 @@ public: void testTdf91979(); // void testTdf40110(); void testTdf98657(); + void testTdf88821(); CPPUNIT_TEST_SUITE(ScFiltersTest); CPPUNIT_TEST(testTdf64229); @@ -96,6 +97,7 @@ public: CPPUNIT_TEST(testTdf91979); // CPPUNIT_TEST(testTdf40110); CPPUNIT_TEST(testTdf98657); + CPPUNIT_TEST(testTdf88821); CPPUNIT_TEST_SUITE_END(); private: uno::Reference<uno::XInterface> m_xCalcComponent; @@ -243,6 +245,16 @@ void ScFiltersTest::testTdf98657() CPPUNIT_ASSERT_EQUAL(double(285.0), rDoc.GetValue(ScAddress(1, 1, 0))); } +void ScFiltersTest::testTdf88821() +{ + ScDocShellRef xDocSh = loadDoc("tdf88821.", FORMAT_HTML); + ScDocument& rDoc = xDocSh->GetDocument(); + + // B2 should be 'Périmètre', not 'PÃ©rimÃ¨tre' + CPPUNIT_ASSERT_EQUAL(OStringToOUString("P\xC3\xA9rim\xC3\xA8tre", RTL_TEXTENCODING_UTF8), rDoc.GetString(1, 1, 0)); + + xDocSh->DoClose(); +} 
ScFiltersTest::ScFiltersTest() : ScBootstrapFixture( "/sc/qa/unit/data" ) diff --git a/sc/qa/unit/data/html/tdf88821.html b/sc/qa/unit/data/html/tdf88821.html new file mode 100644 index 0000000..f8e22c8 --- /dev/null +++ b/sc/qa/unit/data/html/tdf88821.html @@ -0,0 +1,23 @@ +<meta http-equiv="Content-type" content="application/xhtml+xml; charset=UTF-8" xmlns:myObj="urn:ms-kb" xmlns:myObjConvertBool="urn:ms-bool" xmlns:myObjConvertDecimal="urn:ms-dec" xmlns:myObjConvertText="urn:ms-text" /> +<HTML xmlns:myObj="urn:ms-kb" xmlns:myObjConvertBool="urn:ms-bool" xmlns:myObjConvertDecimal="urn:ms-dec" xmlns:myObjConvertText="urn:ms-text"> + <HEAD> + <STYLE>.HDR { background-color:bisque;font-weight:bold }</STYLE> + </HEAD> + <BODY> + <TABLE> + <COLGROUP WIDTH="150" ALIGN="LEFT" /> + <COLGROUP WIDTH="150" ALIGN="LEFT" /> + <TD CLASS="HDR" ALIGN="CENTER"> + Code de la liste</TD> + <TD CLASS="HDR" ALIGN="CENTER"> + Libellé de la liste</TD> + <TR> + <TD CLASS="TDR"> + ACT_PERIMETRE</TD> + <TD CLASS="TDR"> + Périmètre</TD> + </TR> + </TABLE> + </BODY> +</HTML> + diff --git a/svtools/source/svrtf/svparser.cxx b/svtools/source/svrtf/svparser.cxx index b749400..ae6c1eb 100644 --- a/svtools/source/svrtf/svparser.cxx +++ b/svtools/source/svrtf/svparser.cxx @@ -104,7 +104,6 @@ void SvParser::ClearTxtConvContext() void SvParser::SetSrcEncoding( rtl_TextEncoding eEnc ) { - if( eEnc != eSrcEnc ) { if( pImplData && pImplData->hConv ) @@ -172,13 +171,13 @@ sal_uInt32 SvParser::GetNextChar() { if( 0xfe == c1 && 0xff == c2 ) { - eSrcEnc = RTL_TEXTENCODING_UCS2; + SetSrcEncoding(RTL_TEXTENCODING_UCS2); bUCS2BSrcEnc = true; bSeekBack = false; } else if( 0xff == c1 && 0xfe == c2 ) { - eSrcEnc = RTL_TEXTENCODING_UCS2; + SetSrcEncoding(RTL_TEXTENCODING_UCS2); bUCS2BSrcEnc = false; bSeekBack = false; } @@ -198,7 +197,7 @@ sal_uInt32 SvParser::GetNextChar() bErr = rInput.IsEof() || rInput.GetError(); if( !bErr && ( 0xbf == c3 ) ) { - eSrcEnc = RTL_TEXTENCODING_UTF8; + 
SetSrcEncoding(RTL_TEXTENCODING_UTF8); bSeekBack = false; } }
_______________________________________________ Libreoffice-commits mailing list libreoffice-comm...@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/libreoffice-commits