Author: tallison Date: Thu Aug 15 01:59:26 2013 New Revision: 1514126 URL: http://svn.apache.org/r1514126 Log: TIKA-1001 more flexible HTML meta-header encoding detector
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_1.html tika/trunk/tika-parsers/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_2.html tika/trunk/tika-parsers/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_3.html tika/trunk/tika-parsers/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_4.html Modified: tika/trunk/CHANGES.txt tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Modified: tika/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1514126&r1=1514125&r2=1514126&view=diff ============================================================================== --- tika/trunk/CHANGES.txt (original) +++ tika/trunk/CHANGES.txt Thu Aug 15 01:59:26 2013 @@ -1,6 +1,8 @@ Release 1.5 - Current Development + * Made HtmlEncodingDetector more flexible in finding meta + header charset (TIKA-1001). - * Added sanitized test HTML file for local file test (Tika-1139). + * Added sanitized test HTML file for local file test (TIKA-1139). * Fixed bug that prevented attachments within a PDF from being processed if the PDF itself was an attachment (TIKA-1124). 
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java?rev=1514126&r1=1514125&r2=1514126&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java Thu Aug 15 01:59:26 2013 @@ -42,14 +42,25 @@ public class HtmlEncodingDetector implem // TIKA-357 - use bigger buffer for meta tag sniffing (was 4K) private static final int META_TAG_BUFFER_SIZE = 8192; - private static final Pattern HTTP_EQUIV_PATTERN = Pattern.compile( - "(?is)<meta\\s+http-equiv\\s*=\\s*['\\\"]\\s*" - + "Content-Type['\\\"]\\s+content\\s*=\\s*['\\\"]" - + "([^'\\\"]+)['\\\"]"); - - private static final Pattern META_CHARSET_PATTERN = Pattern.compile( - "(?is)<meta\\s+charset\\s*=\\s*['\\\"]([^'\\\"]+)['\\\"]"); - + + private static final Pattern HTTP_META_PATTERN = Pattern.compile( + "(?is)<\\s*meta\\s+([^<>]+)" + ); + + //this should match both the older: + //<meta http-equiv="content-type" content="text/html; charset=xyz"/> + //and + //html5 <meta charset="xyz"> + //See http://webdesign.about.com/od/metatags/qt/meta-charset.htm + //for the noisiness that one might encounter in charset attrs. 
+ //Chose to go with strict ([-_:\\.a-z0-9]+) to match encodings + //following http://docs.oracle.com/javase/7/docs/api/java/nio/charset/Charset.html + //For a more general "not" matcher, try: + //("(?is)charset\\s*=\\s*['\\\"]?\\s*([^<>\\s'\\\";]+)") + private static final Pattern FLEXIBLE_CHARSET_ATTR_PATTERN = Pattern.compile( + ("(?is)charset\\s*=\\s*(?:['\\\"]\\s*)?([-_:\\.a-z0-9]+)") + ); + private static final Charset ASCII = Charset.forName("US-ASCII"); public Charset detect(InputStream input, Metadata metadata) @@ -71,32 +82,28 @@ public class HtmlEncodingDetector implem // Interpret the head as ASCII and try to spot a meta tag with // a possible character encoding hint - String charset = null; + String head = ASCII.decode(ByteBuffer.wrap(buffer, 0, n)).toString(); - Matcher equiv = HTTP_EQUIV_PATTERN.matcher(head); - if (equiv.find()) { - MediaType type = MediaType.parse(equiv.group(1)); - if (type != null) { - charset = type.getParameters().get("charset"); - } - } - if (charset == null) { - // TIKA-892: HTML5 meta charset tag - Matcher meta = META_CHARSET_PATTERN.matcher(head); - if (meta.find()) { - charset = meta.group(1); - } - } - - if (charset != null) { - try { - return CharsetUtils.forName(charset); - } catch (Exception e) { - // ignore - } + Matcher equiv = HTTP_META_PATTERN.matcher(head); + Matcher charsetMatcher = FLEXIBLE_CHARSET_ATTR_PATTERN.matcher(""); + //iterate through meta tags + while (equiv.find()) { + String attrs = equiv.group(1); + charsetMatcher.reset(attrs); + //iterate through charset= and return the first match + //that is valid + while (charsetMatcher.find()){ + String candCharset = charsetMatcher.group(1); + if (CharsetUtils.isSupported(candCharset)){ + try{ + return CharsetUtils.forName(candCharset); + } catch (Exception e){ + //ignore + } + } + } } - return null; } Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=1514126&r1=1514125&r2=1514126&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Thu Aug 15 01:59:26 2013 @@ -809,4 +809,17 @@ public class HtmlParserTest extends Test HtmlParserTest.class.getResourceAsStream("/test-documents/testUserDefinedCharset.mhtml"), new Metadata()); assertNotNull(content); } + + //TIKA-1001 + public void testNoisyMetaCharsetHeaders() throws Exception { + Tika tika = new Tika(); + String hit = "\u0623\u0639\u0631\u0628"; + + for (int i = 1; i <=4; i++){ + String fileName = "/test-documents/testHTMLNoisyMetaEncoding_"+i+".html"; + String content = tika.parseToString( + HtmlParserTest.class.getResourceAsStream(fileName)); + assertTrue("testing: " +fileName, content.contains(hit)); + } + } } Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_1.html URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_1.html?rev=1514126&view=auto ============================================================================== --- tika/trunk/tika-parsers/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_1.html (added) +++ tika/trunk/tika-parsers/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_1.html Thu Aug 15 01:59:26 2013 @@ -0,0 +1,77 @@ +<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"> + +<head> +<meta http-equiv=Content-Type content="text/html; charset=iso-8859-6"> +<meta name=Generator 
content="Microsoft Word 12 (filtered medium)"> +<style> +<!-- + /* Font Definitions */ + @font-face + {font-family:"Cambria Math"; + panose-1:0 0 0 0 0 0 0 0 0 0;} +@font-face + {font-family:Calibri; + panose-1:2 15 5 2 2 2 4 3 2 4;} +@font-face + {font-family:"Arabic Transparent"; + panose-1:2 1 0 0 0 0 0 0 0 0;} + /* Style Definitions */ + p.MsoNormal, li.MsoNormal, div.MsoNormal + {margin-top:0in; + margin-right:0in; + margin-bottom:10.0pt; + margin-left:0in; + line-height:115%; + font-size:11.0pt; + font-family:"Calibri","sans-serif";} +a:link, span.MsoHyperlink + {mso-style-priority:99; + color:blue; + text-decoration:underline;} +a:visited, span.MsoHyperlinkFollowed + {mso-style-priority:99; + color:purple; + text-decoration:underline;} +span.EmailStyle17 + {mso-style-type:personal-compose; + font-family:"Calibri","sans-serif"; + color:windowtext;} +.MsoChpDefault + {mso-style-type:export-only;} +@page Section1 + {size:8.5in 11.0in; + margin:1.0in 1.0in 1.0in 1.0in;} +div.Section1 + {page:Section1;} +--> +</style> +<!--[if gte mso 9]><xml> + <o:shapedefaults v:ext="edit" spidmax="1026" /> +</xml><![endif]--><!--[if gte mso 9]><xml> + <o:shapelayout v:ext="edit"> + <o:idmap v:ext="edit" data="1" /> + </o:shapelayout></xml><![endif]--> +</head> + +<body lang=EN-US link=blue vlink=purple> + +<div class=Section1> + +<p class=MsoNormal dir=RTL style='mso-margin-top-alt:auto;mso-margin-bottom-alt: +auto;text-align:right;line-height:normal;direction:rtl;unicode-bidi:embed'><span +dir=LTR style='font-size:12.0pt;color:red'><o:p> </o:p></span></p> + +<p class=MsoNormal dir=RTL style='mso-margin-top-alt:auto;mso-margin-bottom-alt: +auto;text-align:right;line-height:normal;direction:rtl;unicode-bidi:embed'><span +lang=AR-SA style='font-size:12.0pt;font-family:"Arabic Transparent"'>èÃÙÑÈ +ÇäÇÊÍÇÏ ÇäÌÒÇÆÑê èâÊçÇ Ùæ "ÃÓáç äçÐÇ Çäåèâá ÚêÑ ÇäåâÈèä èÇäáÑÏê èÚêÑ +ÇäÑêÇÖê"¬ èçÏÏ "ÈÇÊÎÇÐ ãä ÇäÅÌÑÇÁÇÊ äÏé ÇäåæØåÇÊ ÇäåÙæêÉ ääÏáÇÙ Ùæ +åÕÇäÍç èÇäÖÑÑ ÇäãÈêÑ ÇäæÇÌå Ùæ 
Ðäã".<o:p></o:p></span></p> + +<p class=MsoNormal><o:p> </o:p></p> + +</div> + +</body> + +</html> Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_2.html URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_2.html?rev=1514126&view=auto ============================================================================== --- tika/trunk/tika-parsers/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_2.html (added) +++ tika/trunk/tika-parsers/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_2.html Thu Aug 15 01:59:26 2013 @@ -0,0 +1,77 @@ +<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"> + +<head> +<meta http-equiv=Content-Type content="text/html; charset=iso-8859-6"> +<meta name=Generator content="Microsoft Word 12 (filtered medium)"> +<style> +<!-- + /* Font Definitions */ + @font-face + {font-family:"Cambria Math"; + panose-1:0 0 0 0 0 0 0 0 0 0;} +@font-face + {font-family:Calibri; + panose-1:2 15 5 2 2 2 4 3 2 4;} +@font-face + {font-family:"Arabic Transparent"; + panose-1:2 1 0 0 0 0 0 0 0 0;} + /* Style Definitions */ + p.MsoNormal, li.MsoNormal, div.MsoNormal + {margin-top:0in; + margin-right:0in; + margin-bottom:10.0pt; + margin-left:0in; + line-height:115%; + font-size:11.0pt; + font-family:"Calibri","sans-serif";} +a:link, span.MsoHyperlink + {mso-style-priority:99; + color:blue; + text-decoration:underline;} +a:visited, span.MsoHyperlinkFollowed + {mso-style-priority:99; + color:purple; + text-decoration:underline;} +span.EmailStyle17 + {mso-style-type:personal-compose; + font-family:"Calibri","sans-serif"; + color:windowtext;} +.MsoChpDefault + {mso-style-type:export-only;} +@page Section1 + {size:8.5in 11.0in; + margin:1.0in 1.0in 1.0in 1.0in;} 
+div.Section1 + {page:Section1;} +--> +</style> +<!--[if gte mso 9]><xml> + <o:shapedefaults v:ext="edit" spidmax="1026" /> +</xml><![endif]--><!--[if gte mso 9]><xml> + <o:shapelayout v:ext="edit"> + <o:idmap v:ext="edit" data="1" /> + </o:shapelayout></xml><![endif]--> +</head> + +<body lang=EN-US link=blue vlink=purple> + +<div class=Section1> + +<p class=MsoNormal dir=RTL style='mso-margin-top-alt:auto;mso-margin-bottom-alt: +auto;text-align:right;line-height:normal;direction:rtl;unicode-bidi:embed'><span +dir=LTR style='font-size:12.0pt;color:red'><o:p> </o:p></span></p> + +<p class=MsoNormal dir=RTL style='mso-margin-top-alt:auto;mso-margin-bottom-alt: +auto;text-align:right;line-height:normal;direction:rtl;unicode-bidi:embed'><span +lang=AR-SA style='font-size:12.0pt;font-family:"Arabic Transparent"'>èÃÙÑÈ +ÇäÇÊÍÇÏ ÇäÌÒÇÆÑê èâÊçÇ Ùæ "ÃÓáç äçÐÇ Çäåèâá ÚêÑ ÇäåâÈèä èÇäáÑÏê èÚêÑ +ÇäÑêÇÖê"¬ èçÏÏ "ÈÇÊÎÇÐ ãä ÇäÅÌÑÇÁÇÊ äÏé ÇäåæØåÇÊ ÇäåÙæêÉ ääÏáÇÙ Ùæ +åÕÇäÍç èÇäÖÑÑ ÇäãÈêÑ ÇäæÇÌå Ùæ Ðäã".<o:p></o:p></span></p> + +<p class=MsoNormal><o:p> </o:p></p> + +</div> + +</body> + +</html> Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_3.html URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_3.html?rev=1514126&view=auto ============================================================================== --- tika/trunk/tika-parsers/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_3.html (added) +++ tika/trunk/tika-parsers/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_3.html Thu Aug 15 01:59:26 2013 @@ -0,0 +1,77 @@ +<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"> + +<head> +<meta content=text/html; charset = iso-8859-6 http-equiv=Content-Type> +<meta 
name=Generator content="Microsoft Word 12 (filtered medium)"> +<style> +<!-- + /* Font Definitions */ + @font-face + {font-family:"Cambria Math"; + panose-1:0 0 0 0 0 0 0 0 0 0;} +@font-face + {font-family:Calibri; + panose-1:2 15 5 2 2 2 4 3 2 4;} +@font-face + {font-family:"Arabic Transparent"; + panose-1:2 1 0 0 0 0 0 0 0 0;} + /* Style Definitions */ + p.MsoNormal, li.MsoNormal, div.MsoNormal + {margin-top:0in; + margin-right:0in; + margin-bottom:10.0pt; + margin-left:0in; + line-height:115%; + font-size:11.0pt; + font-family:"Calibri","sans-serif";} +a:link, span.MsoHyperlink + {mso-style-priority:99; + color:blue; + text-decoration:underline;} +a:visited, span.MsoHyperlinkFollowed + {mso-style-priority:99; + color:purple; + text-decoration:underline;} +span.EmailStyle17 + {mso-style-type:personal-compose; + font-family:"Calibri","sans-serif"; + color:windowtext;} +.MsoChpDefault + {mso-style-type:export-only;} +@page Section1 + {size:8.5in 11.0in; + margin:1.0in 1.0in 1.0in 1.0in;} +div.Section1 + {page:Section1;} +--> +</style> +<!--[if gte mso 9]><xml> + <o:shapedefaults v:ext="edit" spidmax="1026" /> +</xml><![endif]--><!--[if gte mso 9]><xml> + <o:shapelayout v:ext="edit"> + <o:idmap v:ext="edit" data="1" /> + </o:shapelayout></xml><![endif]--> +</head> + +<body lang=EN-US link=blue vlink=purple> + +<div class=Section1> + +<p class=MsoNormal dir=RTL style='mso-margin-top-alt:auto;mso-margin-bottom-alt: +auto;text-align:right;line-height:normal;direction:rtl;unicode-bidi:embed'><span +dir=LTR style='font-size:12.0pt;color:red'><o:p> </o:p></span></p> + +<p class=MsoNormal dir=RTL style='mso-margin-top-alt:auto;mso-margin-bottom-alt: +auto;text-align:right;line-height:normal;direction:rtl;unicode-bidi:embed'><span +lang=AR-SA style='font-size:12.0pt;font-family:"Arabic Transparent"'>èÃÙÑÈ +ÇäÇÊÍÇÏ ÇäÌÒÇÆÑê èâÊçÇ Ùæ "ÃÓáç äçÐÇ Çäåèâá ÚêÑ ÇäåâÈèä èÇäáÑÏê èÚêÑ +ÇäÑêÇÖê"¬ èçÏÏ "ÈÇÊÎÇÐ ãä ÇäÅÌÑÇÁÇÊ äÏé ÇäåæØåÇÊ ÇäåÙæêÉ ääÏáÇÙ Ùæ +åÕÇäÍç èÇäÖÑÑ ÇäãÈêÑ ÇäæÇÌå Ùæ 
Ðäã".<o:p></o:p></span></p> + +<p class=MsoNormal><o:p> </o:p></p> + +</div> + +</body> + +</html> Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_4.html URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_4.html?rev=1514126&view=auto ============================================================================== --- tika/trunk/tika-parsers/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_4.html (added) +++ tika/trunk/tika-parsers/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_4.html Thu Aug 15 01:59:26 2013 @@ -0,0 +1,77 @@ +<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"> + +<head> +<meta content=text/html; charset = iso-8859-6 http-equiv=Content-Type> +<meta name=Generator content="Microsoft Word 12 (filtered medium)"> +<style> +<!-- + /* Font Definitions */ + @font-face + {font-family:"Cambria Math"; + panose-1:0 0 0 0 0 0 0 0 0 0;} +@font-face + {font-family:Calibri; + panose-1:2 15 5 2 2 2 4 3 2 4;} +@font-face + {font-family:"Arabic Transparent"; + panose-1:2 1 0 0 0 0 0 0 0 0;} + /* Style Definitions */ + p.MsoNormal, li.MsoNormal, div.MsoNormal + {margin-top:0in; + margin-right:0in; + margin-bottom:10.0pt; + margin-left:0in; + line-height:115%; + font-size:11.0pt; + font-family:"Calibri","sans-serif";} +a:link, span.MsoHyperlink + {mso-style-priority:99; + color:blue; + text-decoration:underline;} +a:visited, span.MsoHyperlinkFollowed + {mso-style-priority:99; + color:purple; + text-decoration:underline;} +span.EmailStyle17 + {mso-style-type:personal-compose; + font-family:"Calibri","sans-serif"; + color:windowtext;} +.MsoChpDefault + {mso-style-type:export-only;} +@page Section1 + {size:8.5in 11.0in; + margin:1.0in 1.0in 1.0in 1.0in;} 
+div.Section1 + {page:Section1;} +--> +</style> +<!--[if gte mso 9]><xml> + <o:shapedefaults v:ext="edit" spidmax="1026" /> +</xml><![endif]--><!--[if gte mso 9]><xml> + <o:shapelayout v:ext="edit"> + <o:idmap v:ext="edit" data="1" /> + </o:shapelayout></xml><![endif]--> +</head> + +<body lang=EN-US link=blue vlink=purple> + +<div class=Section1> + +<p class=MsoNormal dir=RTL style='mso-margin-top-alt:auto;mso-margin-bottom-alt: +auto;text-align:right;line-height:normal;direction:rtl;unicode-bidi:embed'><span +dir=LTR style='font-size:12.0pt;color:red'><o:p> </o:p></span></p> + +<p class=MsoNormal dir=RTL style='mso-margin-top-alt:auto;mso-margin-bottom-alt: +auto;text-align:right;line-height:normal;direction:rtl;unicode-bidi:embed'><span +lang=AR-SA style='font-size:12.0pt;font-family:"Arabic Transparent"'>èÃÙÑÈ +ÇäÇÊÍÇÏ ÇäÌÒÇÆÑê èâÊçÇ Ùæ "ÃÓáç äçÐÇ Çäåèâá ÚêÑ ÇäåâÈèä èÇäáÑÏê èÚêÑ +ÇäÑêÇÖê"¬ èçÏÏ "ÈÇÊÎÇÐ ãä ÇäÅÌÑÇÁÇÊ äÏé ÇäåæØåÇÊ ÇäåÙæêÉ ääÏáÇÙ Ùæ +åÕÇäÍç èÇäÖÑÑ ÇäãÈêÑ ÇäæÇÌå Ùæ Ðäã".<o:p></o:p></span></p> + +<p class=MsoNormal><o:p> </o:p></p> + +</div> + +</body> + +</html>