Repository: tika Updated Branches: refs/heads/2.x e855648af -> 43e30006d
TIKA-1837 -- strip comments before trying to find encoding in HTMLEncodingDetector Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/43e30006 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/43e30006 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/43e30006 Branch: refs/heads/2.x Commit: 43e30006d16ef4957c7aff347c9b76f3e1e163c5 Parents: e855648 Author: tballison <[email protected]> Authored: Thu May 26 10:50:26 2016 -0400 Committer: tballison <[email protected]> Committed: Thu May 26 10:50:26 2016 -0400 ---------------------------------------------------------------------- .../tika/parser/html/HtmlEncodingDetector.java | 18 +++++++++++++++-- .../apache/tika/parser/html/HtmlParserTest.java | 21 ++++++++++++++++++-- 2 files changed, 35 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/43e30006/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java index edb014b..87632eb 100644 --- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java +++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java @@ -84,7 +84,22 @@ public class HtmlEncodingDetector implements EncodingDetector { String head = ASCII.decode(ByteBuffer.wrap(buffer, 0, n)).toString(); - Matcher equiv = HTTP_META_PATTERN.matcher(head); + //strip out comments + String headNoComments = head.replaceAll("<!--.*?(-->|$)", " "); + //try to find the encoding in head 
without comments + Charset charset = findCharset(headNoComments); + //if nothing is found, back off to find any encoding + if (charset == null) { + return findCharset(head); + } + return charset; + + } + + //returns null if no charset was found + private Charset findCharset(String s) { + + Matcher equiv = HTTP_META_PATTERN.matcher(s); Matcher charsetMatcher = FLEXIBLE_CHARSET_ATTR_PATTERN.matcher(""); //iterate through meta tags while (equiv.find()) { @@ -105,5 +120,4 @@ public class HtmlEncodingDetector implements EncodingDetector { } return null; } - } http://git-wip-us.apache.org/repos/asf/tika/blob/43e30006/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java index 8567cb9..fadb6e9 100644 --- a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java +++ b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java @@ -19,7 +19,6 @@ package org.apache.tika.parser.html; import static java.nio.charset.StandardCharsets.ISO_8859_1; import static java.nio.charset.StandardCharsets.US_ASCII; import static java.nio.charset.StandardCharsets.UTF_8; -import static org.apache.tika.TikaTest.assertContains; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; @@ -34,11 +33,14 @@ import java.io.IOException; import java.io.InputStream; import java.io.StringWriter; import java.io.Writer; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; import java.util.regex.Pattern; import org.apache.tika.Tika; +import 
org.apache.tika.TikaTest; +import org.apache.tika.detect.EncodingDetector; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Geographic; import org.apache.tika.metadata.Metadata; @@ -58,7 +60,7 @@ import org.xml.sax.Locator; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; -public class HtmlParserTest { +public class HtmlParserTest extends TikaTest { @Test public void testParseAscii() throws Exception { @@ -1111,4 +1113,19 @@ public class HtmlParserTest { assertEquals("application/xhtml+xml; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE)); } + + @Test + public void testSkippingCommentsInEncodingDetection() throws Exception { + + byte[] bytes = new String("<html><head>" + + "<!--<meta http-equiv=\"Content-Type\" content=\"text/html; charset=ISO-8859-1\"> -->\n" + + " <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"+ + "</head>"+ + "<body>"+ + "有什么需要我帮你的" + + "</body></html>").getBytes(StandardCharsets.UTF_8); + EncodingDetector htmlEncodingDetector = new HtmlEncodingDetector(); + XMLResult r = getXML(new ByteArrayInputStream(bytes), new AutoDetectParser(), new Metadata()); + assertContains("有什么需要我帮你的", r.xml); + } }
