Repository: tika Updated Branches: refs/heads/master da5bbbea7 -> 46d5775cd
fix for TIKA-1938 contributed by naegelejd add HtmlParser support for <script> tags within <head> Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/b6d23c18 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/b6d23c18 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/b6d23c18 Branch: refs/heads/master Commit: b6d23c189e852fa2e41b441c18bfe3e66e3f67c4 Parents: 19ed261 Author: Joseph Naegele <[email protected]> Authored: Wed Apr 27 18:35:11 2016 +0000 Committer: Joseph Naegele <[email protected]> Committed: Wed Apr 27 19:07:35 2016 +0000 ---------------------------------------------------------------------- .../apache/tika/parser/html/HtmlHandler.java | 3 ++ .../apache/tika/parser/html/HtmlParserTest.java | 38 ++++++++++++++++++++ 2 files changed, 41 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/b6d23c18/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java index d5dfaa6..980ed7e 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java @@ -121,6 +121,9 @@ class HtmlHandler extends TextContentHandler { } else if ("LINK".equals(name)) { startElementWithSafeAttributes("link", atts); xhtml.endElement("link"); + } else if ("SCRIPT".equals(name) && atts.getValue("src") != null) { + startElementWithSafeAttributes("script", atts); + xhtml.endElement("script"); } } http://git-wip-us.apache.org/repos/asf/tika/blob/b6d23c18/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java index d54d3fa..654eb8a 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java @@ -1111,4 +1111,42 @@ public class HtmlParserTest { assertEquals("application/xhtml+xml; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE)); } + + @Test + public void testScriptSrc() throws Exception { + String url = "http://domain.com/logic.js"; + String scriptInBody = + "<html><body><script src=\"" + url + "\"></script></body></html>"; + String scriptInHead = + "<html><head><script src=\"" + url + "\"></script></head></html>"; + + assertScriptLink(scriptInBody, url); + assertScriptLink(scriptInHead, url); + } + + private void assertScriptLink(String html, String url) throws Exception { + // IdentityHtmlMapper is needed to extract <script> tags + ParseContext context = new ParseContext(); + context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE); + Metadata metadata = new Metadata(); + metadata.set(Metadata.CONTENT_TYPE, "text/html"); + + final List<String> links = new ArrayList<String>(); + new HtmlParser().parse( + new ByteArrayInputStream(html.getBytes(UTF_8)), + new DefaultHandler() { + @Override + public void startElement( + String u, String l, String name, Attributes atts) { + if (name.equals("script") && atts.getValue("", "src") != null) { + links.add(atts.getValue("", "src")); + } + } + }, + metadata, + context); + + assertEquals(1, links.size()); + assertEquals(url, links.get(0)); + } }
