Repository: tika
Updated Branches:
  refs/heads/master da5bbbea7 -> 46d5775cd


fix for TIKA-1938 contributed by naegelejd

add HtmlParser support for <script> tags within <head>


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/b6d23c18
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/b6d23c18
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/b6d23c18

Branch: refs/heads/master
Commit: b6d23c189e852fa2e41b441c18bfe3e66e3f67c4
Parents: 19ed261
Author: Joseph Naegele <[email protected]>
Authored: Wed Apr 27 18:35:11 2016 +0000
Committer: Joseph Naegele <[email protected]>
Committed: Wed Apr 27 19:07:35 2016 +0000

----------------------------------------------------------------------
 .../apache/tika/parser/html/HtmlHandler.java    |  3 ++
 .../apache/tika/parser/html/HtmlParserTest.java | 38 ++++++++++++++++++++
 2 files changed, 41 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/b6d23c18/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
index d5dfaa6..980ed7e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
@@ -121,6 +121,9 @@ class HtmlHandler extends TextContentHandler {
             } else if ("LINK".equals(name)) {
                 startElementWithSafeAttributes("link", atts);
                 xhtml.endElement("link");
+            } else if ("SCRIPT".equals(name) && atts.getValue("src") != null) {
+                startElementWithSafeAttributes("script", atts);
+                xhtml.endElement("script");
             }
         }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/b6d23c18/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index d54d3fa..654eb8a 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -1111,4 +1111,42 @@ public class HtmlParserTest {
         assertEquals("application/xhtml+xml; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
 
     }
+
+    @Test
+    public void testScriptSrc() throws Exception {
+        String url = "http://domain.com/logic.js";
+        String scriptInBody =
+                "<html><body><script src=\"" + url + "\"></script></body></html>";
+        String scriptInHead =
+                "<html><head><script src=\"" + url + "\"></script></head></html>";
+
+        assertScriptLink(scriptInBody, url);
+        assertScriptLink(scriptInHead, url);
+    }
+
+    private void assertScriptLink(String html, String url) throws Exception {
+        // IdentityHtmlMapper is needed to extract <script> tags
+        ParseContext context = new ParseContext();
+        context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "text/html");
+
+        final List<String> links = new ArrayList<String>();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(html.getBytes(UTF_8)),
+                new DefaultHandler() {
+                    @Override
+                    public void startElement(
+                            String u, String l, String name, Attributes atts) {
+                        if (name.equals("script") && atts.getValue("", "src") != null) {
+                            links.add(atts.getValue("", "src"));
+                        }
+                    }
+                },
+                metadata,
+                context);
+
+        assertEquals(1, links.size());
+        assertEquals(url, links.get(0));
+    }
 }

Reply via email to