Author: dmeikle
Date: Thu Oct 16 15:48:10 2008
New Revision: 705388

URL: http://svn.apache.org/viewvc?rev=705388&view=rev
Log:
TIKA-166: Updated HTMLParser to parse HTML meta tags into Metadata

Modified:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
    incubator/tika/trunk/src/test/resources/test-documents/testHTML.html
    incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=705388&r1=705387&r2=705388&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java Thu Oct 16 15:48:10 2008
@@ -81,6 +81,7 @@
 
         DISCARD_ELEMENTS.add("STYLE");
         DISCARD_ELEMENTS.add("SCRIPT");
+
     }
 
     public void parse(
@@ -95,9 +96,11 @@
         XPathParser xpath = new XPathParser(null, "");
         Matcher body = xpath.parse("/HTML/BODY//node()");
         Matcher title = xpath.parse("/HTML/HEAD/TITLE//node()");
+        Matcher meta = xpath.parse("/HTML/HEAD/META//node()");
         handler = new TeeContentHandler(
                 new MatchingContentHandler(getBodyHandler(xhtml), body),
-                new MatchingContentHandler(getTitleHandler(metadata), title));
+                new MatchingContentHandler(getTitleHandler(metadata), title),
+                new MatchingContentHandler(getMetaHandler(metadata), meta));
 
         // Parse the HTML document
         xhtml.startDocument();
@@ -116,6 +119,22 @@
         };
     }
 
+    private ContentHandler getMetaHandler(final Metadata metadata) {
+        return new WriteOutContentHandler() {
+            @Override
+            public void startElement(
+                    String uri, String local, String name, Attributes atts)
+                    throws SAXException {
+                    if (atts.getValue("http-equiv") != null) {
+                        metadata.set(atts.getValue("http-equiv"), atts.getValue("content"));
+                    }
+                    if (atts.getValue("name") != null) {
+                        metadata.set(atts.getValue("name"), atts.getValue("content"));
+                    }
+            }
+        };
+    }
+
     private ContentHandler getBodyHandler(final XHTMLContentHandler xhtml) {
         return new TextContentHandler(xhtml) {
 

Modified: 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=705388&r1=705387&r2=705388&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Thu Oct 16 15:48:10 2008
@@ -67,6 +67,8 @@
 
         assertEquals(
                 "Title : Test Indexation Html", metadata.get(Metadata.TITLE));
+        assertEquals("Tika Developers", metadata.get("Author"));
+        assertEquals("5", metadata.get("refresh"));
         assertEquals("http://www.apache.org/", href.toString());
 
         String content = body.toString();
@@ -115,6 +117,8 @@
         assertEquals("application/xhtml+xml", metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("XHTML test document", metadata.get(Metadata.TITLE));
         String content = handler.toString();
+        assertEquals("Tika Developers", metadata.get("Author"));
+        assertEquals("5", metadata.get("refresh"));
         assertTrue(content.contains("ability of Apache Tika"));
         assertTrue(content.contains("extract content"));
         assertTrue(content.contains("an XHTML document"));

Modified: incubator/tika/trunk/src/test/resources/test-documents/testHTML.html
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testHTML.html?rev=705388&r1=705387&r2=705388&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/resources/test-documents/testHTML.html (original)
+++ incubator/tika/trunk/src/test/resources/test-documents/testHTML.html Thu Oct 16 15:48:10 2008
@@ -1,9 +1,11 @@
 <html>
        <head>
-               <title>Title : Test Indexation Html</title>     
-       </head>
+        <title>Title : Test Indexation Html</title>
+        <meta name="Author" content="Tika Developers">
+        <meta http-equiv="refresh" content="5">
+    </head>
        <body>
                <h1>Test Indexation Html</h1>
                <p><a href="http://www.apache.org/">Indexation</a> du fichier</p>
-       </body> 
+       </body>
 </html>
\ No newline at end of file

Modified: incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html?rev=705388&r1=705387&r2=705388&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html (original)
+++ incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html Thu Oct 16 15:48:10 2008
@@ -1,11 +1,13 @@
 <html xmlns="http://www.w3.org/1999/xhtml">
   <head>
     <title>XHTML test document</title>
+    <meta name="Author" content="Tika Developers"/>
+    <meta http-equiv="refresh" content="5"/>
   </head>
   <body>
     <p>
       This document tests the ability of Apache Tika to extract content
       from an <a href="http://www.w3.org/TR/xhtml1/">XHTML document</a>.
     </p>
-  </body> 
+  </body>
 </html>
\ No newline at end of file


Reply via email to