Author: dmeikle Date: Thu Oct 16 15:48:10 2008 New Revision: 705388 URL: http://svn.apache.org/viewvc?rev=705388&view=rev Log: TIKA-166: Updated HtmlParser to parse HTML meta tags into Metadata
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java incubator/tika/trunk/src/test/resources/test-documents/testHTML.html incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=705388&r1=705387&r2=705388&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java Thu Oct 16 15:48:10 2008 @@ -81,6 +81,7 @@ DISCARD_ELEMENTS.add("STYLE"); DISCARD_ELEMENTS.add("SCRIPT"); + } public void parse( @@ -95,9 +96,11 @@ XPathParser xpath = new XPathParser(null, ""); Matcher body = xpath.parse("/HTML/BODY//node()"); Matcher title = xpath.parse("/HTML/HEAD/TITLE//node()"); + Matcher meta = xpath.parse("/HTML/HEAD/META//node()"); handler = new TeeContentHandler( new MatchingContentHandler(getBodyHandler(xhtml), body), - new MatchingContentHandler(getTitleHandler(metadata), title)); + new MatchingContentHandler(getTitleHandler(metadata), title), + new MatchingContentHandler(getMetaHandler(metadata), meta)); // Parse the HTML document xhtml.startDocument(); @@ -116,6 +119,22 @@ }; } + private ContentHandler getMetaHandler(final Metadata metadata) { + return new WriteOutContentHandler() { + @Override + public void startElement( + String uri, String local, String name, Attributes atts) + throws SAXException { + if (atts.getValue("http-equiv") != null) { + metadata.set(atts.getValue("http-equiv"), atts.getValue("content")); + } + if (atts.getValue("name") != null) { + metadata.set(atts.getValue("name"), atts.getValue("content")); + } + } + }; + } + 
private ContentHandler getBodyHandler(final XHTMLContentHandler xhtml) { return new TextContentHandler(xhtml) { Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=705388&r1=705387&r2=705388&view=diff ============================================================================== --- incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original) +++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Thu Oct 16 15:48:10 2008 @@ -67,6 +67,8 @@ assertEquals( "Title : Test Indexation Html", metadata.get(Metadata.TITLE)); + assertEquals("Tika Developers", metadata.get("Author")); + assertEquals("5", metadata.get("refresh")); assertEquals("http://www.apache.org/", href.toString()); String content = body.toString(); @@ -115,6 +117,8 @@ assertEquals("application/xhtml+xml", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("XHTML test document", metadata.get(Metadata.TITLE)); String content = handler.toString(); + assertEquals("Tika Developers", metadata.get("Author")); + assertEquals("5", metadata.get("refresh")); assertTrue(content.contains("ability of Apache Tika")); assertTrue(content.contains("extract content")); assertTrue(content.contains("an XHTML document")); Modified: incubator/tika/trunk/src/test/resources/test-documents/testHTML.html URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testHTML.html?rev=705388&r1=705387&r2=705388&view=diff ============================================================================== --- incubator/tika/trunk/src/test/resources/test-documents/testHTML.html (original) +++ incubator/tika/trunk/src/test/resources/test-documents/testHTML.html Thu Oct 16 15:48:10 2008 @@ -1,9 +1,11 @@ <html> <head> - <title>Title : Test Indexation Html</title> - </head> + <title>Title : Test 
Indexation Html</title> + <meta name="Author" content="Tika Developers"> + <meta http-equiv="refresh" content="5"> + </head> <body> <h1>Test Indexation Html</h1> <p><a href="http://www.apache.org/">Indexation</a> du fichier</p> - </body> + </body> </html> \ No newline at end of file Modified: incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html?rev=705388&r1=705387&r2=705388&view=diff ============================================================================== --- incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html (original) +++ incubator/tika/trunk/src/test/resources/test-documents/testXHTML.html Thu Oct 16 15:48:10 2008 @@ -1,11 +1,13 @@ <html xmlns="http://www.w3.org/1999/xhtml"> <head> <title>XHTML test document</title> + <meta name="Author" content="Tika Developers"/> + <meta http-equiv="refresh" content="5"/> </head> <body> <p> This document tests the ability of Apache Tika to extract content from an <a href="http://www.w3.org/TR/xhtml1/">XHTML document</a>. </p> - </body> + </body> </html> \ No newline at end of file