Author: jukka
Date: Tue Apr 8 08:59:54 2008
New Revision: 645982

URL: http://svn.apache.org/viewvc?rev=645982&view=rev
Log:
TIKA-138: Ignore HTML style and script content
- Added a set of elements to discard, currently style and script
Modified: incubator/tika/trunk/CHANGES.txt incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java Modified: incubator/tika/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=645982&r1=645981&r2=645982&view=diff ============================================================================== --- incubator/tika/trunk/CHANGES.txt (original) +++ incubator/tika/trunk/CHANGES.txt Tue Apr 8 08:59:54 2008 @@ -41,6 +41,9 @@ 17. TIKA-134 - mvn package does not produce packages for bin/src (Karl Heinz Marbaise) +18. TIKA-138 - Ignore HTML style and script content (Jukka Zitting) + + Release 0.1-incubating - 12/27/2007 1. TIKA-5 - Port Metadata Framework from Nutch (mattmann) Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=645982&r1=645981&r2=645982&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java Tue Apr 8 08:59:54 2008 @@ -20,7 +20,9 @@ import java.io.InputStream; import java.io.StringWriter; import java.util.HashMap; +import java.util.HashSet; import java.util.Map; +import java.util.Set; import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.tika.exception.TikaException; @@ -54,6 +56,11 @@ private static final Map<String, String> SAFE_ELEMENTS = new HashMap<String, String>(); + /** + * Set of HTML elements whose content will be discarded. 
+ */ + private static final Set<String> DISCARD_ELEMENTS = new HashSet<String>(); + static { // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd SAFE_ELEMENTS.put("P", "p"); @@ -72,6 +79,9 @@ SAFE_ELEMENTS.put("PRE", "pre"); SAFE_ELEMENTS.put("BLOCKQUOTE", "blockquote"); SAFE_ELEMENTS.put("TABLE", "p"); // TODO colspan/rowspan issues + + DISCARD_ELEMENTS.add("STYLE"); + DISCARD_ELEMENTS.add("SCRIPT"); } public void parse( @@ -110,13 +120,19 @@ private ContentHandler getBodyHandler(final XHTMLContentHandler xhtml) { return new TextContentHandler(xhtml) { + + private int discardLevel = 0; + @Override public void startElement( String uri, String local, String name, Attributes atts) throws SAXException { - String safe = SAFE_ELEMENTS.get(name); - if (safe != null) { - xhtml.startElement(safe); + if (discardLevel != 0) { + discardLevel++; + } else if (DISCARD_ELEMENTS.contains(name)) { + discardLevel = 1; + } else if (SAFE_ELEMENTS.containsKey(name)) { + xhtml.startElement(SAFE_ELEMENTS.get(name)); } else if ("A".equals(name)) { String href = atts.getValue("href"); if (href == null) { @@ -129,13 +145,31 @@ @Override public void endElement( String uri, String local, String name) throws SAXException { - String safe = SAFE_ELEMENTS.get(name); - if (safe != null) { - xhtml.endElement(safe); + if (discardLevel != 0) { + discardLevel--; + } else if (SAFE_ELEMENTS.containsKey(name)) { + xhtml.endElement(SAFE_ELEMENTS.get(name)); } else if ("A".equals(name)) { xhtml.endElement("a"); } } + + @Override + public void characters(char[] ch, int start, int length) + throws SAXException { + if (discardLevel == 0) { + super.characters(ch, start, length); + } + } + + @Override + public void ignorableWhitespace(char[] ch, int start, int length) + throws SAXException { + if (discardLevel == 0) { + super.ignorableWhitespace(ch, start, length); + } + } + }; }