Author: jukka
Date: Tue Apr  8 08:59:54 2008
New Revision: 645982

URL: http://svn.apache.org/viewvc?rev=645982&view=rev
Log:
TIKA-138: Ignore HTML style and script content
    - Added a set of elements to discard, currently style and script

Modified:
    incubator/tika/trunk/CHANGES.txt
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java

Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=645982&r1=645981&r2=645982&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Tue Apr  8 08:59:54 2008
@@ -41,6 +41,9 @@
 17. TIKA-134 - mvn package does not produce packages for bin/src
                (Karl Heinz Marbaise)
 
+18. TIKA-138 - Ignore HTML style and script content (Jukka Zitting)
+
+
 Release 0.1-incubating - 12/27/2007
 
 1. TIKA-5 - Port Metadata Framework from Nutch (mattmann)

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=645982&r1=645981&r2=645982&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/html/HtmlParser.java Tue Apr  8 08:59:54 2008
@@ -20,7 +20,9 @@
 import java.io.InputStream;
 import java.io.StringWriter;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Map;
+import java.util.Set;
 
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.tika.exception.TikaException;
@@ -54,6 +56,11 @@
     private static final Map<String, String> SAFE_ELEMENTS =
         new HashMap<String, String>();
 
+    /**
+     * Set of HTML elements whose content will be discarded.
+     */
+    private static final Set<String> DISCARD_ELEMENTS = new HashSet<String>();
+
     static {
         // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
         SAFE_ELEMENTS.put("P", "p");
@@ -72,6 +79,9 @@
         SAFE_ELEMENTS.put("PRE", "pre");
         SAFE_ELEMENTS.put("BLOCKQUOTE", "blockquote");
         SAFE_ELEMENTS.put("TABLE", "p"); // TODO colspan/rowspan issues
+
+        DISCARD_ELEMENTS.add("STYLE");
+        DISCARD_ELEMENTS.add("SCRIPT");
     }
 
     public void parse(
@@ -110,13 +120,19 @@
 
     private ContentHandler getBodyHandler(final XHTMLContentHandler xhtml) {
         return new TextContentHandler(xhtml) {
+
+            private int discardLevel = 0;
+
             @Override
             public void startElement(
                     String uri, String local, String name, Attributes atts)
                     throws SAXException {
-                String safe = SAFE_ELEMENTS.get(name);
-                if (safe != null) {
-                    xhtml.startElement(safe);
+                if (discardLevel != 0) {
+                    discardLevel++;
+                } else if (DISCARD_ELEMENTS.contains(name)) {
+                    discardLevel = 1;
+                } else if (SAFE_ELEMENTS.containsKey(name)) {
+                    xhtml.startElement(SAFE_ELEMENTS.get(name));
                 } else if ("A".equals(name)) {
                     String href = atts.getValue("href");
                     if (href == null) {
@@ -129,13 +145,31 @@
             @Override
             public void endElement(
                     String uri, String local, String name) throws SAXException {
-                String safe = SAFE_ELEMENTS.get(name);
-                if (safe != null) {
-                    xhtml.endElement(safe);
+                if (discardLevel != 0) {
+                    discardLevel--;
+                } else if (SAFE_ELEMENTS.containsKey(name)) {
+                    xhtml.endElement(SAFE_ELEMENTS.get(name));
                 } else if ("A".equals(name)) {
                     xhtml.endElement("a");
                 }
             }
+
+            @Override
+            public void characters(char[] ch, int start, int length)
+                    throws SAXException {
+                if (discardLevel == 0) {
+                    super.characters(ch, start, length);
+                }
+            }
+
+            @Override
+            public void ignorableWhitespace(char[] ch, int start, int length)
+                    throws SAXException {
+                if (discardLevel == 0) {
+                    super.ignorableWhitespace(ch, start, length);
+                }
+            }
+
         };
     }
 


Reply via email to