Author: jukka
Date: Thu Sep  4 09:40:24 2008
New Revision: 692148

URL: http://svn.apache.org/viewvc?rev=692148&view=rev
Log:
TIKA-149: Parser for zip files 

Applied a patch by Dave Meikle (added the Apache header to ZipParser.java)

Added:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java   (with props)
Modified:
    incubator/tika/trunk/src/main/resources/tika-config.xml
    incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java

Added: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java?rev=692148&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java Thu Sep  4 09:40:24 2008
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.zip;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+
+/**
+ * Parser for zip archives.
+ *
+ * Reads the entries of a zip stream in order. Each entry is reported as a
+ * div element of class "file" that holds the entry name in an h1 element
+ * and the text extracted by the delegate parser in a content element.
+ */
+public class ZipParser extends AbstractParser {
+
+    /** Delegate for the entries; {@code null} selects the default. */
+    private Parser parser;
+
+    /**
+     * Parses a zip archive and reports every entry to the given handler.
+     *
+     * @param stream zip archive to read; it is not closed by this method
+     * @param handler receives the XHTML SAX events describing the archive
+     * @param metadata metadata passed on to the XHTML content handler
+     * @throws IOException if reading the archive fails
+     * @throws TikaException if the delegate parser fails on an entry
+     * @throws SAXException if emitting a SAX event fails
+     */
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, TikaException, SAXException {
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        // Resolve the delegate once; the default is a new AutoDetectParser
+        // that should not be rebuilt for every entry of the archive.
+        Parser entryParser = getParser();
+
+        // The zip stream is closed on every path to release its resources,
+        // while the shield keeps that close away from the caller's stream.
+        ZipInputStream zis = new ZipInputStream(new CloseShieldInputStream(stream));
+        try {
+            ZipEntry ze;
+            while ((ze = zis.getNextEntry()) != null) {
+                xhtml.startElement("div", "class", "file");
+                xhtml.element("h1", ze.getName());
+
+                // Shield the zip stream so that a delegate closing its input
+                // cannot end the iteration over the remaining entries.
+                ContentHandler content = new BodyContentHandler();
+                entryParser.parse(new CloseShieldInputStream(zis), content, new Metadata());
+
+                // NOTE(review): "content" is not an XHTML element; kept for compatibility.
+                xhtml.element("content", content.toString());
+                xhtml.endElement("div");
+
+                zis.closeEntry();
+            }
+        } finally {
+            zis.close();
+        }
+        xhtml.endDocument();
+    }
+
+    /**
+     * Returns the parser used for the archive entries.
+     *
+     * @return the parser given to {@link #setParser(Parser)}, or a new
+     *         {@link AutoDetectParser} when none has been set
+     */
+    public Parser getParser() {
+        if (parser == null) {
+            return new AutoDetectParser();
+        }
+        return parser;
+    }
+
+    /**
+     * Sets the parser used for the archive entries.
+     *
+     * @param parser delegate parser, or {@code null} for the default
+     */
+    public void setParser(Parser parser) {
+        this.parser = parser;
+    }
+}

Propchange: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=692148&r1=692147&r2=692148&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Thu Sep  4 09:40:24 2008
@@ -57,7 +57,7 @@
                 <mime>text/plain</mime>
         </parser>
 
-        <parser name="parse-openoffice" class="org.apache.tika.parser.opendocument.OpenOfficeParser">            
+        <parser name="parse-openoffice" class="org.apache.tika.parser.opendocument.OpenOfficeParser">
                 <mime>application/vnd.sun.xml.writer</mime>
                 <mime>application/vnd.oasis.opendocument.text</mime>
                 <mime>application/vnd.oasis.opendocument.graphics</mime>
@@ -105,6 +105,10 @@
                 <mime>image/x-xcf</mime>
         </parser>
 
+        <parser name="parse-zip" class="org.apache.tika.parser.zip.ZipParser">
+                <mime>application/zip</mime>
+        </parser>
+
     </parsers>
 
 </properties>
\ No newline at end of file

Modified: incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java?rev=692148&r1=692147&r2=692148&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java Thu Sep  4 09:40:24 2008
@@ -44,15 +44,15 @@
          * FIXME the old mechanism does not work anymore when running the tests
          * with Maven - need a resource-based one, but this means more changes
          * to classes which rely on filenames.
-         * 
+         *
          * String sep = File.separator; StringTokenizer st = new
          * StringTokenizer(System.getProperty( "java.class.path"),
          * File.pathSeparator);
-         * 
+         *
          * classDir = new File(st.nextToken());
-         * 
+         *
          * config = classDir.getParent() + sep + "config" + sep + "config.xml";
-         * 
+         *
          * String log4j = classDir.getParent() + sep + "Config" + sep + "log4j" +
          * sep + "log4j.properties";
          */
@@ -171,6 +171,16 @@
         assertNotNull(parser);
     }
 
+    public void testZipFileExtraction() throws Exception {
+        File file = getTestFile("test-documents.zip");
+        String s1 = ParseUtils.getStringContent(file, tc);
+        String s2 = ParseUtils.getStringContent(file, tc, "application/zip");
+        assertEquals(s1, s2);
+
+        Parser parser = tc.getParser("application/zip");
+        assertNotNull(parser);
+    }
+
     public void testZipExtraction() throws Exception {
         File zip = getTestFile("test-documents.zip");
         List<Parser> parsers = ParseUtils.getParsersFromZip(zip, tc);


Reply via email to