Author: jukka Date: Thu Sep 4 09:40:24 2008 New Revision: 692148 URL: http://svn.apache.org/viewvc?rev=692148&view=rev Log: TIKA-149: Parser for zip files
Applied a patch by Dave Meikle (added the Apache header to ZipParser.java) Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java (with props) Modified: incubator/tika/trunk/src/main/resources/tika-config.xml incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java?rev=692148&view=auto ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java (added) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java Thu Sep 4 09:40:24 2008 @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.zip; + +import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import java.io.IOException; +import java.io.InputStream; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; + +/** + * Zip File Parser. + */ +public class ZipParser extends AbstractParser { + + private Parser parser; + + public void parse(InputStream stream, ContentHandler handler, Metadata metadata) + throws IOException, TikaException, SAXException { + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + + ZipInputStream zis = new ZipInputStream(stream); + ZipEntry ze; + while ((ze = zis.getNextEntry()) != null) { + xhtml.startElement("div", "class", "file"); + xhtml.element("h1", ze.getName()); + + ContentHandler content = new BodyContentHandler(); + getParser().parse(new CloseShieldInputStream(zis), content, new Metadata()); + + xhtml.element("content", content.toString()); + xhtml.endElement("div"); + + zis.closeEntry(); + } + zis.close(); + xhtml.endDocument(); + } + + public Parser getParser() { + if (parser == null) + { + return new AutoDetectParser(); + } + return parser; + } + + public void setParser(Parser parser) { + this.parser = parser; + } +} Propchange: incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java ------------------------------------------------------------------------------ svn:eol-style = native Modified: incubator/tika/trunk/src/main/resources/tika-config.xml URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=692148&r1=692147&r2=692148&view=diff ============================================================================== --- incubator/tika/trunk/src/main/resources/tika-config.xml (original) +++ incubator/tika/trunk/src/main/resources/tika-config.xml Thu Sep 4 09:40:24 2008 @@ -57,7 +57,7 @@ <mime>text/plain</mime> </parser> - <parser name="parse-openoffice" class="org.apache.tika.parser.opendocument.OpenOfficeParser"> + <parser name="parse-openoffice" class="org.apache.tika.parser.opendocument.OpenOfficeParser"> <mime>application/vnd.sun.xml.writer</mime> <mime>application/vnd.oasis.opendocument.text</mime> <mime>application/vnd.oasis.opendocument.graphics</mime> @@ -105,6 +105,10 @@ <mime>image/x-xcf</mime> </parser> + <parser name="parse-zip" class="org.apache.tika.parser.zip.ZipParser"> + <mime>application/zip</mime> + </parser> + </parsers> </properties> \ No newline at end of file Modified: incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java?rev=692148&r1=692147&r2=692148&view=diff ============================================================================== --- incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java (original) +++ incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java Thu Sep 4 09:40:24 2008 @@ -44,15 +44,15 @@ * FIXME the old mechanism does not work anymore when running the tests * with Maven - need a resource-based one, but this means more changes * to classes which rely on filenames. - * + * * String sep = File.separator; StringTokenizer st = new * StringTokenizer(System.getProperty( "java.class.path"), * File.pathSeparator); - * + * * classDir = new File(st.nextToken()); - * + * * config = classDir.getParent() + sep + "config" + sep + "config.xml"; - * + * * String log4j = classDir.getParent() + sep + "Config" + sep + "log4j" + * sep + "log4j.properties"; */ @@ -171,6 +171,16 @@ assertNotNull(parser); } + public void testZipFileExtraction() throws Exception { + File file = getTestFile("test-documents.zip"); + String s1 = ParseUtils.getStringContent(file, tc); + String s2 = ParseUtils.getStringContent(file, tc, "application/zip"); + assertEquals(s1, s2); + + Parser parser = tc.getParser("application/zip"); + assertNotNull(parser); + } + public void testZipExtraction() throws Exception { File zip = getTestFile("test-documents.zip"); List<Parser> parsers = ParseUtils.getParsersFromZip(zip, tc);