Author: jukka Date: Thu Sep 4 13:33:07 2008 New Revision: 692227 URL: http://svn.apache.org/viewvc?rev=692227&view=rev Log: TIKA-150: Parser for tar files
Added a tar parser implementation based on tar parsing code from Apache Ant. I preferred to copy the classes over to Tika instead of adding a dependency on Ant. Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/TarParser.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/ - copied from r692208, ant/core/trunk/src/main/org/apache/tools/tar/ incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/ (props changed) - copied from r692181, incubator/tika/trunk/src/test/java/org/apache/tika/parser/zip/ incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java incubator/tika/trunk/src/test/resources/test-documents/test-documents.tar (with props) Removed: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarOutputStream.java incubator/tika/trunk/src/test/java/org/apache/tika/parser/zip/ Modified: incubator/tika/trunk/CHANGES.txt incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarBuffer.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarConstants.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarEntry.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarInputStream.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarUtils.java incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml incubator/tika/trunk/src/main/resources/tika-config.xml incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java Modified: incubator/tika/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=692227&r1=692226&r2=692227&view=diff ============================================================================== --- incubator/tika/trunk/CHANGES.txt (original) +++ incubator/tika/trunk/CHANGES.txt Thu Sep 4 13:33:07 2008 @@ -64,6 +64,8 @@ 27. TIKA-149 - Parser for Zip files (Dave Meikle & Jukka Zitting) +28. 
TIKA-150 - Parser for tar files (Jukka Zitting) + Release 0.1-incubating - 12/27/2007 1. TIKA-5 - Port Metadata Framework from Nutch (mattmann) Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/TarParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/TarParser.java?rev=692227&view=auto ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/TarParser.java (added) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/TarParser.java Thu Sep 4 13:33:07 2008 @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pkg; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.pkg.tar.TarEntry; +import org.apache.tika.parser.pkg.tar.TarInputStream; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * Tar parser. 
+ */ +public class TarParser extends PackageParser { + + /** + * Parses the given stream as a tar file. + */ + public void parse( + InputStream stream, ContentHandler handler, Metadata metadata) + throws IOException, TikaException, SAXException { + metadata.set(Metadata.CONTENT_TYPE, "application/x-tar"); + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + + // At the end we want to close the tar stream to release any associated + // resources, but the underlying document stream should not be closed + TarInputStream tar = + new TarInputStream(new CloseShieldInputStream(stream)); + try { + TarEntry entry = tar.getNextEntry(); + while (entry != null) { + if (!entry.isDirectory()) { + Metadata entrydata = new Metadata(); + entrydata.set(Metadata.RESOURCE_NAME_KEY, entry.getName()); + parseEntry(tar, xhtml, entrydata); + } + entry = tar.getNextEntry(); + } + } finally { + tar.close(); + } + + xhtml.endDocument(); + } + +} Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarBuffer.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarBuffer.java?rev=692227&r1=692208&r2=692227&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarBuffer.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarBuffer.java Thu Sep 4 13:33:07 2008 @@ -19,9 +19,11 @@ /* * This package is based on the work done by Timothy Gerard Endres * ([EMAIL PROTECTED]) to whom the Ant project is very grateful for his great code. + * + * This package has since been copied from Apache Ant to Apache Tika. 
*/ -package org.apache.tools.tar; +package org.apache.tika.parser.pkg.tar; import java.io.InputStream; import java.io.OutputStream; Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarConstants.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarConstants.java?rev=692227&r1=692208&r2=692227&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarConstants.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarConstants.java Thu Sep 4 13:33:07 2008 @@ -19,9 +19,11 @@ /* * This package is based on the work done by Timothy Gerard Endres * ([EMAIL PROTECTED]) to whom the Ant project is very grateful for his great code. + * + * This package has since been copied from Apache Ant to Apache Tika. */ -package org.apache.tools.tar; +package org.apache.tika.parser.pkg.tar; /** * This interface contains all the definitions used in the package. Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarEntry.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarEntry.java?rev=692227&r1=692208&r2=692227&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarEntry.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarEntry.java Thu Sep 4 13:33:07 2008 @@ -19,9 +19,11 @@ /* * This package is based on the work done by Timothy Gerard Endres * ([EMAIL PROTECTED]) to whom the Ant project is very grateful for his great code. + * + * This package has since been copied from Apache Ant to Apache Tika. 
*/ -package org.apache.tools.tar; +package org.apache.tika.parser.pkg.tar; import java.io.File; import java.util.Date; Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarInputStream.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarInputStream.java?rev=692227&r1=692208&r2=692227&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarInputStream.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarInputStream.java Thu Sep 4 13:33:07 2008 @@ -19,9 +19,11 @@ /* * This package is based on the work done by Timothy Gerard Endres * ([EMAIL PROTECTED]) to whom the Ant project is very grateful for his great code. + * + * This package has since been copied from Apache Ant to Apache Tika. */ -package org.apache.tools.tar; +package org.apache.tika.parser.pkg.tar; import java.io.FilterInputStream; import java.io.IOException; Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarUtils.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarUtils.java?rev=692227&r1=692208&r2=692227&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarUtils.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarUtils.java Thu Sep 4 13:33:07 2008 @@ -19,9 +19,11 @@ /* * This package is based on the work done by Timothy Gerard Endres * ([EMAIL PROTECTED]) to whom the Ant project is very grateful for his great code. + * + * This package has since been copied from Apache Ant to Apache Tika. */ -package org.apache.tools.tar; +package org.apache.tika.parser.pkg.tar; /** * This class provides static utility methods to work with byte streams. 
Modified: incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=692227&r1=692226&r2=692227&view=diff ============================================================================== --- incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (original) +++ incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Thu Sep 4 13:33:07 2008 @@ -395,6 +395,16 @@ <glob pattern="*.zip" /> </mime-type> + <mime-type type="application/x-tar"> + <magic priority="40"> + <!-- POSIX tar archive --> + <match value="ustar\0" type="string" offset="257" /> + <!-- GNU tar archive --> + <match value="ustar \0" type="string" offset="257" /> + </magic> + <glob pattern="*.tar" /> + </mime-type> + <mime-type type="application/msword"> <glob pattern="*.doc" /> <alias type="application/vnd.ms-word" /> Modified: incubator/tika/trunk/src/main/resources/tika-config.xml URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=692227&r1=692226&r2=692227&view=diff ============================================================================== --- incubator/tika/trunk/src/main/resources/tika-config.xml (original) +++ incubator/tika/trunk/src/main/resources/tika-config.xml Thu Sep 4 13:33:07 2008 @@ -109,6 +109,10 @@ <mime>application/zip</mime> </parser> + <parser name="parse-tar" class="org.apache.tika.parser.pkg.TarParser"> + <mime>application/x-tar</mime> + </parser> + </parsers> </properties> \ No newline at end of file Propchange: incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/ ------------------------------------------------------------------------------ svn:mergeinfo = Added: incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java?rev=692227&view=auto 
============================================================================== --- incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java (added) +++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java Thu Sep 4 13:33:07 2008 @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pkg; + +import java.io.InputStream; + +import junit.framework.TestCase; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.xml.sax.ContentHandler; + +/** + * Test case for parsing tar files. + */ +public class TarParserTest extends TestCase { + + public void testTarParsing() throws Exception { + Parser parser = new AutoDetectParser(); // Should auto-detect! 
+ ContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + + InputStream stream = TarParserTest.class.getResourceAsStream( + "/test-documents/test-documents.tar"); + try { + parser.parse(stream, handler, metadata); + } finally { + stream.close(); + } + + assertEquals("application/x-tar", metadata.get(Metadata.CONTENT_TYPE)); + String content = handler.toString(); + System.out.println(content); + assertTrue(content.contains("test-documents/testEXCEL.xls")); + assertTrue(content.contains("Sample Excel Worksheet")); + assertTrue(content.contains("test-documents/testHTML.html")); + assertTrue(content.contains("Test Indexation Html")); + assertTrue(content.contains("test-documents/testOpenOffice2.odt")); + assertTrue(content.contains("This is a sample Open Office document")); + assertTrue(content.contains("test-documents/testPDF.pdf")); + assertTrue(content.contains("Apache Tika")); + assertTrue(content.contains("test-documents/testPPT.ppt")); + assertTrue(content.contains("Sample Powerpoint Slide")); + assertTrue(content.contains("test-documents/testRTF.rtf")); + assertTrue(content.contains("indexation Word")); + assertTrue(content.contains("test-documents/testTXT.txt")); + assertTrue(content.contains("Test d'indexation de Txt")); + assertTrue(content.contains("test-documents/testWORD.doc")); + assertTrue(content.contains("This is a sample Microsoft Word Document")); + assertTrue(content.contains("test-documents/testXML.xml")); + assertTrue(content.contains("Rida Benjelloun")); + } + +} Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java?rev=692227&r1=692181&r2=692227&view=diff ============================================================================== --- incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java (original) +++ 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java Thu Sep 4 13:33:07 2008 @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.tika.parser.zip; +package org.apache.tika.parser.pkg; import java.io.InputStream; Added: incubator/tika/trunk/src/test/resources/test-documents/test-documents.tar URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/test-documents.tar?rev=692227&view=auto ============================================================================== Binary file - no diff available. Propchange: incubator/tika/trunk/src/test/resources/test-documents/test-documents.tar ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream