Author: jukka
Date: Thu Sep  4 13:33:07 2008
New Revision: 692227

URL: http://svn.apache.org/viewvc?rev=692227&view=rev
Log:
TIKA-150: Parser for tar files

Added a tar parser implementation based on tar parsing code from Apache Ant. I 
preferred to copy the classes over to Tika instead of adding a dependency on Ant.

Added:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/TarParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/
      - copied from r692208, ant/core/trunk/src/main/org/apache/tools/tar/
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/   (props 
changed)
      - copied from r692181, 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/zip/
    
incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
    incubator/tika/trunk/src/test/resources/test-documents/test-documents.tar   
(with props)
Removed:
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarOutputStream.java
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/zip/
Modified:
    incubator/tika/trunk/CHANGES.txt
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarBuffer.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarConstants.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarEntry.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarInputStream.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarUtils.java
    incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
    incubator/tika/trunk/src/main/resources/tika-config.xml
    
incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java

Modified: incubator/tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=692227&r1=692226&r2=692227&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Thu Sep  4 13:33:07 2008
@@ -64,6 +64,8 @@
 
 27. TIKA-149 - Parser for Zip files (Dave Meikle & Jukka Zitting)
 
+28. TIKA-150 - Parser for tar files (Jukka Zitting)
+
 Release 0.1-incubating - 12/27/2007
 
 1. TIKA-5 - Port Metadata Framework from Nutch (mattmann)

Added: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/TarParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/TarParser.java?rev=692227&view=auto
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/TarParser.java 
(added)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/TarParser.java 
Thu Sep  4 13:33:07 2008
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.pkg.tar.TarEntry;
+import org.apache.tika.parser.pkg.tar.TarInputStream;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Tar parser.
+ */
+public class TarParser extends PackageParser {
+
+    /**
+     * Parses the given stream as a tar file.
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, TikaException, SAXException {
+        metadata.set(Metadata.CONTENT_TYPE, "application/x-tar");
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        // At the end we want to close the tar stream to release any associated
+        // resources, but the underlying document stream should not be closed
+        TarInputStream tar =
+            new TarInputStream(new CloseShieldInputStream(stream));
+        try {
+            TarEntry entry = tar.getNextEntry();
+            while (entry != null) {
+                if (!entry.isDirectory()) {
+                    Metadata entrydata = new Metadata();
+                    entrydata.set(Metadata.RESOURCE_NAME_KEY, entry.getName());
+                    parseEntry(tar, xhtml, entrydata);
+                }
+                entry = tar.getNextEntry();
+            }
+        } finally {
+            tar.close();
+        }
+
+        xhtml.endDocument();
+    }
+
+}

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarBuffer.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarBuffer.java?rev=692227&r1=692208&r2=692227&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarBuffer.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarBuffer.java
 Thu Sep  4 13:33:07 2008
@@ -19,9 +19,11 @@
 /*
  * This package is based on the work done by Timothy Gerard Endres
  * ([EMAIL PROTECTED]) to whom the Ant project is very grateful for his great 
code.
+ *
+ * This package has since been copied from Apache Ant to Apache Tika.
  */
 
-package org.apache.tools.tar;
+package org.apache.tika.parser.pkg.tar;
 
 import java.io.InputStream;
 import java.io.OutputStream;

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarConstants.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarConstants.java?rev=692227&r1=692208&r2=692227&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarConstants.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarConstants.java
 Thu Sep  4 13:33:07 2008
@@ -19,9 +19,11 @@
 /*
  * This package is based on the work done by Timothy Gerard Endres
  * ([EMAIL PROTECTED]) to whom the Ant project is very grateful for his great 
code.
+ *
+ * This package has since been copied from Apache Ant to Apache Tika.
  */
 
-package org.apache.tools.tar;
+package org.apache.tika.parser.pkg.tar;
 
 /**
  * This interface contains all the definitions used in the package.

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarEntry.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarEntry.java?rev=692227&r1=692208&r2=692227&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarEntry.java 
(original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarEntry.java 
Thu Sep  4 13:33:07 2008
@@ -19,9 +19,11 @@
 /*
  * This package is based on the work done by Timothy Gerard Endres
  * ([EMAIL PROTECTED]) to whom the Ant project is very grateful for his great 
code.
+ *
+ * This package has since been copied from Apache Ant to Apache Tika.
  */
 
-package org.apache.tools.tar;
+package org.apache.tika.parser.pkg.tar;
 
 import java.io.File;
 import java.util.Date;

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarInputStream.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarInputStream.java?rev=692227&r1=692208&r2=692227&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarInputStream.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarInputStream.java
 Thu Sep  4 13:33:07 2008
@@ -19,9 +19,11 @@
 /*
  * This package is based on the work done by Timothy Gerard Endres
  * ([EMAIL PROTECTED]) to whom the Ant project is very grateful for his great 
code.
+ *
+ * This package has since been copied from Apache Ant to Apache Tika.
  */
 
-package org.apache.tools.tar;
+package org.apache.tika.parser.pkg.tar;
 
 import java.io.FilterInputStream;
 import java.io.IOException;

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarUtils.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarUtils.java?rev=692227&r1=692208&r2=692227&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarUtils.java 
(original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarUtils.java 
Thu Sep  4 13:33:07 2008
@@ -19,9 +19,11 @@
 /*
  * This package is based on the work done by Timothy Gerard Endres
  * ([EMAIL PROTECTED]) to whom the Ant project is very grateful for his great 
code.
+ *
+ * This package has since been copied from Apache Ant to Apache Tika.
  */
 
-package org.apache.tools.tar;
+package org.apache.tika.parser.pkg.tar;
 
 /**
  * This class provides static utility methods to work with byte streams.

Modified: incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=692227&r1=692226&r2=692227&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (original)
+++ incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Thu Sep  4 
13:33:07 2008
@@ -395,6 +395,16 @@
     <glob pattern="*.zip" />
   </mime-type>
 
+  <mime-type type="application/x-tar">
+    <magic priority="40">
+      <!-- POSIX tar archive -->
+      <match value="ustar\0" type="string" offset="257" />
+      <!-- GNU tar archive -->
+      <match value="ustar  \0" type="string" offset="257" />
+    </magic>
+    <glob pattern="*.tar" />
+  </mime-type>
+
   <mime-type type="application/msword">
     <glob pattern="*.doc" />
     <alias type="application/vnd.ms-word" />

Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=692227&r1=692226&r2=692227&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Thu Sep  4 13:33:07 
2008
@@ -109,6 +109,10 @@
                 <mime>application/zip</mime>
         </parser>
 
+        <parser name="parse-tar" class="org.apache.tika.parser.pkg.TarParser">
+                <mime>application/x-tar</mime>
+        </parser>
+
     </parsers>
 
 </properties>
\ No newline at end of file

Propchange: incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/
------------------------------------------------------------------------------
    svn:mergeinfo = 

Added: 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java?rev=692227&view=auto
==============================================================================
--- 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
 (added)
+++ 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
 Thu Sep  4 13:33:07 2008
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing tar files.
+ */
+public class TarParserTest extends TestCase {
+
+    public void testTarParsing() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        InputStream stream = TarParserTest.class.getResourceAsStream(
+                "/test-documents/test-documents.tar");
+        try {
+            parser.parse(stream, handler, metadata);
+        } finally {
+            stream.close();
+        }
+
+        assertEquals("application/x-tar", metadata.get(Metadata.CONTENT_TYPE));
+        String content = handler.toString();
+        System.out.println(content);
+        assertTrue(content.contains("test-documents/testEXCEL.xls"));
+        assertTrue(content.contains("Sample Excel Worksheet"));
+        assertTrue(content.contains("test-documents/testHTML.html"));
+        assertTrue(content.contains("Test Indexation Html"));
+        assertTrue(content.contains("test-documents/testOpenOffice2.odt"));
+        assertTrue(content.contains("This is a sample Open Office document"));
+        assertTrue(content.contains("test-documents/testPDF.pdf"));
+        assertTrue(content.contains("Apache Tika"));
+        assertTrue(content.contains("test-documents/testPPT.ppt"));
+        assertTrue(content.contains("Sample Powerpoint Slide"));
+        assertTrue(content.contains("test-documents/testRTF.rtf"));
+        assertTrue(content.contains("indexation Word"));
+        assertTrue(content.contains("test-documents/testTXT.txt"));
+        assertTrue(content.contains("Test d'indexation de Txt"));
+        assertTrue(content.contains("test-documents/testWORD.doc"));
+        assertTrue(content.contains("This is a sample Microsoft Word Document"));
+        assertTrue(content.contains("test-documents/testXML.xml"));
+        assertTrue(content.contains("Rida Benjelloun"));
+    }
+
+}

Modified: 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java?rev=692227&r1=692181&r2=692227&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
 (original)
+++ 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
 Thu Sep  4 13:33:07 2008
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.tika.parser.zip;
+package org.apache.tika.parser.pkg;
 
 import java.io.InputStream;
 

Added: incubator/tika/trunk/src/test/resources/test-documents/test-documents.tar
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/test-documents.tar?rev=692227&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
incubator/tika/trunk/src/test/resources/test-documents/test-documents.tar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream


Reply via email to