[tika] 01/01: TIKA-2849 -- move to streaming detection of zip files if TikaInputStream doesn't already have a file.

tallison Fri, 12 Apr 2019 14:11:48 -0700

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-2849
in repository https://gitbox.apache.org/repos/asf/tika.git


commit 1e08fb1e54653954a71c71a68218bcdf7a7a3afb
Author: TALLISON <[email protected]>
AuthorDate: Fri Apr 12 17:11:03 2019 -0400

    TIKA-2849 -- move to streaming detection of zip files if TikaInputStream 
doesn't already have a file.
---
 .../org/apache/tika/io/BoundedInputStream.java     | 118 ++++++++
 .../tika/parser/digest/InputStreamDigester.java    | 103 +------
 .../src/test/java/org/apache/tika/TikaTest.java    |   2 +-
 .../org/apache/tika/parser/epub/EpubParser.java    |  12 +-
 .../tika/parser/iwork/IWorkPackageParser.java      |   2 +-
 .../microsoft/ooxml/OOXMLExtractorFactory.java     |  37 ++-
 .../parser/pkg/AbstractZipContainerDetector.java   | 163 +++++++++++
 .../parser/pkg/StreamingZipContainerDetector.java  | 213 ++++++++++++++
 .../tika/parser/pkg/ZipContainerDetector.java      | 325 ++++++---------------
 .../org/apache/tika/parser/utils/ZipSalvager.java  |  75 ++---
 .../parser/microsoft/ooxml/TruncatedOOXMLTest.java |   1 +
 .../tika/parser/pkg/ZipContainerDetectorTest.java  | 166 ++++++++++-
 .../org/apache/tika/parser/pkg/tika-config.xml     |  31 ++
 13 files changed, 857 insertions(+), 391 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java 
b/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java
new file mode 100644
index 0000000..dabedf5
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.io;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Very slight modification of Commons' BoundedInputStream
+ * so that we can figure out if this hit the bound or not.
+ * <p>
+ * At most <code>max</code> bytes are handed out from the wrapped stream;
+ * a negative <code>max</code> disables the bound. <code>close()</code> is
+ * not overridden, so closing this stream does not close the wrapped stream.
+ */
+public class BoundedInputStream extends InputStream {
+
+    private static final int EOF = -1;
+    private final long max;
+    private final InputStream in;
+    private long pos;
+    //value of pos at the most recent mark(); restored by reset()
+    private long markPos;
+
+    /**
+     * @param max maximum number of bytes to hand out from <code>in</code>;
+     *            a negative value means "no bound"
+     * @param in  the stream to wrap
+     */
+    public BoundedInputStream(long max, InputStream in) {
+        this.max = max;
+        this.in = in;
+    }
+
+    /**
+     * Reads a single byte from the delegate unless the bound has been reached.
+     *
+     * @return the byte read or -1 if the end of stream or
+     * the limit has been reached.
+     * @throws IOException if an I/O error occurs
+     */
+    @Override
+    public int read() throws IOException {
+        if (max >= 0 && pos >= max) {
+            return EOF;
+        }
+        final int result = in.read();
+        //only count bytes that were actually delivered; counting the EOF
+        //marker would let hasHitBound() report a bound that was never reached
+        if (result != EOF) {
+            pos++;
+        }
+        return result;
+    }
+
+    /**
+     * Invokes the delegate's <code>read(byte[])</code> method.
+     *
+     * @param b the buffer to read the bytes into
+     * @return the number of bytes read or -1 if the end of stream or
+     * the limit has been reached.
+     * @throws IOException if an I/O error occurs
+     */
+    @Override
+    public int read(final byte[] b) throws IOException {
+        return this.read(b, 0, b.length);
+    }
+
+    /**
+     * Invokes the delegate's <code>read(byte[], int, int)</code> method,
+     * asking for no more bytes than are left before the bound.
+     *
+     * @param b   the buffer to read the bytes into
+     * @param off The start offset
+     * @param len The number of bytes to read
+     * @return the number of bytes read or -1 if the end of stream or
+     * the limit has been reached.
+     * @throws IOException if an I/O error occurs
+     */
+    @Override
+    public int read(final byte[] b, final int off, final int len) throws IOException {
+        if (max >= 0 && pos >= max) {
+            return EOF;
+        }
+        final long maxRead = max >= 0 ? Math.min(len, max - pos) : len;
+        final int bytesRead = in.read(b, off, (int) maxRead);
+
+        if (bytesRead == EOF) {
+            return EOF;
+        }
+
+        pos += bytesRead;
+        return bytesRead;
+    }
+
+    /**
+     * Invokes the delegate's <code>skip(long)</code> method,
+     * skipping no further than the bound.
+     *
+     * @param n the number of bytes to skip
+     * @return the actual number of bytes skipped
+     * @throws IOException if an I/O error occurs
+     */
+    @Override
+    public long skip(final long n) throws IOException {
+        final long toSkip = max >= 0 ? Math.min(n, max - pos) : n;
+        final long skippedBytes = in.skip(toSkip);
+        pos += skippedBytes;
+        return skippedBytes;
+    }
+
+    /**
+     * Resets the delegate and restores the byte count recorded by the
+     * most recent {@link #mark(int)}. If <code>mark</code> was never
+     * called the count goes back to 0.
+     *
+     * @throws IOException if the delegate cannot be reset
+     */
+    @Override
+    public void reset() throws IOException {
+        in.reset();
+        //going back to 0 unconditionally would hand out more than max bytes
+        //whenever mark() had been called after some bytes were consumed
+        pos = markPos;
+    }
+
+    /**
+     * Marks the delegate and remembers how many bytes have been consumed so far.
+     * Note that <code>markSupported()</code> is not overridden in this class.
+     *
+     * @param readLimit passed through to the delegate's <code>mark(int)</code>
+     */
+    @Override
+    public void mark(int readLimit) {
+        in.mark(readLimit);
+        markPos = pos;
+    }
+
+    /**
+     * @return <code>true</code> if a bound was set (<code>max &gt;= 0</code>) and
+     * at least that many bytes have been consumed; always <code>false</code>
+     * for an unbounded stream
+     */
+    public boolean hasHitBound() {
+        return max >= 0 && pos >= max;
+    }
+}
+
diff --git 
a/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java
 
b/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java
index a208fab..3d3ff17 100644
--- 
a/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java
+++ 
b/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java
@@ -26,6 +26,7 @@ import java.security.NoSuchAlgorithmException;
 import java.security.Provider;
 
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.BoundedInputStream;
 import org.apache.tika.io.IOExceptionWithCause;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
@@ -126,7 +127,7 @@ public class InputStreamDigester implements 
DigestingParser.Digester {
         //try the usual mark/reset stuff.
         //however, if you actually hit the bound,
         //then stop and spool to file via TikaInputStream
-        SimpleBoundedInputStream bis = new SimpleBoundedInputStream(markLimit, 
is);
+        BoundedInputStream bis = new BoundedInputStream(markLimit, is);
         boolean finishedStream = false;
         bis.mark(markLimit + 1);
         finishedStream = digestStream(bis, metadata);
@@ -153,7 +154,6 @@ public class InputStreamDigester implements 
DigestingParser.Digester {
         }
     }
 
-
     private String getMetadataKey() {
         return TikaCoreProperties.TIKA_META_PREFIX +
                 "digest" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
@@ -179,8 +179,8 @@ public class InputStreamDigester implements 
DigestingParser.Digester {
         updateDigest(messageDigest, is);
         digestBytes = messageDigest.digest();
 
-        if (is instanceof SimpleBoundedInputStream) {
-            if (((SimpleBoundedInputStream) is).hasHitBound()) {
+        if (is instanceof BoundedInputStream) {
+            if (((BoundedInputStream) is).hasHitBound()) {
                 return false;
             }
         }
@@ -202,99 +202,4 @@ public class InputStreamDigester implements 
DigestingParser.Digester {
         return digest;
     }
 
-
-    /**
-     * Very slight modification of Commons' BoundedInputStream
-     * so that we can figure out if this hit the bound or not.
-     */
-    private static class SimpleBoundedInputStream extends InputStream {
-        private final static int EOF = -1;
-        private final long max;
-        private final InputStream in;
-        private long pos;
-
-        private SimpleBoundedInputStream(long max, InputStream in) {
-            this.max = max;
-            this.in = in;
-        }
-
-        @Override
-        public int read() throws IOException {
-            if (max >= 0 && pos >= max) {
-                return EOF;
-            }
-            final int result = in.read();
-            pos++;
-            return result;
-        }
-
-        /**
-         * Invokes the delegate's <code>read(byte[])</code> method.
-         *
-         * @param b the buffer to read the bytes into
-         * @return the number of bytes read or -1 if the end of stream or
-         * the limit has been reached.
-         * @throws IOException if an I/O error occurs
-         */
-        @Override
-        public int read(final byte[] b) throws IOException {
-            return this.read(b, 0, b.length);
-        }
-
-        /**
-         * Invokes the delegate's <code>read(byte[], int, int)</code> method.
-         *
-         * @param b   the buffer to read the bytes into
-         * @param off The start offset
-         * @param len The number of bytes to read
-         * @return the number of bytes read or -1 if the end of stream or
-         * the limit has been reached.
-         * @throws IOException if an I/O error occurs
-         */
-        @Override
-        public int read(final byte[] b, final int off, final int len) throws 
IOException {
-            if (max >= 0 && pos >= max) {
-                return EOF;
-            }
-            final long maxRead = max >= 0 ? Math.min(len, max - pos) : len;
-            final int bytesRead = in.read(b, off, (int) maxRead);
-
-            if (bytesRead == EOF) {
-                return EOF;
-            }
-
-            pos += bytesRead;
-            return bytesRead;
-        }
-
-        /**
-         * Invokes the delegate's <code>skip(long)</code> method.
-         *
-         * @param n the number of bytes to skip
-         * @return the actual number of bytes skipped
-         * @throws IOException if an I/O error occurs
-         */
-        @Override
-        public long skip(final long n) throws IOException {
-            final long toSkip = max >= 0 ? Math.min(n, max - pos) : n;
-            final long skippedBytes = in.skip(toSkip);
-            pos += skippedBytes;
-            return skippedBytes;
-        }
-
-        @Override
-        public void reset() throws IOException {
-            in.reset();
-            pos = 0;
-        }
-
-        @Override
-        public void mark(int readLimit) {
-            in.mark(readLimit);
-        }
-
-        public boolean hasHitBound() {
-            return pos >= max;
-        }
-    }
 }
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java 
b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 00d8600..0aaaf35 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -394,7 +394,7 @@ public abstract class TikaTest {
             IOUtils.copy(is, bos);
         }
         if (truncatedLength > bos.toByteArray().length) {
-            throw new EOFException("Can't truncate beyond file length");
+            throw new EOFException("Can't truncate beyond file length: 
"+bos.toByteArray().length);
         }
         byte[] truncated = new byte[truncatedLength];
         System.arraycopy(bos.toByteArray(), 0, truncated, 0, truncatedLength);
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java
index df5b221..49019b6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java
@@ -175,6 +175,12 @@ public class EpubParser extends AbstractParser {
         TemporaryResources temporaryResources = null;
         if (TikaInputStream.isTikaInputStream(stream)) {
             tis = TikaInputStream.cast(stream);
+            if (tis.getOpenContainer() instanceof ZipFile) {
+                bufferedParseZipFile(
+                        (ZipFile)tis.getOpenContainer(),
+                        bodyHandler, xhtml, metadata, context, true);
+                return;
+            }
         } else {
             temporaryResources = new TemporaryResources();
             tis = TikaInputStream.get(new CloseShieldInputStream(stream), 
temporaryResources);
@@ -192,7 +198,11 @@ public class EpubParser extends AbstractParser {
                 tis.close();
             }
         }
-        bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, 
true);
+        try {
+            bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, 
context, true);
+        } finally {
+            zipFile.close();
+        }
     }
 
     private void trySalvage(Path brokenZip, ContentHandler bodyHandler,
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
index 5d8f01a..2ffbf56 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
@@ -119,7 +119,7 @@ public class IWorkPackageParser extends AbstractParser {
           return detectType(zip);
        }
        
-       private static IWORKDocumentType detectType(InputStream stream) {
+       public static IWORKDocumentType detectType(InputStream stream) {
           QName qname = new XmlRootExtractor().extractRootElement(stream);
           if (qname != null) {
              String uri = qname.getNamespaceURI();
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 017469b..c37d895 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.parser.microsoft.ooxml;
 
+import java.io.EOFException;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
@@ -55,6 +56,7 @@ import 
org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtra
 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
 import org.apache.tika.parser.pkg.ZipContainerDetector;
 import org.apache.tika.parser.utils.ZipSalvager;
+import org.apache.tika.utils.RereadableInputStream;
 import org.apache.xmlbeans.XmlException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -68,6 +70,7 @@ import org.xml.sax.SAXException;
 public class OOXMLExtractorFactory {
 
     private static final Logger LOG = 
LoggerFactory.getLogger(OOXMLExtractorFactory.class);
+    private static final int MAX_BUFFER_LENGTH = 1000000;
 
     public static void parse(
             InputStream stream, ContentHandler baseHandler,
@@ -98,14 +101,36 @@ public class OOXMLExtractorFactory {
                 }
                 tis.setOpenContainer(pkg);
             } else {
-                InputStream shield = new CloseShieldInputStream(stream);
-                pkg = OPCPackage.open(shield);
+                //OPCPackage slurps rris into memory so we can close rris
+                //without apparent problems
+                try (RereadableInputStream rereadableInputStream =
+                        new RereadableInputStream(stream, MAX_BUFFER_LENGTH,
+                                true, false)) {
+                    try {
+                        pkg = OPCPackage.open(rereadableInputStream);
+                    } catch (EOFException e) {
+                        rereadableInputStream.rewind();
+                        tmpRepairedCopy = 
File.createTempFile("tika-ooxml-repair", "");
+                        ZipSalvager.salvageCopy(rereadableInputStream, 
tmpRepairedCopy);
+                        pkg = OPCPackage.open(tmpRepairedCopy, 
PackageAccess.READ);
+                    }
+                }
+            }
+
+            MediaType type = null;
+            String mediaTypeString = metadata.get(Metadata.CONTENT_TYPE);
+            if (mediaTypeString != null) {
+                type = MediaType.parse(mediaTypeString);
+            }
+            if (type != null && 
OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) {
+                // Not a supported type, delegate to Empty Parser
+                EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, 
context);
+                return;
             }
 
-            // Get the type, and ensure it's one we handle
-            MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg);
-            if (type == null) {
-                type = ZipContainerDetector.detectXPSOPC(pkg);
+            if (type == null || ! OOXMLParser.SUPPORTED_TYPES.contains(type)) {
+                // Get the type, and ensure it's one we handle
+                type = ZipContainerDetector.detectOfficeOpenXML(pkg);
             }
 
             if (type == null || 
OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) {
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/AbstractZipContainerDetector.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/AbstractZipContainerDetector.java
new file mode 100644
index 0000000..beadf74
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/AbstractZipContainerDetector.java
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Media type constants and entry-name based heuristics shared by the
+ * zip container detectors in this package.
+ */
+abstract class AbstractZipContainerDetector {
+
+    static final MediaType TIKA_OOXML = MediaType.application("x-tika-ooxml");
+    static final MediaType DOCX =
+            MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
+    static final MediaType DOCM =
+            MediaType.application("vnd.ms-word.document.macroEnabled.12");
+    //was a copy of DOCM, which made Word templates indistinguishable
+    //from macro-enabled documents
+    static final MediaType DOTX =
+            MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.template");
+    static final MediaType PPTX =
+            MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation");
+
+    static final MediaType PPSM =
+            MediaType.application("vnd.ms-powerpoint.slideshow.macroEnabled.12");
+    static final MediaType PPSX =
+            MediaType.application("vnd.openxmlformats-officedocument.presentationml.slideshow");
+    static final MediaType PPTM =
+            MediaType.application("vnd.ms-powerpoint.presentation.macroEnabled.12");
+    static final MediaType POTM =
+            MediaType.application("vnd.ms-powerpoint.template.macroenabled.12");
+    static final MediaType POTX =
+            MediaType.application("vnd.openxmlformats-officedocument.presentationml.template");
+    //NOTE(review): this value has no format-specific suffix, unlike its
+    //siblings -- confirm it is the intended media type for theme files
+    static final MediaType THMX =
+            MediaType.application("vnd.openxmlformats-officedocument");
+    static final MediaType XLSB =
+            MediaType.application("vnd.ms-excel.sheet.binary.macroenabled.12");
+    static final MediaType XLSX =
+            MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+    static final MediaType XLSM =
+            MediaType.application("vnd.ms-excel.sheet.macroEnabled.12");
+    static final MediaType XPS =
+            MediaType.application("vnd.ms-xpsdocument");
+
+    /**
+     * Entry names that hint at an OOXML package.
+     */
+    static final Set<String> OOXML_HINTS = fillSet(
+            "word/document.xml",
+            "_rels/.rels",
+            "[Content_Types].xml",
+            "ppt/presentation.xml",
+            "ppt/slides/slide1.xml",
+            "xl/workbook.xml",
+            "xl/sharedStrings.xml",
+            "xl/worksheets/sheet1.xml"
+    );
+
+    /**
+     * To be considered as an IPA file, it needs to match all of these
+     */
+    private static final Set<Pattern> IPA_ENTRY_PATTERNS = compilePatterns(
+            "^Payload/$",
+            "^Payload/.*\\.app/$",
+            "^Payload/.*\\.app/_CodeSignature/$",
+            "^Payload/.*\\.app/_CodeSignature/CodeResources$",
+            "^Payload/.*\\.app/Info\\.plist$",
+            "^Payload/.*\\.app/PkgInfo$"
+    );
+
+    /**
+     * @param args the strings to collect
+     * @return an unmodifiable set holding the given strings
+     */
+    private static Set<String> fillSet(String... args) {
+        Set<String> tmp = new HashSet<>();
+        Collections.addAll(tmp, args);
+        return Collections.unmodifiableSet(tmp);
+    }
+
+    /**
+     * @param regexes the regular expressions to compile
+     * @return an unmodifiable set holding one compiled pattern per expression
+     */
+    private static Set<Pattern> compilePatterns(String... regexes) {
+        Set<Pattern> tmp = new HashSet<>();
+        for (String regex : regexes) {
+            tmp.add(Pattern.compile(regex));
+        }
+        return Collections.unmodifiableSet(tmp);
+    }
+
+    /**
+     * Looks for the entries that mark a jar or one of the jar-based formats.
+     *
+     * @param entryNames names of the entries in the zip
+     * @return the APK, WAR, EAR or plain jar media type, or
+     * <code>null</code> if none of the marker entries is present
+     */
+    static MediaType detectJar(Set<String> entryNames) {
+        // Is it an Android APK? Some Android APKs miss the default Manifest,
+        // so this check does not depend on META-INF/MANIFEST.MF
+        if (entryNames.contains("AndroidManifest.xml")) {
+            return MediaType.application("vnd.android.package-archive");
+        }
+        if (!entryNames.contains("META-INF/MANIFEST.MF")) {
+            return null;
+        }
+        // It's a Jar file, or something based on Jar
+
+        // Check for WAR and EAR
+        if (entryNames.contains("WEB-INF/")) {
+            return MediaType.application("x-tika-java-web-archive");
+        }
+        if (entryNames.contains("META-INF/application.xml")) {
+            return MediaType.application("x-tika-java-enterprise-archive");
+        }
+
+        // Looks like a regular Jar Archive
+        return MediaType.application("java-archive");
+    }
+
+    /**
+     * Looks for a single kml at the main level. Names containing
+     * a path separator are ignored.
+     *
+     * @param entryFileNames names of the file entries in the zip
+     * @return the kmz media type if exactly one top-level entry exists and it
+     * ends in ".kml"; <code>null</code> otherwise
+     */
+    static MediaType detectKmz(Set<String> entryFileNames) {
+        boolean kmlFound = false;
+        for (String entryFileName : entryFileNames) {
+            boolean nested = entryFileName.indexOf('/') != -1
+                    || entryFileName.indexOf('\\') != -1;
+            if (nested) {
+                continue;
+            }
+            //a second kml or any other top-level file rules kmz out
+            if (kmlFound || !entryFileName.endsWith(".kml")) {
+                return null;
+            }
+            kmlFound = true;
+        }
+        return kmlFound ? MediaType.application("vnd.google-earth.kmz") : null;
+    }
+
+    /**
+     * Checks whether every one of the IPA patterns is matched by
+     * at least one entry name.
+     *
+     * @param entryNames names of the entries in the zip
+     * @return the IPA media type once all patterns have been matched,
+     * <code>null</code> otherwise
+     */
+    static MediaType detectIpa(Set<String> entryNames) {
+        // Note - consider generalising this logic, if another format needs many regexp matching
+        //work on a copy so that matched patterns can be ticked off
+        Set<Pattern> unmatched = new HashSet<>(IPA_ENTRY_PATTERNS);
+
+        for (String entryName : entryNames) {
+            Iterator<Pattern> ip = unmatched.iterator();
+            while (ip.hasNext()) {
+                if (ip.next().matcher(entryName).matches()) {
+                    ip.remove();
+                }
+            }
+            if (unmatched.isEmpty()) {
+                // We've found everything we need to find
+                return MediaType.application("x-itunes-ipa");
+            }
+        }
+        return null;
+    }
+
+}
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
new file mode 100644
index 0000000..4d54b61
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
@@ -0,0 +1,213 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.io.IOUtils;
+import org.apache.poi.xslf.usermodel.XSLFRelation;
+import org.apache.poi.xssf.usermodel.XSSFRelation;
+import org.apache.poi.xwpf.usermodel.XWPFRelation;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.iwork.IWorkPackageParser;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Detects the type of a zip-based container by streaming through its
+ * entries with a {@link ZipArchiveInputStream}.
+ */
+class StreamingZipContainerDetector extends AbstractZipContainerDetector {
+
+    /**
+     * Maps the content type strings looked up in [Content_Types].xml
+     * to the media type that is reported for them.
+     */
+    static final Map<String, MediaType> OOXML_CONTENT_TYPES = new ConcurrentHashMap<>();
+    static {
+        OOXML_CONTENT_TYPES.put(XWPFRelation.DOCUMENT.getContentType(), DOCX);
+        OOXML_CONTENT_TYPES.put(XWPFRelation.MACRO_DOCUMENT.getContentType(), DOCM);
+        OOXML_CONTENT_TYPES.put(XWPFRelation.TEMPLATE.getContentType(), DOTX);
+
+        OOXML_CONTENT_TYPES.put(XSSFRelation.WORKBOOK.getContentType(), XLSX);
+        OOXML_CONTENT_TYPES.put(XSSFRelation.MACROS_WORKBOOK.getContentType(), XLSM);
+        OOXML_CONTENT_TYPES.put(XSSFRelation.XLSB_BINARY_WORKBOOK.getContentType(), XLSB);
+        OOXML_CONTENT_TYPES.put(XSLFRelation.MAIN.getContentType(), PPTX);
+        OOXML_CONTENT_TYPES.put(XSLFRelation.MACRO.getContentType(), PPSM);
+        OOXML_CONTENT_TYPES.put(XSLFRelation.MACRO_TEMPLATE.getContentType(), POTM);
+        OOXML_CONTENT_TYPES.put(XSLFRelation.PRESENTATIONML.getContentType(), PPSX);
+        OOXML_CONTENT_TYPES.put(XSLFRelation.PRESENTATION_MACRO.getContentType(), PPTM);
+        //this key used to be registered twice (first with PPTM); only the
+        //later POTX registration ever took effect, so only that one is kept
+        OOXML_CONTENT_TYPES.put(XSLFRelation.PRESENTATIONML_TEMPLATE.getContentType(), POTX);
+        OOXML_CONTENT_TYPES.put(XSLFRelation.THEME_MANAGER.getContentType(), THMX);
+        OOXML_CONTENT_TYPES.put("application/vnd.ms-package.xps-fixeddocumentsequence+xml", XPS);
+    }
+
+    /**
+     * Streams through the zip entries and returns as soon as an entry
+     * identifies the container. If no entry does, the entry names that were
+     * collected are run through the name-based heuristics.
+     *
+     * @param is stream positioned at the start of the zip
+     * @return the detected type; <code>application/zip</code> if nothing more
+     * specific was recognised. The result is whatever
+     * <code>MediaType.parse</code> returns when a "mimetype" entry is found.
+     */
+    static MediaType detect(InputStream is) {
+
+        Set<String> fileNames = new HashSet<>();
+        Set<String> directoryNames = new HashSet<>();
+        try {
+            //NOTE(review): this wrapper is never closed here -- presumably so
+            //that the caller's stream stays open; confirm
+            ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream(is);
+            ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
+            while (zae != null) {
+                String name = zae.getName();
+                if (zae.isDirectory()) {
+                    directoryNames.add(name);
+                    zae = zipArchiveInputStream.getNextZipEntry();
+                    continue;
+                }
+                fileNames.add(name);
+                //we could also parse _rels/.rels, but if
+                //there isn't a valid [Content_Types].xml, then POI
+                //will throw an exception...Better to back off to a
+                //generic type than to precisely identify a truncated file
+                if (name.equals("[Content_Types].xml")) {
+                    MediaType mt = parseOOXMLContentTypes(zipArchiveInputStream);
+                    if (mt != null) {
+                        return mt;
+                    }
+                    return TIKA_OOXML;
+                } else if (IWorkPackageParser.IWORK_CONTENT_ENTRIES.contains(name)) {
+                    IWorkPackageParser.IWORKDocumentType type =
+                            IWorkPackageParser.IWORKDocumentType.detectType(zipArchiveInputStream);
+                    if (type != null) {
+                        return type.getType();
+                    }
+                } else if (name.equals("mimetype")) {
+                    //odt
+                    return MediaType.parse(IOUtils.toString(zipArchiveInputStream, UTF_8));
+                }
+                zae = zipArchiveInputStream.getNextZipEntry();
+            }
+        } catch (SecurityException e) {
+            throw e;
+        } catch (Exception e) {
+            //swallow: detection carries on below with
+            //whatever entry names were collected before the failure
+        }
+        //entryNames is the union of the file names and the directory names;
+        //detectJar and detectIpa look for directory entries such as "WEB-INF/"
+        Set<String> entryNames = new HashSet<>(fileNames);
+        entryNames.addAll(directoryNames);
+        MediaType mt = detectKmz(fileNames);
+        if (mt != null) {
+            return mt;
+        }
+        mt = detectJar(entryNames);
+        if (mt != null) {
+            return mt;
+        }
+        mt = detectIpa(entryNames);
+        if (mt != null) {
+            return mt;
+        }
+        mt = detectIWorks(entryNames);
+        if (mt != null) {
+            return mt;
+        }
+        //more than two of the well-known OOXML entries: call it OOXML
+        int hits = 0;
+        for (String s : OOXML_HINTS) {
+            if (entryNames.contains(s)) {
+                hits++;
+            }
+        }
+        if (hits > 2) {
+            return TIKA_OOXML;
+        }
+        return MediaType.APPLICATION_ZIP;
+    }
+
+    /**
+     * @param entryNames names of the entries in the zip
+     * @return the general iWork type if the common iWork entry is present,
+     * <code>null</code> otherwise
+     */
+    private static MediaType detectIWorks(Set<String> entryNames) {
+        //general iworks
+        if (entryNames.contains(IWorkPackageParser.IWORK_COMMON_ENTRY)) {
+            return MediaType.application("vnd.apple.iwork");
+        }
+        return null;
+    }
+
+    /**
+     * Collects the values of all "Type" attributes in the given xml.
+     *
+     * @param is stream with the xml to parse
+     * @return the values found; whatever was collected before a
+     * parse failure is still returned
+     */
+    public static Set<String> parseOOXMLRels(InputStream is) {
+        RelsHandler relsHandler = new RelsHandler();
+        try {
+            XMLReaderUtils.parseSAX(is, relsHandler, new ParseContext());
+        } catch (SecurityException e) {
+            throw e;
+        } catch (Exception e) {
+            //swallow: return what was collected before the failure
+        }
+        return relsHandler.rels;
+    }
+
+    /**
+     * Gathers the value of every attribute whose local name is "Type".
+     */
+    private static class RelsHandler extends DefaultHandler {
+        Set<String> rels = new HashSet<>();
+
+        @Override
+        public void startElement(String uri, String localName,
+                                 String name, Attributes attrs) throws SAXException {
+            for (int i = 0; i < attrs.getLength(); i++) {
+                if (attrs.getLocalName(i).equals("Type")) {
+                    rels.add(attrs.getValue(i));
+                }
+            }
+        }
+    }
+
+    /**
+     * Looks for the first "ContentType" attribute value that is
+     * a key in {@link #OOXML_CONTENT_TYPES}.
+     *
+     * @param is stream with the xml to parse
+     * @return the media type mapped to that value, or <code>null</code>
+     * if no known content type was seen
+     */
+    public static MediaType parseOOXMLContentTypes(InputStream is) {
+        ContentTypeHandler contentTypeHandler = new ContentTypeHandler();
+        try {
+            XMLReaderUtils.parseSAX(is, contentTypeHandler, new ParseContext());
+        } catch (SecurityException e) {
+            throw e;
+        } catch (Exception e) {
+            //swallow: this includes the StoppingEarlyException that the
+            //handler throws once it has found a known content type
+        }
+        return contentTypeHandler.mediaType;
+    }
+
+    /**
+     * Records the media type for the first known "ContentType"
+     * attribute value and then aborts the parse.
+     */
+    private static class ContentTypeHandler extends DefaultHandler {
+
+        private MediaType mediaType = null;
+
+        @Override
+        public void startElement(String uri, String localName,
+                                 String name, Attributes attrs) throws SAXException {
+            for (int i = 0; i < attrs.getLength(); i++) {
+                if (!attrs.getLocalName(i).equals("ContentType")) {
+                    continue;
+                }
+                MediaType known = OOXML_CONTENT_TYPES.get(attrs.getValue(i));
+                if (known != null) {
+                    mediaType = known;
+                    //no need to read the rest of the xml
+                    throw new StoppingEarlyException();
+                }
+            }
+        }
+    }
+
+    /**
+     * Thrown by {@link ContentTypeHandler} to stop parsing once a
+     * known content type has been found.
+     */
+    private static class StoppingEarlyException extends SAXException {
+
+    }
+}
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index 3f2303b..0f448d5 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -16,10 +16,22 @@
  */
 package org.apache.tika.parser.pkg;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Locale;
+import java.util.Set;
+import java.util.regex.Pattern;
+
 import org.apache.commons.compress.archivers.ArchiveException;
 import org.apache.commons.compress.archivers.ArchiveStreamFactory;
 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
-import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
 import org.apache.commons.compress.archivers.zip.ZipFile;
 import org.apache.commons.compress.compressors.CompressorException;
 import org.apache.commons.compress.compressors.CompressorStreamFactory;
@@ -31,39 +43,15 @@ import 
org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
 import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
 import org.apache.poi.openxml4j.util.ZipEntrySource;
 import org.apache.poi.openxml4j.util.ZipFileZipEntrySource;
-import org.apache.poi.xslf.usermodel.XSLFRelation;
-import org.apache.poi.xssf.usermodel.XSSFRelation;
-import org.apache.poi.xwpf.usermodel.XWPFRelation;
+import org.apache.tika.config.Field;
 import org.apache.tika.detect.Detector;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.BoundedInputStream;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.iwork.IWorkPackageParser;
 import org.apache.tika.parser.iwork.IWorkPackageParser.IWORKDocumentType;
 import org.apache.tika.parser.iwork.iwana.IWork13PackageParser;
-import org.apache.tika.utils.XMLReaderUtils;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-import java.io.ByteArrayInputStream;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Enumeration;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Set;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.regex.Pattern;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
 
 /**
  * A detector that works on Zip documents and other archive and compression
@@ -95,45 +83,14 @@ public class ZipContainerDetector implements Detector {
     private static final String XPS_DOCUMENT =
             "http://schemas.microsoft.com/xps/2005/06/fixedrepresentation";
 
-    private static final MediaType TIKA_OOXML = 
MediaType.application("x-tika-ooxml");
-    private static final MediaType DOCX =
-            
MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
-    private static final MediaType DOCM =
-            MediaType.application("vnd.ms-word.document.macroEnabled.12");
-    private static final MediaType DOTX =
-            MediaType.application("vnd.ms-word.document.macroEnabled.12");
-    private static final MediaType PPTX =
-            
MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation");
-    private static final MediaType PPTM =
-            
MediaType.application("vnd.ms-powerpoint.presentation.macroEnabled.12");
-    private static final MediaType POTX =
-            
MediaType.application("vnd.openxmlformats-officedocument.presentationml.template");
-    private static final MediaType XLSX =
-            
MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet");
-    private static final MediaType XLSM =
-            MediaType.application("vnd.ms-excel.sheet.macroEnabled.12");
-
-    private static final Set<String> OOXML_HINTS = fillSet(
-            "word/document.xml",
-            "_rels/.rels",
-            "[Content_Types].xml",
-            "ppt/presentation.xml",
-            "ppt/slides/slide1.xml",
-            "xl/workbook.xml",
-            "xl/sharedStrings.xml",
-            "xl/worksheets/sheet1.xml"
-    );
-
-    static Set<String> fillSet(String ... args) {
-        Set<String> tmp = new HashSet<>();
-        for (String arg : args) {
-            tmp.add(arg);
-        }
-        return Collections.unmodifiableSet(tmp);
-    }
+
     /** Serial version UID */
     private static final long serialVersionUID = 2891763938430295453L;
 
+    //this has to be > 100,000 to handle some of our test iworks files
+    @Field
+    int markLimit = 500_000;
+
     public MediaType detect(InputStream input, Metadata metadata)
             throws IOException {
         // Check if we have access to the document
@@ -141,32 +98,41 @@ public class ZipContainerDetector implements Detector {
             return MediaType.OCTET_STREAM;
         }
 
-        TemporaryResources tmp = new TemporaryResources();
+        byte[] prefix = new byte[1024]; // enough for all known archive formats
+        input.mark(1024);
+        int length = -1;
         try {
-            TikaInputStream tis = TikaInputStream.get(input, tmp);
+            length = IOUtils.read(input, prefix);
+        } finally {
+            input.reset();
+        }
 
-            byte[] prefix = new byte[1024]; // enough for all known formats
-            int length = tis.peek(prefix);
+        MediaType type = detectArchiveFormat(prefix, length);
 
-            MediaType type = detectArchiveFormat(prefix, length);
+        if (type == TIFF) {
+            return TIFF;
+        } else if (PackageParser.isZipArchive(type)) {
 
-            if (type == TIFF) {
-                return TIFF;
-            } else if (PackageParser.isZipArchive(type)
-                        && TikaInputStream.isTikaInputStream(input)) {
-                return detectZipFormat(tis);
-            } else if (!type.equals(MediaType.OCTET_STREAM)) {
-                return type;
-            } else {
-                return detectCompressorFormat(prefix, length);
+            if (TikaInputStream.isTikaInputStream(input)) {
+                TikaInputStream tis = TikaInputStream.cast(input);
+                if (tis.hasFile()) {
+                    return detectZipFormatOnFile(tis);
+                }
             }
-        } finally {
+
+            input.mark(markLimit);
             try {
-                tmp.dispose();
-            } catch (TikaException e) {
-                // ignore
+                return StreamingZipContainerDetector.detect(
+                        new BoundedInputStream(markLimit, input));
+            } finally {
+                input.reset();
             }
+        } else if (!type.equals(MediaType.OCTET_STREAM)) {
+            return type;
+        } else {
+            return detectCompressorFormat(prefix, length);
         }
+
     }
 
     private static MediaType detectCompressorFormat(byte[] prefix, int length) 
{
@@ -211,17 +177,18 @@ public class ZipContainerDetector implements Detector {
         }
     }
 
-    private static MediaType detectZipFormat(TikaInputStream tis) {
+    /**
+     * This will call TikaInputStream's getFile(). If there are no exceptions,
+     * it will place the ZipFile in TikaInputStream's openContainer and leave 
it
+     * open.
+     * @param tis
+     * @return
+     */
+    private static MediaType detectZipFormatOnFile(TikaInputStream tis) {
         try {
 
-            //try opc first because opening a package
-            //will not necessarily throw an exception for
-            //truncated files.
-            MediaType type = detectOPCBased(tis);
-            if (type != null) {
-                return type;
-            }
             ZipFile zip = new ZipFile(tis.getFile()); // TODO: hasFile()?
+            MediaType type = null;
             try {
                 type = detectOpenDocument(zip);
 
@@ -244,14 +211,17 @@ public class ZipContainerDetector implements Detector {
                     return type;
                 }
             } finally {
-                // TODO: shouldn't we record the open
-                // container so it can be later
-                // reused...?
-                // tis.setOpenContainer(zip);
-                try {
-                    zip.close();
-                } catch (IOException e) {
-                    // ignore
+                tis.setOpenContainer(zip);
+            }
+            //finally, test for opc based
+            //if it is not an opc based file, poi throws an exception
+            //and we close the zip
+            //if it is opc based, we put the pkg in TikaInputStream's open 
container
+            if (zip.getEntry("_rels/.rels") != null
+                    || zip.getEntry("[Content_Types].xml") != null) {
+                type = detectOPCBased(zip, tis);
+                if (type != null) {
+                    return type;
                 }
             }
         } catch (IOException e) {
@@ -281,57 +251,32 @@ public class ZipContainerDetector implements Detector {
         }
     }
 
-    private static MediaType detectOPCBased(TikaInputStream stream) {
+    //If this is not an OPCBased file, POI throws an exception and we close 
the zipFile.
+    private static MediaType detectOPCBased(ZipFile zipFile, TikaInputStream 
stream) {
+        //as of 4.x, POI throws an exception for non-POI OPC file types
+        //unless we change POI, we can't rely on POI for non-POI files
+        ZipEntrySource zipEntrySource = new ZipFileZipEntrySource(zipFile);
 
-        ZipEntrySource zipEntrySource = null;
-        try {
-            zipEntrySource = new ZipFileZipEntrySource(new 
ZipFile(stream.getFile()));
-        } catch (IOException e) {
-            return tryStreamingDetection(stream);
-        }
-
-        //if (zip.getEntry("_rels/.rels") != null
-        //  || zip.getEntry("[Content_Types].xml") != null) {
         // Use POI to open and investigate it for us
         //Unfortunately, POI can throw a RuntimeException...so we
         //have to catch that.
         OPCPackage pkg = null;
-        try {
-            pkg = OPCPackage.open(zipEntrySource);
-        } catch (SecurityException e) {
-            closeQuietly(zipEntrySource);
-            //TIKA-2571
-            throw e;
-        } catch (InvalidFormatException|RuntimeException e) {
-            closeQuietly(zipEntrySource);
-            return null;
-        }
-
         MediaType type = null;
         try {
-
-            // Is at an OOXML format?
+            pkg = OPCPackage.open(zipEntrySource);
             type = detectOfficeOpenXML(pkg);
-            if (type == null) {
-                // Is it XPS format?
-                type = detectXPSOPC(pkg);
-            }
-            if (type == null) {
-                // Is it an AutoCAD format?
-                type = detectAutoCADOPC(pkg);
-            }
-
         } catch (SecurityException e) {
             closeQuietly(zipEntrySource);
+            IOUtils.closeQuietly(zipFile);
             //TIKA-2571
             throw e;
-        } catch (RuntimeException e) {
+        } catch (InvalidFormatException|RuntimeException e) {
             closeQuietly(zipEntrySource);
+            IOUtils.closeQuietly(zipFile);
             return null;
         }
         //only set the open container if we made it here
         stream.setOpenContainer(pkg);
-        // We don't know what it is, sorry
         return type;
     }
 
@@ -360,7 +305,19 @@ public class ZipContainerDetector implements Detector {
         if (core.size() == 0) {
             core = pkg.getRelationshipsByType(VISIO_DOCUMENT);
         }
-        
+        if (core.size() == 0) {
+            core = pkg.getRelationshipsByType(XPS_DOCUMENT);
+            if (core.size() == 1) {
+                return MediaType.application("vnd.ms-xpsdocument");
+            }
+        }
+
+        if (core.size() == 0) {
+            core = pkg.getRelationshipsByType("http://schemas.autodesk.com/dwfx/2007/relationships/documentsequence");
+            if (core.size() == 1) {
+                return MediaType.parse("model/vnd.dwfx+xps");
+            }
+        }
         // If we didn't find a single core document of any type, skip detection
         if (core.size() != 1) {
             // Invalid OOXML Package received
@@ -389,19 +346,7 @@ public class ZipContainerDetector implements Detector {
         // Build the MediaType object and return
         return MediaType.parse(docType);
     }
-    /**
-     * Detects Open XML Paper Specification (XPS)
-     */
-    public static MediaType detectXPSOPC(OPCPackage pkg) {
-        PackageRelationshipCollection xps = 
-                pkg.getRelationshipsByType("http://schemas.microsoft.com/xps/2005/06/fixedrepresentation");
-        if (xps.size() == 1) {
-            return MediaType.application("vnd.ms-xpsdocument");
-        } else {
-            // Non-XPS Package received
-            return null;
-        }
-    }
+
     /**
      * Detects AutoCAD formats that live in OPC packaging
      */
@@ -534,95 +479,5 @@ public class ZipContainerDetector implements Detector {
         return null;
     }
 
-    private static MediaType tryStreamingDetection(TikaInputStream stream) {
-        Set<String> entryNames = new HashSet<>();
-        try (InputStream is = new FileInputStream(stream.getFile())) {
-            ZipArchiveInputStream zipArchiveInputStream = new 
ZipArchiveInputStream(is);
-            ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
-            while (zae != null) {
-                if (zae.isDirectory()) {
-                    zae = zipArchiveInputStream.getNextZipEntry();
-                    continue;
-                }
-                entryNames.add(zae.getName());
-                //we could also parse _rel/.rels, but if
-                // there isn't a valid content_types, then POI
-                //will throw an exception...Better to backoff to PKG
-                //than correctly identify a truncated
-                if (zae.getName().equals("[Content_Types].xml")) {
-                    MediaType mt = parseContentTypes(zipArchiveInputStream);
-                    if (mt != null) {
-                        return mt;
-                    }
-                    return TIKA_OOXML;
-                }
-                zae = zipArchiveInputStream.getNextZipEntry();
-            }
-        } catch (SecurityException e) {
-            throw e;
-        } catch (Exception e) {
-            //swallow
-        }
-        int hits = 0;
-        for (String s : OOXML_HINTS) {
-            if (entryNames.contains(s)) {
-                hits++;
-            }
-        }
-        if (hits > 2) {
-            return TIKA_OOXML;
-        }
-        return MediaType.APPLICATION_ZIP;
-    }
-
-    private static MediaType parseContentTypes(InputStream is) {
-        ContentTypeHandler contentTypeHandler = new ContentTypeHandler();
-        try {
-            XMLReaderUtils.parseSAX(is, contentTypeHandler, new 
ParseContext());
-        } catch (SecurityException e) {
-            throw e;
-        } catch (Exception e) {
-
-        }
-        return contentTypeHandler.mediaType;
-    }
-
-
-    private static class ContentTypeHandler extends DefaultHandler {
-        static Map<String, MediaType> CONTENT_TYPES = new 
ConcurrentHashMap<>();
-        static {
-            CONTENT_TYPES.put(XWPFRelation.DOCUMENT.getContentType(), DOCX);
-            CONTENT_TYPES.put(XWPFRelation.MACRO_DOCUMENT.getContentType(), 
DOCM);
-            CONTENT_TYPES.put(XWPFRelation.TEMPLATE.getContentType(), DOTX);
-
-            CONTENT_TYPES.put(XSSFRelation.WORKBOOK.getContentType(), XLSX);
-            CONTENT_TYPES.put(XSSFRelation.MACROS_WORKBOOK.getContentType(), 
XLSM);
-            CONTENT_TYPES.put(XSLFRelation.PRESENTATIONML.getContentType(), 
PPTX);
-            
CONTENT_TYPES.put(XSLFRelation.PRESENTATION_MACRO.getContentType(), PPTM);
-            
CONTENT_TYPES.put(XSLFRelation.PRESENTATIONML_TEMPLATE.getContentType(), POTX);
-        }
-
-        private MediaType mediaType = null;
-
-        @Override
-        public void startElement(String uri, String localName,
-                                 String name, Attributes attrs) throws 
SAXException {
-            for (int i = 0; i < attrs.getLength(); i++) {
-                String attrName = attrs.getLocalName(i);
-                if (attrName.equals("ContentType")) {
-                    String contentType = attrs.getValue(i);
-                    if (CONTENT_TYPES.containsKey(contentType)) {
-                        mediaType = CONTENT_TYPES.get(contentType);
-                        throw new StoppingEarlyException();
-                    }
-
-                }
-            }
-        }
-    }
-
-    private static class StoppingEarlyException extends SAXException {
-
-    }
 
 }
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/utils/ZipSalvager.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/utils/ZipSalvager.java
index 20ebf1b..f7cf08a 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/utils/ZipSalvager.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/utils/ZipSalvager.java
@@ -18,9 +18,9 @@ package org.apache.tika.parser.utils;
 
 import java.io.EOFException;
 import java.io.File;
-import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.file.Files;
 import java.util.zip.ZipException;
 
 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
@@ -38,52 +38,57 @@ public class ZipSalvager {
      * This streams the broken zip and rebuilds a new zip that
      * is at least a valid zip file.  The contents of the final stream
      * may be truncated, but the result should be a valid zip file.
-     *
+     * <p>
      * This does nothing fancy to fix the underlying broken zip.
      *
      * @param brokenZip
      * @param salvagedZip
      */
-    public static void salvageCopy(File brokenZip, File salvagedZip) {
+    public static void salvageCopy(InputStream brokenZip, File salvagedZip) {
         try (ZipArchiveOutputStream outputStream = new 
ZipArchiveOutputStream(salvagedZip)) {
-            try (InputStream is = new FileInputStream(brokenZip)) {
-                ZipArchiveInputStream zipArchiveInputStream = new 
ZipArchiveInputStream(is);
-                ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
-                while (zae != null) {
-                    try {
-                        if (!zae.isDirectory() && 
zipArchiveInputStream.canReadEntryData(zae)) {
-                            //create a new ZAE and copy over only the name so 
that
-                            //if there is bad info (e.g. CRC) in brokenZip's 
zae, that
-                            //won't be propagated or cause an exception
-                            outputStream.putArchiveEntry(new 
ZipArchiveEntry(zae.getName()));
-                            //this will copy an incomplete stream...so there
-                            //could be truncation of the xml/contents, but the 
zip file
-                            //should be intact.
-                            boolean successfullyCopied = false;
-                            try {
-                                IOUtils.copy(zipArchiveInputStream, 
outputStream);
-                                successfullyCopied = true;
-                            } catch (IOException e) {
-                                //this can hit a "truncated ZipFile" 
IOException
-                            }
-                            outputStream.flush();
-                            outputStream.closeArchiveEntry();
-                            if (!successfullyCopied) {
-                                break;
-                            }
+            ZipArchiveInputStream zipArchiveInputStream = new 
ZipArchiveInputStream(brokenZip);
+            ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
+            while (zae != null) {
+                try {
+                    if (!zae.isDirectory() && 
zipArchiveInputStream.canReadEntryData(zae)) {
+                        //create a new ZAE and copy over only the name so that
+                        //if there is bad info (e.g. CRC) in brokenZip's zae, 
that
+                        //won't be propagated or cause an exception
+                        outputStream.putArchiveEntry(new 
ZipArchiveEntry(zae.getName()));
+                        //this will copy an incomplete stream...so there
+                        //could be truncation of the xml/contents, but the zip 
file
+                        //should be intact.
+                        boolean successfullyCopied = false;
+                        try {
+                            IOUtils.copy(zipArchiveInputStream, outputStream);
+                            successfullyCopied = true;
+                        } catch (IOException e) {
+                            //this can hit a "truncated ZipFile" IOException
+                        }
+                        outputStream.flush();
+                        outputStream.closeArchiveEntry();
+                        if (!successfullyCopied) {
+                            break;
                         }
-                        zae = zipArchiveInputStream.getNextZipEntry();
-                    } catch (ZipException|EOFException e) {
-                        break;
                     }
-
+                    zae = zipArchiveInputStream.getNextZipEntry();
+                } catch (ZipException | EOFException e) {
+                    break;
                 }
-                outputStream.flush();
-                outputStream.finish();
-                outputStream.close();
+
             }
+            outputStream.flush();
+            outputStream.finish();
+
+
         } catch (IOException e) {
             LOG.warn("problem fixing zip", e);
         }
     }
+
+    public static void salvageCopy(File brokenZip, File salvagedZip) throws 
IOException {
+        try (InputStream is = Files.newInputStream(brokenZip.toPath())) {
+            salvageCopy(is, salvagedZip);
+        }
+    }
 }
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
index 1cf1874..1247cc1 100644
--- 
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
@@ -38,6 +38,7 @@ import java.nio.file.Paths;
 import java.util.List;
 import java.util.Random;
 
+import static junit.framework.TestCase.assertTrue;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.fail;
 
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
 
b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
index 2865442..bcba573 100644
--- 
a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
@@ -18,38 +18,178 @@
 package org.apache.tika.parser.pkg;
 
 
-import org.apache.commons.compress.compressors.CompressorStreamFactory;
-import org.apache.tika.TikaTest;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
-import org.junit.BeforeClass;
-import org.junit.Test;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
 
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
 import java.io.InputStream;
+import java.nio.file.Paths;
+import java.util.ArrayList;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
 
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.fail;
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.parser.odf.ODFParserTest;
+import org.junit.Ignore;
+import org.junit.Test;
 
 public class ZipContainerDetectorTest extends TikaTest {
+    private static MediaType ODT_TEXT = 
MediaType.application("vnd.oasis.opendocument.text");
+    private static MediaType TIFF = MediaType.image("tiff");
+    ZipContainerDetector zipContainerDetector = new ZipContainerDetector();
 
     @Test
     public void testTiffWorkaround() throws Exception {
         //TIKA-2591
-        ZipContainerDetector zipContainerDetector = new ZipContainerDetector();
         Metadata metadata = new Metadata();
         try (InputStream is = 
TikaInputStream.get(getResourceAsStream("/test-documents/testTIFF.tif"))) {
             MediaType mt = zipContainerDetector.detect(is, metadata);
-            assertEquals(MediaType.image("tiff"), mt);
+            assertEquals(TIFF, mt);
         }
         metadata = new Metadata();
         try (InputStream is = 
TikaInputStream.get(getResourceAsStream("/test-documents/testTIFF_multipage.tif")))
 {
             MediaType mt = zipContainerDetector.detect(is, metadata);
-            assertEquals(MediaType.image("tiff"), mt);
+            assertEquals(TIFF, mt);
+        }
+    }
+
+    @Test
+    public void testODT() throws Exception {
+        try (InputStream input = ODFParserTest.class.getResourceAsStream(
+                "/test-documents/testODFwithOOo3.odt")) {
+            Metadata metadata = new Metadata();
+            MediaType mt = zipContainerDetector.detect(input, metadata);
+            assertEquals(ODT_TEXT, mt);
+        }
+    }
+
+    @Test
+    public void testIWorks() throws Exception {
+        //have to have marklimit in ZipContainerDetector > 100000 for this to 
work
+        try (InputStream input = ODFParserTest.class.getResourceAsStream(
+                "/test-documents/testPages.pages")) {
+            Metadata metadata = new Metadata();
+            MediaType mt = zipContainerDetector.detect(input, metadata);
+            assertEquals("application/vnd.apple.pages", mt.toString());
+        }
+
+        InputStream is = 
getClass().getResourceAsStream("/org/apache/tika/parser/pkg/tika-config.xml");
+        assertNotNull(is);
+        TikaConfig tikaConfig = new TikaConfig(is);
+        try (InputStream input = ODFParserTest.class.getResourceAsStream(
+                "/test-documents/testPages.pages")) {
+            Metadata metadata = new Metadata();
+            MediaType mt = tikaConfig.getDetector().detect(input, metadata);
+            assertEquals("application/zip", mt.toString());
+        }
+    }
+
+    @Test
+    public void testXPS() throws Exception {
+        for (String file : new String[]{"testXPS_various.xps", "testPPT.xps"}) 
{
+            long start = System.currentTimeMillis();
+            try (InputStream input = ODFParserTest.class.getResourceAsStream(
+                    "/test-documents/" + file)) {
+                MediaType mediaType = 
StreamingZipContainerDetector.detect(input);
+                assertEquals(AbstractZipContainerDetector.XPS, mediaType);
+            }
+            try (TikaInputStream input = 
TikaInputStream.get(Paths.get(ODFParserTest.class.getResource(
+                    "/test-documents/" + file).toURI()))) {
+                MediaType mediaType = zipContainerDetector.detect(input, new 
Metadata());
+                assertEquals(AbstractZipContainerDetector.XPS, mediaType);
+            }
         }
+    }
 
+    @Test
+    @Ignore("to be used for offline timing tests")
+    public void timeDetection() throws Exception {
+        TikaConfig config = TikaConfig.getDefaultConfig();
+        Detector detector = config.getDetector();
+        MediaTypeRegistry registry = config.getMediaTypeRegistry();
+        List<File> zips = getTestZipBasedFiles(detector, registry);
+
+        Set<MediaType> mediaTypeSet = new HashSet<>();
+        long stream = 0;
+        long file = 0;
+        for (int i = 0; i < 20; i++) {
+            for (File z : zips) {
+                long start = System.currentTimeMillis();
+                try (InputStream is = new BufferedInputStream(new 
FileInputStream(z))) {
+                    MediaType mt = detector.detect(is, new Metadata());
+                    mediaTypeSet.add(mt);
+                }
+                stream += System.currentTimeMillis()-start;
+            }
+
+            for (File z : zips) {
+                long start = System.currentTimeMillis();
+                try (InputStream is = TikaInputStream.get(z)) {
+                    MediaType mt = detector.detect(is, new Metadata());
+                    mediaTypeSet.add(mt);
+                }
+                file += System.currentTimeMillis()-start;
+            }
+        }
+        System.out.println("stream: "+stream + " file: "+file);
+    }
+
+    @Test
+    @Ignore("to be used for offline timing tests")
+    public void timeParsing() throws Exception {
+        TikaConfig config = TikaConfig.getDefaultConfig();
+        Detector detector = config.getDetector();
+        MediaTypeRegistry registry = config.getMediaTypeRegistry();
+
+        List<File> zips = getTestZipBasedFiles(detector, registry);
+        System.out.println("zips size: "+zips.size());
+        Set<MediaType> mediaTypeSet = new HashSet<>();
+        long stream = 0;
+        long file = 0;
+        for (int i = 0; i < 20; i++) {
+            for (File z : zips) {
+                long start = System.currentTimeMillis();
+                try (InputStream is = new BufferedInputStream(new 
FileInputStream(z))) {
+                    getRecursiveMetadata(is, true);
+                }
+                stream += System.currentTimeMillis()-start;
+            }
+
+            for (File z : zips) {
+                long start = System.currentTimeMillis();
+                try (InputStream is = TikaInputStream.get(z)) {
+                    getRecursiveMetadata(is, true);
+                }
+                file += System.currentTimeMillis()-start;
+            }
+        }
+        System.out.println("stream: "+stream + " file: "+file);
+    }
+
+    //TODO -- we need to find a dwg+xps file for testing
+
+    private List<File> getTestZipBasedFiles(Detector detector, 
MediaTypeRegistry registry) throws Exception {
+        List<File> zips = new ArrayList<>();
+        for (File f : Paths.get(
+                
this.getClass().getResource("/test-documents").toURI()).toFile().listFiles()) {
+            try (InputStream is = TikaInputStream.get(f)) {
+                MediaType mt = detector.detect(is, new Metadata());
+                if (registry.isSpecializationOf(mt, 
MediaType.APPLICATION_ZIP)) {
+                    zips.add(f);
+                }
+            } catch (Exception e) {
+
+            }
+        }
+        return zips;
     }
 }
\ No newline at end of file
diff --git 
a/tika-parsers/src/test/resources/org/apache/tika/parser/pkg/tika-config.xml 
b/tika-parsers/src/test/resources/org/apache/tika/parser/pkg/tika-config.xml
new file mode 100644
index 0000000..97d7c7b
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/pkg/tika-config.xml
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers/>
+    <detectors>
+        <detector class="org.apache.tika.detect.DefaultDetector">
+            <detector-exclude 
class="org.apache.tika.parser.pkg.ZipContainerDetector"/>
+        </detector>
+        <detector class="org.apache.tika.parser.pkg.ZipContainerDetector">
+            <params>
+                <param name="markLimit" type="int">100000</param>
+            </params>
+        </detector>
+    </detectors>
+    <translator class="org.apache.tika.language.translate.DefaultTranslator"/>
+</properties>
\ No newline at end of file

[tika] 01/01: TIKA-2849 -- move to streaming detection of zip files if TikaInputStream doesn't already have a file.

Reply via email to