This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4395
in repository https://gitbox.apache.org/repos/asf/tika.git

commit aeef318398138c20398b81b32e17b76378f315a9
Author: tallison <[email protected]>
AuthorDate: Wed Apr 9 09:42:51 2025 -0400

    TIKA-4395 -- wip
---
 .../detect/microsoft/POIFSContainerDetector.java   |  2 +-
 .../detect/zip/DefaultZipContainerDetector.java    | 22 ++++++++++++++--------
 .../org/apache/tika/detect/zip/ZipParserTest.java  | 18 ++++++++++++++++++
 3 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
index 69be0361f..321a01f9a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
@@ -248,7 +248,7 @@ public class POIFSContainerDetector implements Detector {
     private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+");
 
     @Field
-    private int markLimit = 128 * 1024 * 1024;
+    private int markLimit = -1;
 
     /**
      * Internal detection of the specific kind of OLE2 document, based on the
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
index 9adfe9ba0..b30a73e14 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
@@ -21,6 +21,7 @@ import java.io.EOFException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.file.Files;
+import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
 
@@ -43,6 +44,7 @@ import org.apache.tika.config.LoadErrorHandler;
 import org.apache.tika.config.ServiceLoader;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.io.BoundedInputStream;
+import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -71,7 +73,7 @@ public class DefaultZipContainerDetector implements Detector {
     //this has to be > 100,000 to handle some of the iworks files
     //in our unit tests
     @Field
-    int markLimit = 16 * 1024 * 1024;
+    int markLimit = -1;//16 * 1024 * 1024;
 
     private transient ServiceLoader loader;
 
@@ -181,14 +183,18 @@ public class DefaultZipContainerDetector implements Detector {
 
             if (TikaInputStream.isTikaInputStream(input)) {
                 TikaInputStream tis = TikaInputStream.cast(input);
-                if (markLimit < 0) {
-                    tis.getFile();
-                }
-                if (tis.hasFile()) {
-                    return detectZipFormatOnFile(tis, metadata);
+                return detectZipFormatOnFile(tis, metadata);
+            } else {
+                if (markLimit >= 0) {
+                    return detectStreaming(input, metadata);
+                } else {
+                    try (TemporaryResources tmp = new TemporaryResources()) {
+                        try (TikaInputStream tis = TikaInputStream.get(input, tmp, new Metadata())) {
+                            return detectZipFormatOnFile(tis, metadata);
+                        }
+                    }
                 }
             }
-            return detectStreaming(input, metadata);
         } else if (!type.equals(MediaType.OCTET_STREAM)) {
             return type;
         } else {
@@ -207,7 +213,7 @@ public class DefaultZipContainerDetector implements Detector {
     private MediaType detectZipFormatOnFile(TikaInputStream tis, Metadata metadata) {
         ZipFile zip = null;
         try {
-            zip = ZipFile.builder().setFile(tis.getFile()).get(); // TODO: hasFile()?
+            zip = ZipFile.builder().setFile(tis.getFile()).get();
 
             for (ZipContainerDetector zipDetector : getDetectors()) {
                 MediaType type = zipDetector.detect(zip, tis);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipParserTest.java
index 2ed4c3572..14c0bb5a4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipParserTest.java
@@ -18,8 +18,11 @@ package org.apache.tika.detect.zip;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
+import java.io.IOException;
+import java.io.InputStream;
 import java.util.List;
 
+import org.apache.commons.io.IOUtils;
 import org.junit.jupiter.api.Test;
 
 import org.apache.tika.TikaTest;
@@ -44,4 +47,19 @@ public class ZipParserTest extends TikaTest {
         List<Metadata> metadataList = getRecursiveMetadata("testJAR.jar");
         assertEquals("application/java-archive", metadataList.get(0).get(HttpHeaders.CONTENT_TYPE));
     }
+
+    @Test
+    public void testStreaming() throws Exception {
+        long len = getLength("testJAR.jar");
+        System.out.println(len);
+        DefaultZipContainerDetector detector = new DefaultZipContainerDetector();
+        //detector.setMarkLimit(100);
+        try (InputStream is = ZipParserTest.class.getResourceAsStream("/test-documents/testJAR.jar")) {
+            System.out.println(detector.detect(is, new Metadata()));
+        }
+    }
+
+    private long getLength(String fileName) throws IOException {
+        return IOUtils.toByteArray(ZipParserTest.class.getResourceAsStream("/test-documents/" + fileName)).length;
+    }
 }

Reply via email to