This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4395 in repository https://gitbox.apache.org/repos/asf/tika.git
commit aeef318398138c20398b81b32e17b76378f315a9 Author: tallison <[email protected]> AuthorDate: Wed Apr 9 09:42:51 2025 -0400 TIKA-4395 -- wip --- .../detect/microsoft/POIFSContainerDetector.java | 2 +- .../detect/zip/DefaultZipContainerDetector.java | 22 ++++++++++++++-------- .../org/apache/tika/detect/zip/ZipParserTest.java | 18 ++++++++++++++++++ 3 files changed, 33 insertions(+), 9 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java index 69be0361f..321a01f9a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java @@ -248,7 +248,7 @@ public class POIFSContainerDetector implements Detector { private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+"); @Field - private int markLimit = 128 * 1024 * 1024; + private int markLimit = -1; /** * Internal detection of the specific kind of OLE2 document, based on the diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java index 9adfe9ba0..b30a73e14 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java @@ -21,6 +21,7 @@ import java.io.EOFException; import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; +import java.nio.file.Path; import java.util.ArrayList; import java.util.List; @@ -43,6 +44,7 @@ import org.apache.tika.config.LoadErrorHandler; import org.apache.tika.config.ServiceLoader; import org.apache.tika.detect.Detector; import org.apache.tika.io.BoundedInputStream; +import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; @@ -71,7 +73,7 @@ public class DefaultZipContainerDetector implements Detector { //this has to be > 100,000 to handle some of the iworks files //in our unit tests @Field - int markLimit = 16 * 1024 * 1024; + int markLimit = -1;//16 * 1024 * 1024; private transient ServiceLoader loader; @@ -181,14 +183,18 @@ public class DefaultZipContainerDetector implements Detector { if (TikaInputStream.isTikaInputStream(input)) { TikaInputStream tis = TikaInputStream.cast(input); - if (markLimit < 0) { - tis.getFile(); - } - if (tis.hasFile()) { - return detectZipFormatOnFile(tis, metadata); + return detectZipFormatOnFile(tis, metadata); + } else { + if (markLimit >= 0) { + return detectStreaming(input, metadata); + } else { + try (TemporaryResources tmp = new TemporaryResources()) { + try (TikaInputStream tis = TikaInputStream.get(input, tmp, new Metadata())) { + return detectZipFormatOnFile(tis, metadata); + } + } } } - return detectStreaming(input, metadata); } else if (!type.equals(MediaType.OCTET_STREAM)) { return type; } else { @@ -207,7 +213,7 @@ public class DefaultZipContainerDetector implements Detector { private MediaType detectZipFormatOnFile(TikaInputStream tis, Metadata metadata) { ZipFile zip = null; try { - zip = ZipFile.builder().setFile(tis.getFile()).get(); // TODO: hasFile()? + zip = ZipFile.builder().setFile(tis.getFile()).get(); for (ZipContainerDetector zipDetector : getDetectors()) { MediaType type = zipDetector.detect(zip, tis); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipParserTest.java index 2ed4c3572..14c0bb5a4 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/test/java/org/apache/tika/detect/zip/ZipParserTest.java @@ -18,8 +18,11 @@ package org.apache.tika.detect.zip; import static org.junit.jupiter.api.Assertions.assertEquals; +import java.io.IOException; +import java.io.InputStream; import java.util.List; +import org.apache.commons.io.IOUtils; import org.junit.jupiter.api.Test; import org.apache.tika.TikaTest; @@ -44,4 +47,19 @@ public class ZipParserTest extends TikaTest { List<Metadata> metadataList = getRecursiveMetadata("testJAR.jar"); assertEquals("application/java-archive", metadataList.get(0).get(HttpHeaders.CONTENT_TYPE)); } + + @Test + public void testStreaming() throws Exception { + long len = getLength("testJAR.jar"); + System.out.println(len); + DefaultZipContainerDetector detector = new DefaultZipContainerDetector(); + //detector.setMarkLimit(100); + try (InputStream is = ZipParserTest.class.getResourceAsStream("/test-documents/testJAR.jar")) { + System.out.println(detector.detect(is, new Metadata())); + } + } + + private long getLength(String fileName) throws IOException { + return IOUtils.toByteArray(ZipParserTest.class.getResourceAsStream("/test-documents/" + fileName)).length; + } }
