This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4395
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 9f99c9a64bca53fd145fe632f890de234a097cc0
Author: tallison <[email protected]>
AuthorDate: Thu Apr 10 08:17:47 2025 -0400

    TIKA-4395 -- improve handling and logging of container detection
---
 .../org/apache/tika/MultiThreadedTikaTest.java     |  1 -
 .../detect/microsoft/POIFSContainerDetector.java   | 23 ++++++++++++++--------
 .../src/test/resources/log4j2.xml                  |  3 +++
 .../detect/zip/DefaultZipContainerDetector.java    |  4 ++--
 .../tika/config/TikaConfigSerializerTest.java      |  2 +-
 .../tika/detect/TestContainerAwareDetector.java    |  4 ++++
 6 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java b/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java
index fd3f381d4..ee87f9bf7 100644
--- a/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java
@@ -109,7 +109,6 @@ public class MultiThreadedTikaTest extends TikaTest {
                 baseline.put(f, new Extract(metadataList));
 
             } catch (Exception e) {
-                e.printStackTrace();
                 //swallow
             }
         }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
index 2285630fb..f0605a78d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.detect.microsoft;
 
+import static org.apache.tika.mime.MediaType.OCTET_STREAM;
 import static org.apache.tika.mime.MediaType.application;
 import static org.apache.tika.mime.MediaType.image;
 
@@ -602,19 +603,26 @@ public class POIFSContainerDetector implements Detector {
             return MediaType.OCTET_STREAM;
         }
 
-        if (! isOleHeader(input)) {
-            return MediaType.OCTET_STREAM;
-        }
-
         TikaInputStream tis = TikaInputStream.cast(input);
-        if (tis == null) {
-            LOG.warn("POIFSContainerDetector requires a TikaInputStream for precise detection.");
+        if (tis != null) {
+            return handleTikaStream(tis, metadata);
+        }
+        if (isOleHeader(input)) {
             return OLE;
         }
+        return MediaType.OCTET_STREAM;
+    }
 
+    private MediaType handleTikaStream(TikaInputStream tis, Metadata metadata) throws IOException {
+        //try for an open container
         Set<String> names = tryOpenContainerOnTikaInputStream(tis, metadata);
 
-        // We can only detect the exact type when given a TikaInputStream
+        //if that didn't work, confirm the bytes are OLE
+        if (names == null && ! isOleHeader(tis)) {
+            return OCTET_STREAM;
+        }
+
+        // If OLE, spool to disk
         if (names == null) {
             // spool to disk and try detection
             names = getTopLevelNames(tis);
@@ -625,7 +633,6 @@ public class POIFSContainerDetector implements Detector {
                 tis.getOpenContainer() instanceof POIFSFileSystem) {
             return detect(names, ((POIFSFileSystem) tis.getOpenContainer()).getRoot());
         } else {
-            //can we actually get here?
             return detect(names, null);
         }
     }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/log4j2.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/log4j2.xml
index 1e9327e01..d609d7631 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/log4j2.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/log4j2.xml
@@ -36,5 +36,8 @@
     <Logger name="org.apache.poi" level="ERROR" additivity="false">
       <AppenderRef ref="Console"/>
     </Logger>
+    <Logger name="org.apache.tika.detect.microsoft.POIFSContainerDetector" level="ERROR" additivity="false">
+      <AppenderRef ref="Console"/>
+    </Logger>
   </Loggers>
 </Configuration>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
index 5b6567308..2c2669b85 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
@@ -262,9 +262,9 @@ public class DefaultZipContainerDetector implements Detector {
         }
         //problem opening zip file (truncated?)
         try (InputStream is = new BufferedInputStream(Files.newInputStream(tis.getPath()))) {
-            return detectStreaming(is, metadata);
+            return detectStreaming(is, metadata, false);
         } catch (IOException e) {
-                //swallow
+            //swallow
         }
         return MediaType.APPLICATION_ZIP;
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
index d7313db6f..9ba4b4ab6 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
@@ -46,7 +46,7 @@ public class TikaConfigSerializerTest {
         assertContains(encodingNeedle, xml);
 
         String detectorNeedle = "<detector class=\"org.apache.tika.detect.zip.DefaultZipContainerDetector\">" +
-                " <params> <param name=\"markLimit\" type=\"int\">16777216</param> </params>";
+                " <params> <param name=\"markLimit\" type=\"int\">-1</param> </params>";
         assertContains(detectorNeedle, xml);
 
         String parserNeedle = "<parser class=\"org.apache.tika.parser.pdf.PDFParser\">" +
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
index d35df67bf..cb71b925e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
@@ -65,6 +65,10 @@ public class TestContainerAwareDetector extends MultiThreadedTikaTest {
     private final StreamingZipContainerDetector streamingZipDetector =
             new StreamingZipContainerDetector();
 
+    TestContainerAwareDetector() {
+        streamingZipDetector.setMarkLimit(128 * 1024 * 1024);
+    }
+
     @AfterEach
     public void tearDown() throws TikaException {
         //make sure to reset pool size because it is being randomly resized during the tests

Reply via email to