This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_3x in repository https://gitbox.apache.org/repos/asf/tika.git
commit b66d845fb3171468bbaca8ea40bcf2573f9d9ab9 Author: tallison <[email protected]> AuthorDate: Wed Sep 10 13:39:08 2025 -0400 TIKA-1180 -- fixes for pr #2251 --- .../org/apache/tika/detect/MatroskaDetector.java | 17 +++++-- .../apache/tika/detect/MatroskaDetectorTest.java | 55 +++++++++++++++++++-- .../test/resources/test-documents/sample-mkv.noext | Bin 0 -> 5759 bytes .../resources/test-documents/sample-webm.noext | Bin 0 -> 16234 bytes .../src/test/resources/test-documents/testMKV.mkv | Bin .../apache/tika/detect/TestDetectorLoading.java | 8 +-- 6 files changed, 67 insertions(+), 13 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/detect/MatroskaDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/detect/MatroskaDetector.java index 27aa17ba1..b2b25f606 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/detect/MatroskaDetector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/detect/MatroskaDetector.java @@ -19,7 +19,8 @@ package org.apache.tika.detect; import java.io.IOException; import java.io.InputStream; -import java.util.Objects; + +import org.apache.commons.io.IOUtils; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; @@ -51,12 +52,18 @@ public class MatroskaDetector implements Detector { */ @Override public MediaType detect(InputStream input, Metadata metadata) throws IOException { - Objects.requireNonNull(input, "input stream must not be null"); + if (input == null) { + return MediaType.OCTET_STREAM; + } input.mark(64); byte[] header = new byte[64]; - int bytesRead = input.read(header); - input.reset(); + int bytesRead = -1; + try { + bytesRead = IOUtils.read(input, header, 0, 64); + } finally { + input.reset(); + } if (bytesRead < EBML_HEADER.length) { return MediaType.OCTET_STREAM; @@ -85,4 +92,4 @@ public class MatroskaDetector implements Detector { return MediaType.OCTET_STREAM; } -} \ No newline at end of file +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/detect/MatroskaDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/detect/MatroskaDetectorTest.java index 32ed8daa1..a893055a0 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/detect/MatroskaDetectorTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/detect/MatroskaDetectorTest.java @@ -1,13 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an "AS IS" + * BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ package org.apache.tika.detect; + import static org.junit.jupiter.api.Assertions.assertEquals; import java.io.IOException; import java.io.InputStream; +import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; +import org.junit.jupiter.api.Test; + import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; -import org.junit.jupiter.api.Test; public class MatroskaDetectorTest { @@ -19,8 +38,36 @@ public class MatroskaDetectorTest { @Test public void testDetectMKV() throws IOException { - assertEquals(MediaType.video("x-matroska"), - detector.detect(getResourceAsStream("/test-documents/sample.nonexist"), + assertEquals(MediaType.application("x-matroska"), + detector.detect(getResourceAsStream("/test-documents/sample-mkv.noext"), + new Metadata())); + + assertEquals(MediaType.application("x-matroska"), + detector.detect(getResourceAsStream("/test-documents/testMKV.mkv"), + new Metadata())); + + + } + + @Test + public void testDetectWEBM() throws IOException { + assertEquals(MediaType.video("webm"), + detector.detect(getResourceAsStream("/test-documents/sample-webm.noext"), new Metadata())); } -} \ No newline at end of file + + @Test + public void testNullAndShort() throws Exception { + assertEquals(MediaType.OCTET_STREAM, + detector.detect(null, new Metadata())); + + byte[] bytes = new byte[10]; + assertEquals(MediaType.OCTET_STREAM, + detector.detect(UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get(), new Metadata())); + + bytes = new byte[0]; + assertEquals(MediaType.OCTET_STREAM, + detector.detect(UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get(), new Metadata())); + + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/sample-mkv.noext b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/sample-mkv.noext new file mode 100644 index 000000000..55dc0ca79 Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/sample-mkv.noext differ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/sample-webm.noext b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/sample-webm.noext new file mode 100644 index 000000000..16324824b Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/sample-webm.noext differ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testMKV.mkv b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testMKV.mkv similarity index 100% rename from tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testMKV.mkv rename to tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testMKV.mkv diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestDetectorLoading.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestDetectorLoading.java index 82a9e7df9..28b9b0dd4 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestDetectorLoading.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestDetectorLoading.java @@ -32,13 +32,13 @@ public class TestDetectorLoading { //integration test Detector detector = TikaConfig.getDefaultConfig().getDetector(); List<Detector> detectors = ((CompositeDetector) detector).getDetectors(); - assertEquals(7, detectors.size()); + assertEquals(8, detectors.size()); assertEquals("org.gagravarr.tika.OggDetector", detectors.get(0).getClass().getName()); assertEquals("org.apache.tika.detect.gzip.GZipSpecializationDetector", - detectors.get(2).getClass().getName()); + detectors.get(3).getClass().getName()); assertEquals("org.apache.tika.detect.microsoft.POIFSContainerDetector", - detectors.get(3).getClass().getName()); - assertEquals("org.apache.tika.mime.MimeTypes", detectors.get(6).getClass().getName()); + detectors.get(4).getClass().getName()); + assertEquals("org.apache.tika.mime.MimeTypes", detectors.get(7).getClass().getName()); } }
