This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_3x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 11a274a41b68a38a198a46fabcf711278a1bb5ae Author: Siraj A. <[email protected]> AuthorDate: Wed Sep 10 13:12:03 2025 -0400 TIKA-1180: Add MatroskaDetector for improved MKV/WEBM detection (#2251) (cherry picked from commit 2b3dd510f3702ea005f2916122c24d66d203743a) --- .../org/apache/tika/detect/MatroskaDetector.java | 88 ++++++++++++++++++++++ .../services/org.apache.tika.detect.Detector | 16 ++++ .../apache/tika/detect/MatroskaDetectorTest.java | 26 +++++++ 3 files changed, 130 insertions(+) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/detect/MatroskaDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/detect/MatroskaDetector.java new file mode 100644 index 000000000..27aa17ba1 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/detect/MatroskaDetector.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an "AS IS" + * BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +package org.apache.tika.detect; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Objects; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; + +/** + * Detector for Matroska (MKV and WEBM) files based on the EBML header. + */ +public class MatroskaDetector implements Detector { + + /** For serialization compatibility. */ + private static final long serialVersionUID = 1L; + + private static final MediaType MATROSKA = + MediaType.application("x-matroska"); + + private static final MediaType WEBM = + MediaType.video("webm"); + + private static final byte[] EBML_HEADER = + new byte[]{0x1A, 0x45, (byte) 0xDF, (byte) 0xA3}; + + /** + * Detects the media type of the input stream by inspecting EBML headers. + * + * @param input the input stream + * @param metadata the metadata to populate + * @return detected MediaType (WEBM, Matroska, or OCTET_STREAM) + * @throws IOException if an I/O error occurs + */ + @Override + public MediaType detect(InputStream input, Metadata metadata) throws IOException { + Objects.requireNonNull(input, "input stream must not be null"); + input.mark(64); + + byte[] header = new byte[64]; + int bytesRead = input.read(header); + input.reset(); + + if (bytesRead < EBML_HEADER.length) { + return MediaType.OCTET_STREAM; + } + + for (int i = 0; i < EBML_HEADER.length; i++) { + if (header[i] != EBML_HEADER[i]) { + return MediaType.OCTET_STREAM; + } + } + + for (int i = 4; i < bytesRead - 4; i++) { + if (header[i] == 'w' + && header[i + 1] == 'e' + && header[i + 2] == 'b' + && header[i + 3] == 'm') { + return WEBM; + } + if (header[i] == 'm' + && header[i + 1] == 'a' + && header[i + 2] == 't' + && header[i + 3] == 'r') { + return MATROSKA; + } + } + + return MediaType.OCTET_STREAM; + } +} \ No newline at end of file diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector new file mode 100644 index 000000000..1428c6c4d --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.tika.detect.MatroskaDetector diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/detect/MatroskaDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/detect/MatroskaDetectorTest.java new file mode 100644 index 000000000..32ed8daa1 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/detect/MatroskaDetectorTest.java @@ -0,0 +1,26 @@ +package org.apache.tika.detect; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.junit.jupiter.api.Test; + +public class MatroskaDetectorTest { + + private final MatroskaDetector detector = new MatroskaDetector(); + + private InputStream getResourceAsStream(String resourcePath) { + return this.getClass().getResourceAsStream(resourcePath); + } + + @Test + public void testDetectMKV() throws IOException { + assertEquals(MediaType.video("x-matroska"), + detector.detect(getResourceAsStream("/test-documents/sample.nonexist"), + new Metadata())); + } +} \ No newline at end of file
