This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_3x by this push:
new ca4e0679d [TIKA-4476] Force audio/mp4 where mp4 container is
exclusively sound (#2315)
ca4e0679d is described below
commit ca4e0679dc4b74a7b05df7b6d89cf04a01a14cbf
Author: tombrisland <[email protected]>
AuthorDate: Wed Sep 10 17:33:15 2025 +0100
[TIKA-4476] Force audio/mp4 where mp4 container is exclusively sound (#2315)
(cherry picked from commit ff1b8e83c09c0bedb8a7a6af477dab0a88407f1f)
---
.../java/org/apache/tika/parser/mp4/MP4Parser.java | 39 +++++++++++++++++--
.../org/apache/tika/parser/mp4/MP4ParserTest.java | 42 ++++++++++++++++++++-
.../resources/test-documents/testMP4AudioOnly.mp4 | Bin 0 -> 35236 bytes
3 files changed, 76 insertions(+), 5 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
index 5e1db299a..0b7e30b2a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
@@ -73,6 +73,8 @@ public class MP4Parser implements Parser {
Collections.unmodifiableSet(typesMap.keySet());
private static final MediaType APPLICATION_MP4 =
MediaType.application("mp4");
+ private static final MediaType AUDIO_MP4 = MediaType.audio("mp4");
+
private static final int MAX_ERROR_MESSAGES = 100;
static {
// All types should be 4 bytes long, space padded as needed
@@ -111,10 +113,15 @@ public class MP4Parser implements Parser {
}
//TODO -- figure out how to get IOExceptions out of boxhandler. Mp4Reader
//currently swallows IOExceptions.
- Set<String> errorMessages =
- processMp4Directories(
- mp4Metadata.getDirectoriesOfType(Mp4Directory.class),
- metadata);
+ final Collection<Mp4Directory> mp4Directories =
+ mp4Metadata.getDirectoriesOfType(Mp4Directory.class);
+ final Set<String> errorMessages = processMp4Directories(mp4Directories, metadata);
+
+ // Despite the brand, if we ONLY have audio streams with no video
+ if (isAudioOnly(mp4Directories)) {
+ // Mark this as audio/mp4
+ metadata.set(Metadata.CONTENT_TYPE, AUDIO_MP4.toString());
+ }
for (String m : errorMessages) {
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, m);
@@ -160,6 +167,30 @@ public class MP4Parser implements Parser {
}
}
+ /**
+ * Check we have only audio with no video metadata.
+ * <p>
+ * Other non-video metadata can exist - as long as there's at least one {@link Mp4SoundDirectory}.
+ *
+ * @param directories from MP4 file
+ * @return whether we can classify the file audio/mp4
+ */
+ static boolean isAudioOnly(final Collection<Mp4Directory> directories) {
+ boolean containsSound = false;
+
+ for (final Mp4Directory directory : directories) {
+ if (directory instanceof Mp4VideoDirectory) {
+ // Fail fast as this isn't audio only
+ return false;
+ }
+ if (directory instanceof Mp4SoundDirectory) {
+ containsSound = true;
+ }
+ }
+
+ return containsSound;
+ }
+
private void processMp4SoundDirectory(Mp4SoundDirectory mp4SoundDirectory,
Metadata metadata) {
addInt(mp4SoundDirectory, metadata, Mp4SoundDirectory.TAG_AUDIO_SAMPLE_RATE,
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java
index 951c59543..0b36e5ee4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java
@@ -17,11 +17,19 @@
package org.apache.tika.parser.mp4;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
import java.util.HashSet;
+import java.util.List;
import java.util.Set;
-
+import com.drew.metadata.mp4.Mp4Directory;
+import com.drew.metadata.mp4.media.Mp4MetaDirectory;
+import com.drew.metadata.mp4.media.Mp4SoundDirectory;
+import com.drew.metadata.mp4.media.Mp4VideoDirectory;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.Timeout;
import org.xml.sax.ContentHandler;
@@ -35,6 +43,7 @@ import org.apache.tika.metadata.XMPDM;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
+
/**
* Test case for parsing mp4 files.
*/
@@ -129,6 +138,37 @@ public class MP4ParserTest extends TikaTest {
assertEquals("M4A", r.metadata.get(XMPDM.AUDIO_COMPRESSOR));
}
+ @Test
+ public void testAudioOnlyMP4() throws Exception {
+ final XMLResult xmlResult = getXML("testMP4AudioOnly.mp4");
+ final Metadata metadata = xmlResult.metadata;
+
+ assertEquals("audio/mp4", metadata.get(Metadata.CONTENT_TYPE));
+ }
+
+ @Test
+ public void testAudioOnlyCheck() {
+ assertTrue(MP4Parser.isAudioOnly(List.of(new Mp4SoundDirectory())));
+ }
+
+ @Test
+ public void testMetadataWithSoundConsideredAudio() {
+ assertTrue(MP4Parser.isAudioOnly(List.of(new Mp4SoundDirectory(), new Mp4MetaDirectory())));
+ }
+
+ @Test
+ public void testVideoDirectoriesNotConsideredAudio() {
+ final Collection<Mp4Directory> directories =
+ List.of(new Mp4VideoDirectory(), new Mp4VideoDirectory(), new Mp4SoundDirectory());
+
+ assertFalse(MP4Parser.isAudioOnly(directories));
+ }
+
+ @Test
+ public void testNoDirectoriesNotConsideredAudio() {
+ assertFalse(MP4Parser.isAudioOnly(Collections.emptyList()));
+ }
+
/*
@Test
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testMP4AudioOnly.mp4 b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testMP4AudioOnly.mp4
new file mode 100644
index 000000000..bdb51e220
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testMP4AudioOnly.mp4 differ