This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_3x by this push:
     new ca4e0679d [TIKA-4476] Force audio/mp4 where mp4 container is exclusively sound (#2315)
ca4e0679d is described below

commit ca4e0679dc4b74a7b05df7b6d89cf04a01a14cbf
Author: tombrisland <[email protected]>
AuthorDate: Wed Sep 10 17:33:15 2025 +0100

    [TIKA-4476] Force audio/mp4 where mp4 container is exclusively sound (#2315)
    
    (cherry picked from commit ff1b8e83c09c0bedb8a7a6af477dab0a88407f1f)
---
 .../java/org/apache/tika/parser/mp4/MP4Parser.java |  39 +++++++++++++++++--
 .../org/apache/tika/parser/mp4/MP4ParserTest.java  |  42 ++++++++++++++++++++-
 .../resources/test-documents/testMP4AudioOnly.mp4  | Bin 0 -> 35236 bytes
 3 files changed, 76 insertions(+), 5 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
index 5e1db299a..0b7e30b2a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
@@ -73,6 +73,8 @@ public class MP4Parser implements Parser {
             Collections.unmodifiableSet(typesMap.keySet());
 
     private static final MediaType APPLICATION_MP4 = MediaType.application("mp4");
+    private static final MediaType AUDIO_MP4 = MediaType.audio("mp4");
+
     private static final int MAX_ERROR_MESSAGES = 100;
     static {
         // All types should be 4 bytes long, space padded as needed
@@ -111,10 +113,15 @@ public class MP4Parser implements Parser {
             }
             //TODO -- figure out how to get IOExceptions out of boxhandler. Mp4Reader
             //currently swallows IOExceptions.
-            Set<String> errorMessages =
-                    processMp4Directories(
-                            mp4Metadata.getDirectoriesOfType(Mp4Directory.class),
-                    metadata);
+            final Collection<Mp4Directory> mp4Directories =
+                    mp4Metadata.getDirectoriesOfType(Mp4Directory.class);
+            final Set<String> errorMessages = processMp4Directories(mp4Directories, metadata);
+
+            // Despite the brand, if we ONLY have audio streams with no video
+            if (isAudioOnly(mp4Directories)) {
+                // Mark this as audio/mp4
+                metadata.set(Metadata.CONTENT_TYPE, AUDIO_MP4.toString());
+            }
 
             for (String m : errorMessages) {
                 metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, m);
@@ -160,6 +167,30 @@ public class MP4Parser implements Parser {
         }
     }
 
+    /**
+     * Check we have only audio with no video metadata.
+     * <p>
+     * Other non-video metadata can exist - as long as there's at least one {@link Mp4SoundDirectory}.
+     *
+     * @param directories from MP4 file
+     * @return whether we can classify the file audio/mp4
+     */
+    static boolean isAudioOnly(final Collection<Mp4Directory> directories) {
+        boolean containsSound = false;
+
+        for (final Mp4Directory directory : directories) {
+            if (directory instanceof Mp4VideoDirectory) {
+                // Fail fast as this isn't audio only
+                return false;
+            }
+            if (directory instanceof Mp4SoundDirectory) {
+                containsSound = true;
+            }
+        }
+
+        return containsSound;
+    }
+
     private void processMp4SoundDirectory(Mp4SoundDirectory mp4SoundDirectory,
                                         Metadata metadata) {
         addInt(mp4SoundDirectory, metadata, Mp4SoundDirectory.TAG_AUDIO_SAMPLE_RATE,
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java
index 951c59543..0b36e5ee4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java
@@ -17,11 +17,19 @@
 package org.apache.tika.parser.mp4;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
 
 import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
-
+import com.drew.metadata.mp4.Mp4Directory;
+import com.drew.metadata.mp4.media.Mp4MetaDirectory;
+import com.drew.metadata.mp4.media.Mp4SoundDirectory;
+import com.drew.metadata.mp4.media.Mp4VideoDirectory;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.Timeout;
 import org.xml.sax.ContentHandler;
@@ -35,6 +43,7 @@ import org.apache.tika.metadata.XMPDM;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.BodyContentHandler;
 
+
 /**
  * Test case for parsing mp4 files.
  */
@@ -129,6 +138,37 @@ public class MP4ParserTest extends TikaTest {
         assertEquals("M4A", r.metadata.get(XMPDM.AUDIO_COMPRESSOR));
     }
 
+    @Test
+    public void testAudioOnlyMP4() throws Exception {
+        final XMLResult xmlResult = getXML("testMP4AudioOnly.mp4");
+        final Metadata metadata = xmlResult.metadata;
+
+        assertEquals("audio/mp4", metadata.get(Metadata.CONTENT_TYPE));
+    }
+
+    @Test
+    public void testAudioOnlyCheck() {
+        assertTrue(MP4Parser.isAudioOnly(List.of(new Mp4SoundDirectory())));
+    }
+
+    @Test
+    public void testMetadataWithSoundConsideredAudio() {
+        assertTrue(MP4Parser.isAudioOnly(List.of(new Mp4SoundDirectory(), new Mp4MetaDirectory())));
+    }
+
+    @Test
+    public void testVideoDirectoriesNotConsideredAudio() {
+        final Collection<Mp4Directory> directories =
+                List.of(new Mp4VideoDirectory(), new Mp4VideoDirectory(), new Mp4SoundDirectory());
+
+        assertFalse(MP4Parser.isAudioOnly(directories));
+    }
+
+    @Test
+    public void testNoDirectoriesNotConsideredAudio() {
+        assertFalse(MP4Parser.isAudioOnly(Collections.emptyList()));
+    }
+
 /*
 
     @Test
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testMP4AudioOnly.mp4 b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testMP4AudioOnly.mp4
new file mode 100644
index 000000000..bdb51e220
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/resources/test-documents/testMP4AudioOnly.mp4 differ

Reply via email to