This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit b1ef25c424277923f72c9cbfec1521586776dcba
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Thu Nov 14 15:04:33 2024 +0100
TIKA-4350 HTML snippet containing <iframe> as root element erroneously recognized as application/xml (#2045)
(cherry picked from commit bd878d3733853d0a9c6ffef32e18fba61f505760)
---
tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml | 2 ++
tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java | 2 ++
.../test/resources/org/apache/tika/mime/test-html-snippet-iframe.jsp | 2 ++
3 files changed, 6 insertions(+)
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 62e3d56ca..ff0312839 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -7298,6 +7298,8 @@
<root-XML localName="SCRIPT"/>
<root-XML localName="frameset"/>
<root-XML localName="FRAMESET"/>
+ <root-XML localName="iframe"/>
+ <root-XML localName="IFRAME"/>
<magic priority="60">
<match value="(?i)<(html|head|body|title|div)[ >]" type="regex" offset="0"/>
<match value="(?i)<h[123][ >]" type="regex" offset="0"/>
diff --git a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
index af42cf9b3..6f0c61bfb 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
@@ -73,6 +73,8 @@ public class MimeDetectionTest {
testFile("text/html", "testlargerbuffer.html");
// test fragment of HTML with <div> (TIKA-1102)
testFile("text/html", "htmlfragment");
+ // test fragment of HTML with <iframe> and potentially misleading file suffix
+ testFile("text/html", "test-html-snippet-iframe.jsp");
// test binary CGM detection (TIKA-1170)
testFile("image/cgm", "plotutils-bin-cgm-v3.cgm");
// test HTML detection of malformed file, previously identified as image/cgm (TIKA-1170)
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/test-html-snippet-iframe.jsp b/tika-core/src/test/resources/org/apache/tika/mime/test-html-snippet-iframe.jsp
new file mode 100644
index 000000000..2681fecdc
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/mime/test-html-snippet-iframe.jsp
@@ -0,0 +1,2 @@
+<!-- this is a comment: https://www.example.org/path/file.pdf -->
+ <iframe src='/path/file.pdf' width='100%' height='100%' target='_blank'>