This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4419-v2
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 88c9946e78e2889fbee397847773d03ced935779
Author: tallison <[email protected]>
AuthorDate: Mon May 19 10:46:07 2025 -0400

    TIKA-4419 -- downgrade jsoup
---
 tika-parent/pom.xml                                            |  2 +-
 .../test/java/org/apache/tika/parser/html/HtmlParserTest.java  | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 6064bc5e0..042ce28a4 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -393,7 +393,7 @@
     <jhighlight.version>1.1.0</jhighlight.version>
     <jna.version>5.17.0</jna.version>
     <json.simple.version>1.1.1</json.simple.version>
-    <jsoup.version>1.20.1</jsoup.version>
+    <jsoup.version>1.19.1</jsoup.version>
     <jsr305.version>3.0.2</jsr305.version>
     <junit4.version>4.13.2</junit4.version>
     <junit5.version>5.13.0-RC1</junit5.version>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index 72c93c138..9850463ef 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -53,6 +53,7 @@ import javax.xml.transform.sax.SAXTransformerFactory;
 import javax.xml.transform.sax.TransformerHandler;
 import javax.xml.transform.stream.StreamResult;
 
+import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
 import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 import org.xml.sax.Attributes;
@@ -1286,4 +1287,13 @@ public class HtmlParserTest extends TikaTest {
             return DONE;
         }
     }
+
+    @Test
+    public void testJsoupScriptTagRegression() throws Exception {
+        //https://github.com/jhy/jsoup/issues/2329
+        String html = "<html><head><script src=\"blah\"/></head><body>this is content</body></html";
+        String xml = getXML(UnsynchronizedByteArrayInputStream.builder().setByteArray(html.getBytes(UTF_8)).get(),
+                TikaTest.AUTO_DETECT_PARSER, new Metadata()).xml;
+        assertContains("this is content", xml);
+    }
 }

Reply via email to