This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4419-v2 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 88c9946e78e2889fbee397847773d03ced935779
Author: tallison <[email protected]>
AuthorDate: Mon May 19 10:46:07 2025 -0400

    TIKA-4419 -- downgrade jsoup
---
 tika-parent/pom.xml | 2 +-
 .../test/java/org/apache/tika/parser/html/HtmlParserTest.java | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 6064bc5e0..042ce28a4 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -393,7 +393,7 @@
     <jhighlight.version>1.1.0</jhighlight.version>
     <jna.version>5.17.0</jna.version>
     <json.simple.version>1.1.1</json.simple.version>
-    <jsoup.version>1.20.1</jsoup.version>
+    <jsoup.version>1.19.1</jsoup.version>
     <jsr305.version>3.0.2</jsr305.version>
     <junit4.version>4.13.2</junit4.version>
     <junit5.version>5.13.0-RC1</junit5.version>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index 72c93c138..9850463ef 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -53,6 +53,7 @@ import javax.xml.transform.sax.SAXTransformerFactory;
 import javax.xml.transform.sax.TransformerHandler;
 import javax.xml.transform.stream.StreamResult;
 
+import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
 import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 import org.xml.sax.Attributes;
@@ -1286,4 +1287,13 @@ public class HtmlParserTest extends TikaTest {
             return DONE;
         }
     }
+
+    @Test
+    public void testJsoupScriptTagRegression() throws Exception {
+        //https://github.com/jhy/jsoup/issues/2329
+        String html = "<html><head><script src=\"blah\"/></head><body>this is content</body></html";
+        String xml = getXML(UnsynchronizedByteArrayInputStream.builder().setByteArray(html.getBytes(UTF_8)).get(),
+                TikaTest.AUTO_DETECT_PARSER, new Metadata()).xml;
+        assertContains("this is content", xml);
+    }
 }
