This is an automated email from the ASF dual-hosted git repository.
tilman pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_2x by this push:
new 71f800fd3 TIKA-4471: add test from trunk
71f800fd3 is described below
commit 71f800fd3a51243940ac5f693a059102f31e501b
Author: Tilman Hausherr <[email protected]>
AuthorDate: Tue Sep 9 12:38:36 2025 +0200
TIKA-4471: add test from trunk
---
.../java/org/apache/tika/utils/XMLReaderUtils.java | 2 +-
.../org/apache/tika/utils/XMLReaderUtilsTest.java | 225 +++++++++++++++++++--
2 files changed, 210 insertions(+), 17 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index ca5226a88..5aab3c4e7 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -1083,4 +1083,4 @@ public class XMLReaderUtils implements Serializable {
trySetXercesSecurityManager(saxParser);
}
}
-}
\ No newline at end of file
+}
diff --git
a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
index 18e253587..b6ae25cbc 100644
--- a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
@@ -16,40 +16,233 @@
*/
package org.apache.tika.utils;
+import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.fail;
import java.io.ByteArrayInputStream;
import java.net.ConnectException;
import java.nio.charset.StandardCharsets;
+import java.util.Locale;
+import java.util.NoSuchElementException;
+import javax.xml.stream.XMLEventReader;
+import javax.xml.stream.XMLStreamException;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.ToTextContentHandler;
public class XMLReaderUtilsTest {
+
+ private static final String EXTERNAL_DTD_SIMPLE_FILE =
+ "<?xml version=\"1.0\" standalone=\"no\"?><!DOCTYPE foo SYSTEM
\"tutorials.dtd\"><foo/>";
+ private static final String EXTERNAL_DTD_SIMPLE_URL =
+ "<?xml version=\"1.0\" standalone=\"no\"?><!DOCTYPE foo SYSTEM
\"http://127.234.172.38:7845/bar\"><foo/>";
+ private static final String EXTERNAL_ENTITY =
+ "<!DOCTYPE foo [" + " <!ENTITY bar SYSTEM
\"http://127.234.172.38:7845/bar\">" +
+ " ]><foo>&bar;</foo>";
+ private static final String EXTERNAL_LOCAL_DTD = "<!DOCTYPE foo [" +
+ "<!ENTITY % local_dtd SYSTEM
\"file:///usr/local/app/schema.dtd\">" +
+ "%local_dtd;]><foo/>";
+
+ private static final String BILLION_LAUGHS_CLASSICAL = "<?xml
version=\"1.0\"?>\n" +
+ "<!DOCTYPE lolz [\n" + " <!ENTITY lol \"lol\">\n" + " <!ELEMENT
lolz (#PCDATA)>\n" +
+ " <!ENTITY lol1
\"&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;\">\n" +
+ " <!ENTITY lol2
\"&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;\">\n" +
+ " <!ENTITY lol3
\"&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;\">\n" +
+ " <!ENTITY lol4
\"&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;\">\n" +
+ " <!ENTITY lol5
\"&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;\">\n" +
+ " <!ENTITY lol6
\"&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;\">\n" +
+ " <!ENTITY lol7
\"&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;\">\n" +
+ " <!ENTITY lol8
\"&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;\">\n" +
+ " <!ENTITY lol9
\"&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;\">\n" +
+ "]>\n" + "<lolz>&lol9;</lolz>";
+
+ // Second entity-expansion payload; built once by the static initializer below because it is
+ // too large to write as a literal.
+ private static String BILLION_LAUGHS_VARIANT;
+
+ static {
+ // Entity value: 1,000,000 'a' characters.
+ StringBuilder entity = new StringBuilder();
+ for (int i = 0; i < 1000000; i++) {
+ entity.append("a");
+ }
+ // Document: a DOCTYPE declaring entity "a" with that value, then a root element <kaboom>.
+ StringBuilder xml = new StringBuilder();
+ xml.append("<?xml version=\"1.0\"?>\n" + "<!DOCTYPE kaboom [\n" + "
<!ENTITY a \"");
+ xml.append(entity.toString());
+ xml.append("\">]>" + "<kaboom>");
+ // The root element body references the entity 100,000 times.
+ for (int i = 0; i < 100000; i++) {
+ xml.append("&a;");
+ }
+ xml.append("</kaboom>");
+ BILLION_LAUGHS_VARIANT = xml.toString();
+ }
+
+ private static final String[] EXTERNAL_ENTITY_XMLS = new String[]{
EXTERNAL_DTD_SIMPLE_FILE,
+ EXTERNAL_DTD_SIMPLE_URL,
+ EXTERNAL_ENTITY, EXTERNAL_LOCAL_DTD };
+
+ private static final String[] BILLION_LAUGHS = new String[]{
BILLION_LAUGHS_CLASSICAL,
+ BILLION_LAUGHS_VARIANT };
+
+ private static Locale defaultLocale;
+
+ // Pins the JVM default locale to Locale.US before every test.
+ // NOTE(review): presumably needed because the tests match on English parser messages -- confirm.
+ @BeforeEach
+ public void beforeEach() {
+ // Capture the original default only once. Re-reading it before every test would record the
+ // Locale.US value installed by the previous test, and tearDown() would then "restore" US
+ // instead of the locale the JVM started with.
+ if (defaultLocale == null) {
+ defaultLocale = Locale.getDefault();
+ }
+ Locale.setDefault(Locale.US);
+ }
+
+ // Restores the default locale that was in effect before the first test of this class ran.
+ @AfterAll
+ public static void tearDown() {
+ // defaultLocale stays null if beforeEach() never ran; Locale.setDefault(null) would throw.
+ if (defaultLocale != null) {
+ Locale.setDefault(defaultLocale);
+ }
+ }
+
//make sure that parseSAX actually defends against external entities
@Test
- public void testExternalDTD() throws Exception {
- String xml = "<!DOCTYPE foo SYSTEM
\"http://127.234.172.38:7845/bar\"><foo/>";
- try {
- XMLReaderUtils.parseSAX(new
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
- new ToTextContentHandler(), new ParseContext());
- } catch (ConnectException e) {
- fail("Parser tried to access the external DTD:" + e);
+ public void testSAX() throws Exception {
+ // Feed every payload in EXTERNAL_ENTITY_XMLS to parseSAX with a fresh handler and context.
+ for (String xml : EXTERNAL_ENTITY_XMLS) {
+ try {
+ XMLReaderUtils.parseSAX(new
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
+ new ToTextContentHandler(), new ParseContext());
+ } catch (ConnectException e) {
+ // A ConnectException is treated as evidence that the parser tried to open the resource.
+ fail("Parser tried to access resource: " + xml, e);
+ }
+ }
+ }
+
+ // Same check as testSAX, but through XMLReaderUtils.buildDOM: every payload in
+ // EXTERNAL_ENTITY_XMLS is parsed, and a ConnectException fails the test.
+ @Test
+ public void testDOM() throws Exception {
+ for (String xml : EXTERNAL_ENTITY_XMLS) {
+ try {
+ XMLReaderUtils.buildDOM(new
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
+ new ParseContext());
+ } catch (ConnectException e) {
+ // A ConnectException is treated as evidence that the parser tried to open the resource.
+ fail("Parser tried to access resource: " + xml, e);
+ }
+ }
+ }
+
+ // Reads every payload in EXTERNAL_ENTITY_XMLS with an XMLEventReader created from the factory
+ // returned by XMLReaderUtils.getXMLInputFactory() and fails on signs of external access.
+ @Test
+ public void testStax() throws Exception {
+ for (String xml : EXTERNAL_ENTITY_XMLS) {
+ try {
+ javax.xml.stream.XMLInputFactory xmlInputFactory =
XMLReaderUtils.getXMLInputFactory();
+ XMLEventReader reader = xmlInputFactory.createXMLEventReader(
+ new
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
+ // Drain the reader, concatenating the string form of every event it returns.
+ StringBuilder sb = new StringBuilder();
+ while (reader.hasNext()) {
+ sb.append(reader.next());
+ }
+ // Fail if the concatenated event text contains this marker string.
+ if (sb.toString().contains("Exception scanning External")) {
+ fail("tried to read external dtd");
+ }
+ } catch (XMLStreamException e) {
+ fail("StreamException: " + xml, e);
+ } catch (NoSuchElementException e) {
+ // Only messages that indicate a network or file access attempt fail the test;
+ // a NoSuchElementException with any other message, or with none, is ignored.
+ if (e.getMessage() != null) {
+ if (e.getMessage().contains("Connection refused")) {
+ fail("Vulnerable to ssrf via url: " + xml, e);
+ } else if (e.getMessage().contains("No such file")) {
+ fail("Vulnerable to local file read via external
entity/dtd: " + xml, e);
+ }
+ }
+ }
+ }
+ }
+
+ // Parses every payload in BILLION_LAUGHS with parseSAX. The test passes if parsing returns
+ // normally or throws a SAXException whose message mentions "entity expansions".
+ @Test
+ public void testSAXBillionLaughs() throws Exception {
+ for (String xml : BILLION_LAUGHS) {
+ try {
+ XMLReaderUtils.parseSAX(new
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
+ new ToTextContentHandler(), new ParseContext());
+ } catch (SAXException e) {
+ // Tolerate only the entity-expansion-limit message; rethrow any other SAXException.
+ if (e.getMessage() != null && e.getMessage().contains("entity
expansions")) {
+ //do nothing
+ } else {
+ throw e;
+ }
+ }
}
}
@Test
- public void testExternalEntity() throws Exception {
- String xml =
- "<!DOCTYPE foo [" + " <!ENTITY bar SYSTEM
\"http://127.234.172.38:7845/bar\">" +
- " ]><foo>&bar;</foo>";
- try {
- XMLReaderUtils.parseSAX(new
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
- new ToTextContentHandler(), new ParseContext());
- } catch (ConnectException e) {
- fail("Parser tried to access the external DTD:" + e);
+ public void testDOMBillionLaughs() throws Exception {
+ //confirm that ExpandEntityReferences has been set to false.
+
+ //some implementations ignore the expandEntityReferences=false, and we
are still
+ //protected by the "The parser has encountered more than "20" entity
expansions" SAXException.
+ //We need to check for either: empty content and no exception, or this
SAXException
+ for (String xml : BILLION_LAUGHS) {
+ Document doc = null;
+ try {
+ doc = XMLReaderUtils.buildDOM(new
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
+ new ParseContext());
+ } catch (SAXException e) {
+ // The entity-expansion-limit message is an accepted outcome: skip to the next payload.
+ if (e.getMessage() != null && e.getMessage().contains("entity
expansions")) {
+ //do nothing
+ continue;
+ } else {
+ throw e;
+ }
+ }
+ // No exception: the text content of the document's child nodes must be empty after
+ // trimming. The collected text is passed as the assertion failure message.
+ NodeList nodeList = doc.getChildNodes();
+ StringBuilder sb = new StringBuilder();
+ dumpChildren(nodeList, sb);
+ assertEquals(0, sb
+ .toString()
+ .trim()
+ .length(), sb.toString());
+ }
+ }
+
+ // Appends getTextContent() of each node in nodeList to sb, in list order; nodes whose
+ // getTextContent() returns null contribute nothing.
+ private void dumpChildren(NodeList nodeList, StringBuilder sb) {
+ for (int i = 0; i < nodeList.getLength(); i++) {
+ Node n = nodeList.item(i);
+ String txt = n.getTextContent();
+ if (txt != null) {
+ sb.append(txt);
+ }
+ }
+ }
+
+ @Test
+ public void testStaxBillionLaughs() throws Exception {
+ /*
+ Turning off dtd support of the XMLInputFactory in XMLReaderUtils
turns off entity expansions and
+ causes a "NoSuchElementException" with the "'lol9' was referenced
but not declared" message with this line:
+ tryToSetStaxProperty(factory, XMLInputFactory.SUPPORT_DTD,
false);
+ If that line doesn't exist, then we get a
+ NoSuchElementException with:
+ "The parser has encountered more than "20" entity expansions in
this document;
+ this is the limit imposed by the JDK."
+ */
+
+ for (String xml : BILLION_LAUGHS) {
+ javax.xml.stream.XMLInputFactory xmlInputFactory =
XMLReaderUtils.getXMLInputFactory();
+ XMLEventReader reader = xmlInputFactory.createXMLEventReader(
+ new
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
+ try {
+ // Drain the reader; the events themselves are discarded.
+ // NOTE(review): a payload that is read to the end without any exception also passes.
+ while (reader.hasNext()) {
+ reader.next();
+ }
+ } catch (NoSuchElementException e) {
+ //full message on temurin-17: The entity "lol9" was
referenced, but not declared.
+ // Only a message containing both "referenced" and "not declared" is accepted;
+ // any other NoSuchElementException is rethrown.
+ if (e.getMessage() != null && e
+ .getMessage()
+ .contains("referenced") && e
+ .getMessage()
+ .contains("not declared")) {
+ //swallow -- this is expected
+ } else {
+ throw e;
+ }
+ }
}
}
}