This is an automated email from the ASF dual-hosted git repository.

tilman pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_2x by this push:
     new 71f800fd3 TIKA-4471: add test from trunk
71f800fd3 is described below

commit 71f800fd3a51243940ac5f693a059102f31e501b
Author: Tilman Hausherr <[email protected]>
AuthorDate: Tue Sep 9 12:38:36 2025 +0200

    TIKA-4471: add test from trunk
---
 .../java/org/apache/tika/utils/XMLReaderUtils.java |   2 +-
 .../org/apache/tika/utils/XMLReaderUtilsTest.java  | 225 +++++++++++++++++++--
 2 files changed, 210 insertions(+), 17 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index ca5226a88..5aab3c4e7 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -1083,4 +1083,4 @@ public class XMLReaderUtils implements Serializable {
             trySetXercesSecurityManager(saxParser);
         }
     }
-}
\ No newline at end of file
+}
diff --git a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
index 18e253587..b6ae25cbc 100644
--- a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
@@ -16,40 +16,233 @@
  */
 package org.apache.tika.utils;
 
+import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.fail;
 
 import java.io.ByteArrayInputStream;
 import java.net.ConnectException;
 import java.nio.charset.StandardCharsets;
+import java.util.Locale;
+import java.util.NoSuchElementException;
+import javax.xml.stream.XMLEventReader;
+import javax.xml.stream.XMLStreamException;
 
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
 
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.ToTextContentHandler;
 
 public class XMLReaderUtilsTest {
+
+    private static final String EXTERNAL_DTD_SIMPLE_FILE = 
+            "<?xml version=\"1.0\" standalone=\"no\"?><!DOCTYPE foo SYSTEM 
\"tutorials.dtd\"><foo/>";
+    private static final String EXTERNAL_DTD_SIMPLE_URL = 
+            "<?xml version=\"1.0\" standalone=\"no\"?><!DOCTYPE foo SYSTEM \"http://127.234.172.38:7845/bar\"><foo/>";
+    private static final String EXTERNAL_ENTITY = 
+            "<!DOCTYPE foo [" + " <!ENTITY bar SYSTEM \"http://127.234.172.38:7845/bar\">" +
+            " ]><foo>&bar;</foo>";
+    private static final String EXTERNAL_LOCAL_DTD = "<!DOCTYPE foo [" +
+            "<!ENTITY % local_dtd SYSTEM 
\"file:///usr/local/app/schema.dtd\">" +
+            "%local_dtd;]><foo/>";
+
+    private static final String BILLION_LAUGHS_CLASSICAL = "<?xml 
version=\"1.0\"?>\n" +
+            "<!DOCTYPE lolz [\n" + " <!ENTITY lol \"lol\">\n" + " <!ELEMENT 
lolz (#PCDATA)>\n" +
+            " <!ENTITY lol1 
\"&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;\">\n" +
+            " <!ENTITY lol2 
\"&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;\">\n" +
+            " <!ENTITY lol3 
\"&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;\">\n" +
+            " <!ENTITY lol4 
\"&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;\">\n" +
+            " <!ENTITY lol5 
\"&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;\">\n" +
+            " <!ENTITY lol6 
\"&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;\">\n" +
+            " <!ENTITY lol7 
\"&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;\">\n" +
+            " <!ENTITY lol8 
\"&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;\">\n" +
+            " <!ENTITY lol9 
\"&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;\">\n" +
+            "]>\n" + "<lolz>&lol9;</lolz>";
+
+    private static String BILLION_LAUGHS_VARIANT;
+
+    static {
+        StringBuilder entity = new StringBuilder();
+        for (int i = 0; i < 1000000; i++) {
+            entity.append("a");
+        }
+        StringBuilder xml = new StringBuilder();
+        xml.append("<?xml version=\"1.0\"?>\n" + "<!DOCTYPE kaboom [\n" + "  
<!ENTITY a \"");
+        xml.append(entity.toString());
+        xml.append("\">]>" + "<kaboom>");
+        for (int i = 0; i < 100000; i++) {
+            xml.append("&a;");
+        }
+        xml.append("</kaboom>");
+        BILLION_LAUGHS_VARIANT = xml.toString();
+    }
+
+    private static final String[] EXTERNAL_ENTITY_XMLS = new String[]{ 
EXTERNAL_DTD_SIMPLE_FILE,
+            EXTERNAL_DTD_SIMPLE_URL,
+            EXTERNAL_ENTITY, EXTERNAL_LOCAL_DTD };
+
+    private static final String[] BILLION_LAUGHS = new String[]{ 
BILLION_LAUGHS_CLASSICAL,
+            BILLION_LAUGHS_VARIANT };
+
+    private static Locale defaultLocale;
+
+    @BeforeEach
+    public void beforeEach() {
+        defaultLocale = Locale.getDefault();
+        Locale.setDefault(Locale.US);
+    }
+
+    @AfterAll
+    public static void tearDown() {
+        Locale.setDefault(defaultLocale);
+    }
+
     //make sure that parseSAX actually defends against external entities
     @Test
-    public void testExternalDTD() throws Exception {
-        String xml = "<!DOCTYPE foo SYSTEM \"http://127.234.172.38:7845/bar\"><foo/>";
-        try {
-            XMLReaderUtils.parseSAX(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
-                    new ToTextContentHandler(), new ParseContext());
-        } catch (ConnectException e) {
-            fail("Parser tried to access the external DTD:" + e);
+    public void testSAX() throws Exception {
+        for (String xml : EXTERNAL_ENTITY_XMLS) {
+            try {
+                XMLReaderUtils.parseSAX(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
+                        new ToTextContentHandler(), new ParseContext());
+            } catch (ConnectException e) {
+                fail("Parser tried to access resource: " + xml, e);
+            }
+        }
+    }
+
+    @Test
+    public void testDOM() throws Exception {
+        for (String xml : EXTERNAL_ENTITY_XMLS) {
+            try {
+                XMLReaderUtils.buildDOM(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
+                        new ParseContext());
+            } catch (ConnectException e) {
+                fail("Parser tried to access resource: " + xml, e);
+            }
+        }
+    }
+
+    @Test
+    public void testStax() throws Exception {
+        for (String xml : EXTERNAL_ENTITY_XMLS) {
+            try {
+                javax.xml.stream.XMLInputFactory xmlInputFactory = 
XMLReaderUtils.getXMLInputFactory();
+                XMLEventReader reader = xmlInputFactory.createXMLEventReader(
+                        new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
+                StringBuilder sb = new StringBuilder();
+                while (reader.hasNext()) {
+                    sb.append(reader.next());
+                }
+                if (sb.toString().contains("Exception scanning External")) {
+                    fail("tried to read external dtd");
+                }
+            } catch (XMLStreamException e) {
+                fail("StreamException: " + xml, e);
+            } catch (NoSuchElementException e) {
+                if (e.getMessage() != null) {
+                    if (e.getMessage().contains("Connection refused")) {
+                        fail("Vulnerable to ssrf via url: " + xml, e);
+                    } else if (e.getMessage().contains("No such file")) {
+                        fail("Vulnerable to local file read via external 
entity/dtd: " + xml, e);
+                    }
+                }
+            }
+        }
+    }
+
+    @Test
+    public void testSAXBillionLaughs() throws Exception {
+        for (String xml : BILLION_LAUGHS) {
+            try {
+                XMLReaderUtils.parseSAX(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
+                        new ToTextContentHandler(), new ParseContext());
+            } catch (SAXException e) {
+                if (e.getMessage() != null && e.getMessage().contains("entity 
expansions")) {
+                    //do nothing
+                } else {
+                    throw e;
+                }
+            }
         }
     }
 
     @Test
-    public void testExternalEntity() throws Exception {
-        String xml =
-                "<!DOCTYPE foo [" + " <!ENTITY bar SYSTEM \"http://127.234.172.38:7845/bar\">" +
-                        " ]><foo>&bar;</foo>";
-        try {
-            XMLReaderUtils.parseSAX(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
-                    new ToTextContentHandler(), new ParseContext());
-        } catch (ConnectException e) {
-            fail("Parser tried to access the external DTD:" + e);
+    public void testDOMBillionLaughs() throws Exception {
+        //confirm that ExpandEntityReferences has been set to false.
+
+        //some implementations ignore the expandEntityReferences=false, and we 
are still
+        //protected by the "The parser has encountered more than "20" entity 
expansions" SAXException.
+        //We need to check for either: empty content and no exception, or this 
SAXException
+        for (String xml : BILLION_LAUGHS) {
+            Document doc = null;
+            try {
+                doc = XMLReaderUtils.buildDOM(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
+                        new ParseContext());
+            } catch (SAXException e) {
+                if (e.getMessage() != null && e.getMessage().contains("entity 
expansions")) {
+                    //do nothing
+                    continue;
+                } else {
+                    throw e;
+                }
+            }
+            NodeList nodeList = doc.getChildNodes();
+            StringBuilder sb = new StringBuilder();
+            dumpChildren(nodeList, sb);
+            assertEquals(0, sb
+                    .toString()
+                    .trim()
+                    .length(), sb.toString());
+        }
+    }
+
+    private void dumpChildren(NodeList nodeList, StringBuilder sb) {
+        for (int i = 0; i < nodeList.getLength(); i++) {
+            Node n = nodeList.item(i);
+            String txt = n.getTextContent();
+            if (txt != null) {
+                sb.append(txt);
+            }
+        }
+    }
+
+    @Test
+    public void testStaxBillionLaughs() throws Exception {
+        /*
+            Turning off dtd support of the XMLInputFactory in XMLReaderUtils 
turns off entity expansions and
+            causes a "NoSuchElementException" with the "'lol9' was referenced 
but not declared" message with this line:
+                    tryToSetStaxProperty(factory, XMLInputFactory.SUPPORT_DTD, 
false);
+            If that line doesn't exist, then we get a
+            NoSuchElementException with:
+            "The parser has encountered more than "20" entity expansions in 
this document;
+            this is the limit imposed by the JDK."
+         */
+
+        for (String xml : BILLION_LAUGHS) {
+            javax.xml.stream.XMLInputFactory xmlInputFactory = 
XMLReaderUtils.getXMLInputFactory();
+            XMLEventReader reader = xmlInputFactory.createXMLEventReader(
+                    new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
+            try {
+                while (reader.hasNext()) {
+                    reader.next();
+                }
+            } catch (NoSuchElementException e) {
+                //full message on temurin-17: The entity "lol9" was 
referenced, but not declared.
+                if (e.getMessage() != null && e
+                        .getMessage()
+                        .contains("referenced") && e
+                        .getMessage()
+                        .contains("not declared")) {
+                    //swallow -- this is expected
+                } else {
+                    throw e;
+                }
+            }
         }
     }
 }

Reply via email to