This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_2x by this push:
     new 2c28802d2 TIKA-4471 -- fix i18n for new unit tests (#2319)
2c28802d2 is described below

commit 2c28802d211abe4f48241cd5f544bf77153c3b3f
Author: Tim Allison <[email protected]>
AuthorDate: Tue Sep 9 08:51:50 2025 -0400

    TIKA-4471 -- fix i18n for new unit tests (#2319)
    
    (cherry picked from commit 9db2fb8777be0702bd7fc7e634ba725099b7faf6)
    
    # Conflicts:
    #       
tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
    #       tika-parent/pom.xml
---
 .../org/apache/tika/utils/XMLReaderUtilsTest.java  | 78 ++++++++++++----------
 tika-parent/pom.xml                                |  2 +
 2 files changed, 44 insertions(+), 36 deletions(-)

diff --git 
a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java 
b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
index 25da0e4f4..116e8f4ba 100644
--- a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
@@ -28,7 +28,6 @@ import javax.xml.stream.XMLEventReader;
 import javax.xml.stream.XMLStreamException;
 
 import org.junit.jupiter.api.AfterAll;
-import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 import org.w3c.dom.Document;
 import org.w3c.dom.Node;
@@ -40,12 +39,17 @@ import org.apache.tika.sax.ToTextContentHandler;
 
 public class XMLReaderUtilsTest {
 
-    private static final String EXTERNAL_DTD_SIMPLE_FILE = 
-            "<?xml version=\"1.0\" standalone=\"no\"?><!DOCTYPE foo SYSTEM 
\"tutorials.dtd\"><foo/>";
-    private static final String EXTERNAL_DTD_SIMPLE_URL = 
-            "<?xml version=\"1.0\" standalone=\"no\"?><!DOCTYPE foo SYSTEM 
\"http://127.234.172.38:7845/bar\";><foo/>";
-    private static final String EXTERNAL_ENTITY = 
-            "<!DOCTYPE foo [" + " <!ENTITY bar SYSTEM 
\"http://127.234.172.38:7845/bar\";>" +
+    private static final Locale defaultLocale = Locale.getDefault();
+    static {
+        //tests on content of Exception msgs require specifying locale.
+        //even this, though is not sufficient for the billion laughs tests ?!
+        Locale.setDefault(Locale.US);
+    }
+    private static final String EXTERNAL_DTD_SIMPLE_FILE = "<?xml 
version=\"1.0\" standalone=\"no\"?>" +
+            "<!DOCTYPE foo SYSTEM \"tutorials.dtd\"><foo/>";
+    private static final String EXTERNAL_DTD_SIMPLE_URL = "<?xml 
version=\"1.0\" standalone=\"no\"?>" +
+            "<!DOCTYPE foo SYSTEM \"http://127.234.172.38:7845/bar\";><foo/>";
+    private static final String EXTERNAL_ENTITY =  "<!DOCTYPE foo [" + " 
<!ENTITY bar SYSTEM \"http://127.234.172.38:7845/bar\";>" +
             " ]><foo>&bar;</foo>";
     private static final String EXTERNAL_LOCAL_DTD = "<!DOCTYPE foo [" +
             "<!ENTITY % local_dtd SYSTEM 
\"file:///usr/local/app/schema.dtd\">" +
@@ -61,8 +65,8 @@ public class XMLReaderUtilsTest {
             " <!ENTITY lol6 
\"&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;\">\n" +
             " <!ENTITY lol7 
\"&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;\">\n" +
             " <!ENTITY lol8 
\"&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;\">\n" +
-            " <!ENTITY lol9 
\"&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;\">\n" +
-            "]>\n" + "<lolz>&lol9;</lolz>";
+            " <!ENTITY lol9 
\"&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;\">\n" + "]>\n" +
+            "<lolz>&lol9;</lolz>";
 
     private static String BILLION_LAUGHS_VARIANT;
 
@@ -82,20 +86,11 @@ public class XMLReaderUtilsTest {
         BILLION_LAUGHS_VARIANT = xml.toString();
     }
 
-    private static final String[] EXTERNAL_ENTITY_XMLS = new String[]{ 
EXTERNAL_DTD_SIMPLE_FILE,
-            EXTERNAL_DTD_SIMPLE_URL,
+    private static final String[] EXTERNAL_ENTITY_XMLS = new String[]{
+            EXTERNAL_DTD_SIMPLE_FILE, EXTERNAL_DTD_SIMPLE_URL,
             EXTERNAL_ENTITY, EXTERNAL_LOCAL_DTD };
 
-    private static final String[] BILLION_LAUGHS = new String[]{ 
BILLION_LAUGHS_CLASSICAL,
-            BILLION_LAUGHS_VARIANT };
-
-    private static Locale defaultLocale;
-
-    @BeforeAll
-    public static void startUp() {
-        defaultLocale = Locale.getDefault();
-        Locale.setDefault(Locale.US);
-    }
+    private static final String[] BILLION_LAUGHS = new String[]{ 
BILLION_LAUGHS_CLASSICAL, BILLION_LAUGHS_VARIANT };
 
     @AfterAll
     public static void tearDown() {
@@ -162,11 +157,7 @@ public class XMLReaderUtilsTest {
                 XMLReaderUtils.parseSAX(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
                         new ToTextContentHandler(), new ParseContext());
             } catch (SAXException e) {
-                if (e.getMessage() != null && e.getMessage().contains("entity 
expansions")) {
-                    //do nothing
-                } else {
-                    throw e;
-                }
+                limitCheck(e);
             }
         }
     }
@@ -184,12 +175,8 @@ public class XMLReaderUtilsTest {
                 doc = XMLReaderUtils.buildDOM(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
                         new ParseContext());
             } catch (SAXException e) {
-                if (e.getMessage() != null && e.getMessage().contains("entity 
expansions")) {
-                    //do nothing
-                    continue;
-                } else {
-                    throw e;
-                }
+                limitCheck(e);
+                continue;
             }
             NodeList nodeList = doc.getChildNodes();
             StringBuilder sb = new StringBuilder();
@@ -215,12 +202,12 @@ public class XMLReaderUtilsTest {
     public void testStaxBillionLaughs() throws Exception {
         /*
             Turning off dtd support of the XMLInputFactory in XMLReaderUtils 
turns off entity expansions and
-            causes a "NoSuchElementException" with the "'lol9' was referenced 
but not declared" message with this line:
+            causes a "NoSuchElementException" with the "'lol9' was referenced 
but not declared"
+            message with this line:
                     tryToSetStaxProperty(factory, XMLInputFactory.SUPPORT_DTD, 
false);
             If that line doesn't exist, then we get a
-            NoSuchElementException with:
-            "The parser has encountered more than "20" entity expansions in 
this document;
-            this is the limit imposed by the JDK."
+            NoSuchElementException with: "The parser has encountered more than 
"20" entity
+            expansions in this document; this is the limit imposed by the JDK."
          */
 
         for (String xml : BILLION_LAUGHS) {
@@ -245,4 +232,23 @@ public class XMLReaderUtilsTest {
             }
         }
     }
+
+    private void limitCheck(SAXException e) throws SAXException {
+        String msg = e.getLocalizedMessage();
+        if (msg == null) {
+            throw e;
+        }
+
+        //depending on the flavor/version of the jdk, entity expansions may be 
triggered
+        // OR entitySizeLimit may be triggered
+        //See TIKA-4471
+        if (msg.contains("JAXP00010001") || //entity expansions
+                msg.contains("JAXP00010003") || //max entity size limit
+                msg.contains("JAXP00010004") || //TotalEntitySizeLimit
+                msg.contains("entity expansions") ||
+                e.getMessage().contains("maxGeneralEntitySizeLimit")) {
+            return;
+        }
+        throw e;
+    }
 }
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index e3c09717a..0b46cbef9 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -1268,6 +1268,8 @@
         <version>${maven.surefire.version}
         </version> <!-- versions greater than this don't like System.exit 
calls in tika-batch -->
         <configuration>
+          <!-- for manual testing of i18n, try for example: -Duser.language=zh 
-Duser.region=CN or
+            -Duser.language=de -Duser.country=DE -->
           <argLine>-Xmx3072m -Djava.awt.headless=true</argLine>
         </configuration>
       </plugin>

Reply via email to