This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4633-centralize-limits
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 2793f96542d34c3f6ad952aba36545881a15659a
Author: tallison <[email protected]>
AuthorDate: Wed Jan 28 08:46:19 2026 -0500

    TIKA-4633 -- first steps
---
 .../org/apache/tika/config/EmbeddedLimits.java     | 222 +++++++++++++++++++++
 .../org/apache/tika/config/EmbeddedLimitsTest.java | 109 ++++++++++
 .../resources/configs/embedded-limits-test.json    |  10 +
 3 files changed, 341 insertions(+)

diff --git a/tika-core/src/main/java/org/apache/tika/config/EmbeddedLimits.java b/tika-core/src/main/java/org/apache/tika/config/EmbeddedLimits.java
new file mode 100644
index 0000000000..104e0fe8a0
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/config/EmbeddedLimits.java
@@ -0,0 +1,222 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config;
+
+import java.io.Serializable;
+
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * Configuration for limits on embedded document processing.
+ * <p>
+ * This controls how deep and how many embedded documents are processed:
+ * <ul>
+ *   <li>{@code maxDepth} - maximum nesting depth for embedded documents (-1 = unlimited)</li>
+ *   <li>{@code throwOnMaxDepth} - whether to throw an exception when maxDepth is reached</li>
+ *   <li>{@code maxCount} - maximum number of embedded documents to process (-1 = unlimited)</li>
+ *   <li>{@code throwOnMaxCount} - whether to throw an exception when maxCount is reached</li>
+ * </ul>
+ * <p>
+ * <b>maxDepth behavior:</b> When the depth limit is reached, recursion stops but siblings at the
+ * current level continue to be processed. For example, with maxDepth=1:
+ * <pre>
+ * container.zip (depth 0)
+ * ├── doc1.docx (depth 1) ✓ PARSED
+ * │   ├── image1.png (depth 2) ✗ NOT PARSED (exceeds maxDepth)
+ * │   └── embed.xlsx (depth 2) ✗ NOT PARSED (exceeds maxDepth)
+ * ├── doc2.pdf (depth 1) ✓ PARSED (sibling at same level)
+ * └── doc3.txt (depth 1) ✓ PARSED (sibling at same level)
+ * </pre>
+ * <p>
+ * <b>maxCount behavior:</b> When the count limit is reached, processing stops immediately.
+ * No more embedded documents are processed, including siblings.
+ * <p>
+ * When a limit is hit and throwing is disabled:
+ * <ul>
+ *   <li>{@code X-TIKA-maxDepthReached=true} is set when maxDepth is hit</li>
+ *   <li>{@code X-TIKA-maxEmbeddedCountReached=true} is set when maxCount is hit</li>
+ * </ul>
+ * <p>
+ * Example configuration:
+ * <pre>
+ * {
+ *   "other-configs": {
+ *     "embedded-limits": {
+ *       "maxDepth": 10,
+ *       "throwOnMaxDepth": false,
+ *       "maxCount": 1000,
+ *       "throwOnMaxCount": false
+ *     }
+ *   }
+ * }
+ * </pre>
+ *
+ * @since Apache Tika 4.0
+ */
+@TikaComponent(spi = false)
+public class EmbeddedLimits implements Serializable {
+
+    private static final long serialVersionUID = 1L;
+
+    public static final int UNLIMITED = -1; // sentinel for "no limit"; default for maxDepth and maxCount
+
+    private int maxDepth = UNLIMITED;
+    private boolean throwOnMaxDepth = false;
+    private int maxCount = UNLIMITED;
+    private boolean throwOnMaxCount = false;
+
+    /**
+     * No-arg constructor for Jackson deserialization; every field keeps its default (unlimited, no throwing).
+     */
+    public EmbeddedLimits() {
+    }
+
+    /**
+     * Creates limits with every setting given explicitly; values are stored as-is, without validation.
+     *
+     * @param maxDepth maximum nesting depth (-1 = unlimited)
+     * @param throwOnMaxDepth whether to throw when depth limit is reached
+     * @param maxCount maximum number of embedded documents (-1 = unlimited)
+     * @param throwOnMaxCount whether to throw when count limit is reached
+     */
+    public EmbeddedLimits(int maxDepth, boolean throwOnMaxDepth, int maxCount, boolean throwOnMaxCount) {
+        this.maxDepth = maxDepth;
+        this.throwOnMaxDepth = throwOnMaxDepth;
+        this.maxCount = maxCount;
+        this.throwOnMaxCount = throwOnMaxCount;
+    }
+
+    /**
+     * Gets the maximum nesting depth for embedded documents.
+     *
+     * @return maximum depth, or -1 for unlimited
+     */
+    public int getMaxDepth() {
+        return maxDepth;
+    }
+
+    /**
+     * Sets the maximum nesting depth for embedded documents.
+     *
+     * @param maxDepth maximum depth, or -1 for unlimited
+     */
+    public void setMaxDepth(int maxDepth) {
+        this.maxDepth = maxDepth;
+    }
+
+    /**
+     * Gets whether to throw an exception when maxDepth is reached.
+     *
+     * @return true if an exception should be thrown
+     */
+    public boolean isThrowOnMaxDepth() {
+        return throwOnMaxDepth;
+    }
+
+    /**
+     * Sets whether to throw an exception when maxDepth is reached.
+     *
+     * @param throwOnMaxDepth true to throw an exception
+     */
+    public void setThrowOnMaxDepth(boolean throwOnMaxDepth) {
+        this.throwOnMaxDepth = throwOnMaxDepth;
+    }
+
+    /**
+     * Gets the maximum number of embedded documents to process.
+     *
+     * @return maximum count, or -1 for unlimited
+     */
+    public int getMaxCount() {
+        return maxCount;
+    }
+
+    /**
+     * Sets the maximum number of embedded documents to process.
+     *
+     * @param maxCount maximum count, or -1 for unlimited
+     */
+    public void setMaxCount(int maxCount) {
+        this.maxCount = maxCount;
+    }
+
+    /**
+     * Gets whether to throw an exception when maxCount is reached.
+     *
+     * @return true if an exception should be thrown
+     */
+    public boolean isThrowOnMaxCount() {
+        return throwOnMaxCount;
+    }
+
+    /**
+     * Sets whether to throw an exception when maxCount is reached.
+     *
+     * @param throwOnMaxCount true to throw an exception
+     */
+    public void setThrowOnMaxCount(boolean throwOnMaxCount) {
+        this.throwOnMaxCount = throwOnMaxCount;
+    }
+
+    /**
+     * Returns the EmbeddedLimits registered in the ParseContext, or a fresh default instance if none is set.
+     *
+     * @param context the ParseContext (may be null)
+     * @return the EmbeddedLimits from context, or a new instance with defaults if not found (never null)
+     */
+    public static EmbeddedLimits get(ParseContext context) {
+        if (context == null) {
+            return new EmbeddedLimits();
+        }
+        EmbeddedLimits limits = context.get(EmbeddedLimits.class);
+        return limits != null ? limits : new EmbeddedLimits();
+    }
+
+    @Override
+    public String toString() {
+        return "EmbeddedLimits{" +
+                "maxDepth=" + maxDepth +
+                ", throwOnMaxDepth=" + throwOnMaxDepth +
+                ", maxCount=" + maxCount +
+                ", throwOnMaxCount=" + throwOnMaxCount +
+                '}';
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) {
+            return true;
+        }
+        if (o == null || getClass() != o.getClass()) {
+            return false;
+        }
+        EmbeddedLimits that = (EmbeddedLimits) o;
+        return maxDepth == that.maxDepth &&
+                throwOnMaxDepth == that.throwOnMaxDepth &&
+                maxCount == that.maxCount &&
+                throwOnMaxCount == that.throwOnMaxCount;
+    }
+
+    @Override
+    public int hashCode() {
+        int result = maxDepth;
+        result = 31 * result + (throwOnMaxDepth ? 1 : 0);
+        result = 31 * result + maxCount;
+        result = 31 * result + (throwOnMaxCount ? 1 : 0);
+        return result;
+    }
+}
diff --git a/tika-serialization/src/test/java/org/apache/tika/config/EmbeddedLimitsTest.java b/tika-serialization/src/test/java/org/apache/tika/config/EmbeddedLimitsTest.java
new file mode 100644
index 0000000000..f362271fb3
--- /dev/null
+++ b/tika-serialization/src/test/java/org/apache/tika/config/EmbeddedLimitsTest.java
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.loader.TikaLoader;
+import org.apache.tika.parser.ParseContext;
+
+public class EmbeddedLimitsTest extends TikaTest {
+
+    @Test
+    public void testLoadFromConfig() throws Exception {
+        TikaLoader loader = TikaLoader.load(getConfigPath(getClass(), "embedded-limits-test.json"));
+        EmbeddedLimits limits = loader.configs().load(EmbeddedLimits.class);
+
+        assertNotNull(limits);
+        assertEquals(5, limits.getMaxDepth());
+        assertTrue(limits.isThrowOnMaxDepth());
+        assertEquals(100, limits.getMaxCount());
+        assertFalse(limits.isThrowOnMaxCount());
+    }
+
+    @Test
+    public void testLoadIntoParseContext() throws Exception {
+        TikaLoader loader = TikaLoader.load(getConfigPath(getClass(), "embedded-limits-test.json"));
+        ParseContext context = loader.loadParseContext();
+
+        EmbeddedLimits limits = context.get(EmbeddedLimits.class);
+        assertNotNull(limits);
+        assertEquals(5, limits.getMaxDepth());
+        assertTrue(limits.isThrowOnMaxDepth());
+        assertEquals(100, limits.getMaxCount());
+        assertFalse(limits.isThrowOnMaxCount());
+    }
+
+    @Test
+    public void testDefaults() {
+        EmbeddedLimits limits = new EmbeddedLimits();
+        assertEquals(EmbeddedLimits.UNLIMITED, limits.getMaxDepth());
+        assertFalse(limits.isThrowOnMaxDepth());
+        assertEquals(EmbeddedLimits.UNLIMITED, limits.getMaxCount());
+        assertFalse(limits.isThrowOnMaxCount());
+    }
+
+    @Test
+    public void testHelperMethod() {
+        // Null context: get() must fall back to a default (unlimited) instance
+        EmbeddedLimits limits = EmbeddedLimits.get(null);
+        assertNotNull(limits);
+        assertEquals(EmbeddedLimits.UNLIMITED, limits.getMaxDepth());
+
+        // Empty context (no EmbeddedLimits registered): same default fallback
+        ParseContext context = new ParseContext();
+        limits = EmbeddedLimits.get(context);
+        assertNotNull(limits);
+        assertEquals(EmbeddedLimits.UNLIMITED, limits.getMaxDepth());
+
+        // Context with EmbeddedLimits registered: get() must return those values
+        EmbeddedLimits customLimits = new EmbeddedLimits(10, true, 500, false);
+        context.set(EmbeddedLimits.class, customLimits);
+        limits = EmbeddedLimits.get(context);
+        assertEquals(10, limits.getMaxDepth());
+        assertTrue(limits.isThrowOnMaxDepth());
+        assertEquals(500, limits.getMaxCount());
+        assertFalse(limits.isThrowOnMaxCount());
+    }
+
+    @Test
+    public void testEqualsAndHashCode() {
+        EmbeddedLimits limits1 = new EmbeddedLimits(5, true, 100, false);
+        EmbeddedLimits limits2 = new EmbeddedLimits(5, true, 100, false);
+        EmbeddedLimits limits3 = new EmbeddedLimits(10, true, 100, false);
+
+        assertEquals(limits1, limits2);
+        assertEquals(limits1.hashCode(), limits2.hashCode());
+        assertFalse(limits1.equals(limits3));
+    }
+
+    @Test
+    public void testToString() {
+        EmbeddedLimits limits = new EmbeddedLimits(5, true, 100, false);
+        String str = limits.toString();
+        assertTrue(str.contains("maxDepth=5"));
+        assertTrue(str.contains("throwOnMaxDepth=true"));
+        assertTrue(str.contains("maxCount=100"));
+        assertTrue(str.contains("throwOnMaxCount=false"));
+    }
+}
diff --git a/tika-serialization/src/test/resources/configs/embedded-limits-test.json b/tika-serialization/src/test/resources/configs/embedded-limits-test.json
new file mode 100644
index 0000000000..14ce20d2be
--- /dev/null
+++ b/tika-serialization/src/test/resources/configs/embedded-limits-test.json
@@ -0,0 +1,10 @@
+{
+  "other-configs": {
+    "embedded-limits": {
+      "maxDepth": 5,
+      "throwOnMaxDepth": true,
+      "maxCount": 100,
+      "throwOnMaxCount": false
+    }
+  }
+}

Reply via email to