This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4633-centralize-limits in repository https://gitbox.apache.org/repos/asf/tika.git
commit a18b6242191ad4cdc17007955766e490978a49ba Author: tallison <[email protected]> AuthorDate: Wed Jan 28 15:23:56 2026 -0500 TIKA-4633 -- centralize limits --- .../ROOT/pages/advanced/setting-limits.adoc | 352 ++++++++++++++++----- .../src/main/java/org/apache/tika/cli/TikaCLI.java | 21 +- .../src/main/java/org/apache/tika/gui/TikaGUI.java | 2 +- .../test/resources/configs/config-template.json | 1 - .../exception/EmbeddedLimitReachedException.java | 62 ++++ .../tika/extractor/EmbeddedDocumentExtractor.java | 9 + .../ParsingEmbeddedDocumentExtractor.java | 49 ++- .../apache/tika/metadata/TikaCoreProperties.java | 7 + .../org/apache/tika/parser/CompositeParser.java | 6 + .../java/org/apache/tika/parser/ParseRecord.java | 67 ++-- .../apache/tika/parser/RecursiveParserWrapper.java | 11 - .../org/apache/tika/MultiThreadedTikaTest.java | 3 +- .../org/apache/tika/example/ParsingExample.java | 2 +- .../src/test/resources/kafka/plugins-template.json | 1 - .../resources/opensearch/plugins-template.json | 1 - .../opensearch/tika-config-opensearch.json | 1 - .../src/test/resources/s3/plugins-template.json | 1 - .../src/test/resources/solr/plugins-template.json | 1 - .../org/apache/tika/parser/apple/PListParser.java | 10 +- .../parser/iwork/iwana/IWork13PackageParser.java | 2 +- .../org/apache/tika/parser/crypto/TSDParser.java | 2 +- .../parser/microsoft/AbstractPOIFSExtractor.java | 8 +- .../tika/parser/microsoft/HSLFExtractor.java | 2 +- .../apache/tika/parser/microsoft/OfficeParser.java | 2 +- .../microsoft/onenote/OneNoteTreeWalker.java | 2 +- .../microsoft/ooxml/AbstractOOXMLExtractor.java | 4 +- .../org/apache/tika/parser/epub/EpubParser.java | 2 +- .../parser/odf/FlatOpenDocumentMacroHandler.java | 5 +- .../tika/parser/odf/OpenDocumentBodyHandler.java | 5 +- .../tika/parser/odf/OpenDocumentMacroHandler.java | 3 +- .../tika/parser/RecursiveParserWrapperTest.java | 17 +- .../apache/tika/parser/image/JpegParserTest.java | 2 +- 
.../tika/parser/microsoft/rtf/RTFParserTest.java | 3 +- .../src/test/resources/configs/tika-4533.json | 3 - .../configs/tika-config-bc-digests-base32.json | 1 - .../configs/tika-config-bc-digests-basic.json | 1 - .../configs/tika-config-bc-digests-multiple.json | 1 - .../configs/tika-config-commons-digests-basic.json | 1 - .../configs/tika-config-digests-pdf-only.json | 1 - .../tika-config-digests-skip-container.json | 1 - .../resources/configs/tika-config-digests.json | 1 - ...a-config-doubling-custom-handler-decorator.json | 1 - .../resources/configs/tika-config-no-names.json | 1 - ...a-config-upcasing-custom-handler-decorator.json | 4 - .../resources/configs/tika-config-with-names.json | 1 - .../configs/tika-config-write-filter.json | 1 - .../src/main/resources/config-template.json | 1 - .../tika/pipes/core/server/ParseHandler.java | 1 - .../test/resources/configs/tika-config-basic.json | 2 - .../resources/configs/tika-config-passback.json | 2 - .../resources/configs/tika-config-truncate.json | 2 - .../resources/configs/tika-config-uppercasing.json | 1 - .../configs/tika-config-write-limiter.json | 2 - .../org/apache/tika/config/loader/TikaLoader.java | 27 +- .../java/org/apache/tika/config/AllLimitsTest.java | 156 +++++++++ .../test/resources/configs/TIKA-3695-exclude.json | 3 - .../test/resources/configs/TIKA-3695-fields.json | 3 - .../src/test/resources/configs/TIKA-3695.json | 3 - .../configs/TIKA-4207-embedded-bytes-config.json | 1 - .../test/resources/configs/all-limits-test.json | 32 ++ .../org/apache/tika/server/core/CXFTestBase.java | 1 - .../resources/configs/cxf-test-base-template.json | 1 - .../resources/configs/cxf-test-base-template.json | 1 - .../configs/tika-config-for-server-tests.json | 1 - .../tika-config-langdetect-opennlp-filter.json | 1 - .../tika-config-langdetect-optimaize-filter.json | 1 - 66 files changed, 681 insertions(+), 245 deletions(-) diff --git a/docs/modules/ROOT/pages/advanced/setting-limits.adoc 
b/docs/modules/ROOT/pages/advanced/setting-limits.adoc index db2bd2ae88..4dd02a4dd0 100644 --- a/docs/modules/ROOT/pages/advanced/setting-limits.adoc +++ b/docs/modules/ROOT/pages/advanced/setting-limits.adoc @@ -21,12 +21,275 @@ When processing untrusted documents, it's important to set limits on resource co to prevent denial-of-service attacks and protect against malicious or pathological files. Tika provides several mechanisms for limiting resource usage during parsing. -== Limiting Metadata Writes +== Overview -Malicious or pathological documents can contain enormous amounts of metadata, potentially -causing `OutOfMemoryError` or consuming excessive storage. The `MetadataWriteLimiter` -system allows you to constrain metadata size at write time, ensuring parsers cannot -exceed your configured limits. +Tika 4.x provides a unified configuration system for all limits through the `other-configs` +section of the JSON configuration file. All limits are loaded into the `ParseContext` and +flow through the parsing pipeline. + +=== Complete Example + +Here's a comprehensive example showing all limit configurations together. 
+This is the same configuration tested in `AllLimitsTest.java`: + +[source,json] +---- +{ + "parsers": ["default-parser"], + "other-configs": { + "embedded-limits": { + "maxDepth": 10, + "throwOnMaxDepth": false, + "maxCount": 1000, + "throwOnMaxCount": false + }, + "output-limits": { + "writeLimit": 100000, + "throwOnWriteLimit": false, + "maxXmlDepth": 100, + "maxPackageEntryDepth": 10, + "zipBombThreshold": 1000000, + "zipBombRatio": 100 + }, + "timeout-limits": { + "taskTimeoutMillis": 60000 + }, + "metadata-write-limiter-factory": { + "standard-metadata-limiter-factory": { + "maxTotalBytes": 1048576, + "maxFieldSize": 102400, + "maxKeySize": 1024, + "maxValuesPerField": 100 + } + } + } +} +---- + +Configuration file: `tika-serialization/src/test/resources/configs/all-limits-test.json` + +=== Loading Limits + +Use `TikaLoader.loadParseContext()` to load all configured limits into a `ParseContext`: + +[source,java] +---- +TikaLoader loader = TikaLoader.load(configPath); +ParseContext context = loader.loadParseContext(); + +// Limits are now available from the context +EmbeddedLimits embeddedLimits = context.get(EmbeddedLimits.class); +OutputLimits outputLimits = context.get(OutputLimits.class); +TimeoutLimits timeoutLimits = context.get(TimeoutLimits.class); +---- + +See test: `tika-serialization/src/test/java/org/apache/tika/config/AllLimitsTest.java` + +== Embedded Document Limits + +The `EmbeddedLimits` class controls how deeply nested and how many embedded documents +are processed. This is critical for protecting against "zip bomb" style attacks where +documents contain deeply nested or numerous embedded files. + +=== Configuration Options + +[cols="2,1,3"] +|=== +|Setting |Default |Description + +|`maxDepth` +|-1 (unlimited) +|Maximum nesting depth for embedded documents. When reached, recursion stops but +siblings at the current level continue to be processed. 
+ +|`throwOnMaxDepth` +|false +|Whether to throw an `EmbeddedLimitReachedException` when maxDepth is reached. +If false, embedded documents beyond the limit are skipped and the `TikaCoreProperties.EMBEDDED_DEPTH_LIMIT_REACHED` flag (key suffix `embedded_depth_limit_reached`) is set to `true` in the metadata. + +|`maxCount` +|-1 (unlimited) +|Maximum total number of embedded documents to process. When reached, no further +embedded documents are parsed. + +|`throwOnMaxCount` +|false +|Whether to throw an `EmbeddedLimitReachedException` when maxCount is reached. +If false, the remaining embedded documents are skipped and the `TikaCoreProperties.EMBEDDED_RESOURCE_LIMIT_REACHED` flag (key suffix `embedded_resource_limit_reached`) is set to `true` in the metadata. +|=== + +=== maxDepth Behavior + +When the depth limit is reached, recursion stops but siblings at the current level +continue to be processed. For example, with `maxDepth=1`: + +[source] +---- +container.zip (depth 0) +├── doc1.docx (depth 1) ✓ PARSED +│ ├── image1.png (depth 2) ✗ NOT PARSED (exceeds maxDepth) +│ └── embed.xlsx (depth 2) ✗ NOT PARSED (exceeds maxDepth) +├── doc2.pdf (depth 1) ✓ PARSED (sibling at same level) +└── doc3.txt (depth 1) ✓ PARSED (sibling at same level) +---- + +=== JSON Configuration + +[source,json] +---- +{ + "other-configs": { + "embedded-limits": { + "maxDepth": 5, + "throwOnMaxDepth": true, + "maxCount": 100, + "throwOnMaxCount": false + } + } +} +---- + +Configuration file: `tika-serialization/src/test/resources/configs/embedded-limits-test.json` + +=== Java API + +[source,java] +---- +// Create with constructor +EmbeddedLimits limits = new EmbeddedLimits(10, true, 500, false); + +// Or use setters +EmbeddedLimits limits = new EmbeddedLimits(); +limits.setMaxDepth(10); +limits.setThrowOnMaxDepth(true); +limits.setMaxCount(500); +limits.setThrowOnMaxCount(false); + +// Add to ParseContext +context.set(EmbeddedLimits.class, limits); + +// Helper method to get limits with defaults +EmbeddedLimits limits = EmbeddedLimits.get(context); // Returns defaults if not set +---- + +See test: `tika-serialization/src/test/java/org/apache/tika/config/EmbeddedLimitsTest.java` + +== Output Limits + +The `OutputLimits` class controls 
limits on parsing output including text extraction +and protection against zip bombs. + +=== Configuration Options + +[cols="2,1,3"] +|=== +|Setting |Default |Description + +|`writeLimit` +|-1 (unlimited) +|Maximum characters of text to extract. When reached, extraction stops. + +|`throwOnWriteLimit` +|false +|Whether to throw a `WriteLimitReachedException` when writeLimit is reached. + +|`maxXmlDepth` +|100 +|Maximum XML element nesting depth. Protects against XML bomb attacks. + +|`maxPackageEntryDepth` +|10 +|Maximum depth of nested package entries (e.g., zip within zip). + +|`zipBombThreshold` +|1,000,000 +|Minimum decompressed size (in bytes) before zip bomb detection activates. + +|`zipBombRatio` +|100 +|Maximum ratio of decompressed to compressed size before flagging as zip bomb. +|=== + +=== JSON Configuration + +[source,json] +---- +{ + "other-configs": { + "output-limits": { + "writeLimit": 50000, + "throwOnWriteLimit": true, + "maxXmlDepth": 50, + "maxPackageEntryDepth": 5, + "zipBombThreshold": 500000, + "zipBombRatio": 50 + } + } +} +---- + +Configuration file: `tika-serialization/src/test/resources/configs/output-limits-test.json` + +=== Java API + +[source,java] +---- +OutputLimits limits = new OutputLimits(50000, true, 50, 5, 500000, 50); +context.set(OutputLimits.class, limits); + +// Helper method +OutputLimits limits = OutputLimits.get(context); +---- + +See test: `tika-serialization/src/test/java/org/apache/tika/config/OutputLimitsTest.java` + +== Timeout Limits + +The `TimeoutLimits` class controls time-based limits for parsing operations. + +=== Configuration Options + +[cols="2,1,3"] +|=== +|Setting |Default |Description + +|`taskTimeoutMillis` +|60000 (1 minute) +|Maximum time in milliseconds for a parse operation to complete. 
+|=== + +=== JSON Configuration + +[source,json] +---- +{ + "other-configs": { + "timeout-limits": { + "taskTimeoutMillis": 120000 + } + } +} +---- + +Configuration file: `tika-serialization/src/test/resources/configs/timeout-limits-test.json` + +=== Java API + +[source,java] +---- +TimeoutLimits limits = new TimeoutLimits(120000); +context.set(TimeoutLimits.class, limits); + +// Helper method +TimeoutLimits limits = TimeoutLimits.get(context); +---- + +See test: `tika-serialization/src/test/java/org/apache/tika/config/TimeoutLimitsTest.java` + +== Metadata Limits + +The `MetadataWriteLimiter` system allows you to constrain metadata size at write time, +ensuring parsers cannot exceed your configured limits. === How It Works @@ -52,8 +315,6 @@ Metadata metadata = Metadata.newInstance(context); === Configuration Options -The `StandardMetadataLimiterFactory` provides the following settings: - [cols="2,1,3"] |=== |Setting |Default |Description @@ -93,8 +354,6 @@ Use this to extract only the metadata you need. === JSON Configuration -You can configure the metadata limiter in a Tika JSON configuration file: - [source,json] ---- { @@ -126,15 +385,6 @@ of `includeFields` or size limits: * `X-TIKA:WARN:*` - Warning metadata * Access permission fields -These fields still contribute to `maxTotalBytes` (except `X-TIKA:content`), but they -are never filtered out by `includeFields` or `excludeFields`. - -=== Size Calculation - -All sizes are estimated in UTF-16 bytes (2 bytes per character). This provides a -rough approximation of Java's in-memory String representation. The actual memory -usage may vary depending on JVM version and string interning. 
- === Detecting Truncation When metadata is truncated due to limits, Tika sets the metadata field @@ -148,72 +398,6 @@ if ("true".equals(metadata.get(TikaCoreProperties.TRUNCATED_METADATA))) { } ---- -=== Field Prioritization Strategy - -When working with limited budgets, consider using `includeFields` to prioritize -the metadata you actually need: - -[source,java] ----- -// Only capture essential fields -factory.setIncludeFields(Set.of( - "dc:title", - "dc:creator", - "dc:subject", - "Content-Type", - "Last-Modified" -)); ----- - -This approach is more efficient than setting a low `maxTotalBytes`, because -unwanted fields are filtered out immediately rather than consuming budget. - -=== Budget Considerations - -When setting `maxTotalBytes`, remember that system fields (Content-Type, -X-TIKA:Parsed-By, etc.) consume approximately 200-300 bytes before any -document-specific metadata is added. Set your budget accordingly: - -[source,java] ----- -// Allow ~300 bytes for system fields + 10KB for document metadata -factory.setMaxTotalBytes(300 + 10 * 1024); ----- - -== Other Limits - -=== Write Limits for Text Content - -To limit the amount of extracted text, use a `WriteLimit` with the content handler: - -[source,java] ----- -ContentHandler handler = new BodyContentHandler(100000); // 100KB limit ----- - -Or configure via `BasicContentHandlerFactory`: - -[source,java] ----- -BasicContentHandlerFactory factory = new BasicContentHandlerFactory( - BasicContentHandlerFactory.HANDLER_TYPE.TEXT, - 100000 // max characters -); ----- - -=== Parse Timeouts - -When using Tika Pipes, you can configure timeouts for parse operations: - -[source,json] ----- -{ - "pipes": { - "timeoutMillis": 60000 - } -} ----- - == Recommendations 1. 
**Always set limits** when processing untrusted content diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index 8e87b0ca7d..7179918182 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -83,7 +83,6 @@ import org.apache.tika.io.TikaInputStream; import org.apache.tika.language.detect.LanguageHandler; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; -import org.apache.tika.metadata.writefilter.MetadataWriteLimiterFactory; import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.mime.MimeType; @@ -557,7 +556,7 @@ public class TikaCLI { private void handleRecursiveJson(URL url, OutputStream output) throws IOException, SAXException, TikaException { Metadata metadata = Metadata.newInstance(context); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); - RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(getContentHandlerFactory(type), -1); + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(getContentHandlerFactory(type)); try (TikaInputStream tis = TikaInputStream.get(url, metadata)) { wrapper.parse(tis, handler, metadata, context); } @@ -735,23 +734,19 @@ public class TikaCLI { } else { parser = tikaLoader.loadAutoDetectParser(); } - // Set DigesterFactory in ParseContext if configured via --digest= + + // Load configs from tika-config.json and merge into existing context + // (preserves EmbeddedDocumentExtractor and other items set before configure()) + ParseContext loadedContext = tikaLoader.loadParseContext(); + context.copyFrom(loadedContext); + + // Override DigesterFactory in ParseContext if configured via --digest= command line if (digesterFactory != null) { context.set(DigesterFactory.class, digesterFactory); } detector = tikaLoader.loadDetectors(); 
context.set(Parser.class, parser); context.set(PasswordProvider.class, new SimplePasswordProvider(password)); - - // Load default MetadataWriteLimiterFactory if configured - try { - MetadataWriteLimiterFactory factory = tikaLoader.configs().load(MetadataWriteLimiterFactory.class); - if (factory != null) { - context.set(MetadataWriteLimiterFactory.class, factory); - } - } catch (Exception e) { - LOG.warn("Failed to load MetadataWriteLimiterFactory", e); - } } private void displayMetModels() { diff --git a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java index b065c19c14..bc04a208f2 100644 --- a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java +++ b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java @@ -380,7 +380,7 @@ public class TikaGUI extends JFrame implements ActionListener, HyperlinkListener } if (isReset) { RecursiveParserWrapperHandler recursiveParserWrapperHandler = - new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1), -1); + new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1)); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); ParseContext rpwContext = new ParseContext(); wrapper.parse(tis, recursiveParserWrapperHandler, Metadata.newInstance(rpwContext), rpwContext); diff --git a/tika-app/src/test/resources/configs/config-template.json b/tika-app/src/test/resources/configs/config-template.json index e25bc96833..62940e6711 100644 --- a/tika-app/src/test/resources/configs/config-template.json +++ b/tika-app/src/test/resources/configs/config-template.json @@ -3,7 +3,6 @@ "basic-content-handler-factory": { "type": "TEXT", "writeLimit": -1, - "maxEmbeddedResources": -1, "throwOnWriteLimitReached": true } }, diff --git a/tika-core/src/main/java/org/apache/tika/exception/EmbeddedLimitReachedException.java 
b/tika-core/src/main/java/org/apache/tika/exception/EmbeddedLimitReachedException.java new file mode 100644 index 0000000000..40f571db71 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/exception/EmbeddedLimitReachedException.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.exception; + +/** + * Runtime exception thrown when an embedded document limit is reached + * and the configuration specifies that parsing should stop with an exception. + * <p> + * This is a runtime exception to avoid polluting the {@code EmbeddedDocumentExtractor} + * interface with checked exceptions, since most implementations don't need limit checking. 
+ * + * @since Apache Tika 3.2 + */ +public class EmbeddedLimitReachedException extends RuntimeException { + + public enum LimitType { + MAX_DEPTH, + MAX_COUNT + } + + private final LimitType limitType; + private final int limit; + + public EmbeddedLimitReachedException(LimitType limitType, int limit) { + super(buildMessage(limitType, limit)); + this.limitType = limitType; + this.limit = limit; + } + + private static String buildMessage(LimitType limitType, int limit) { + switch (limitType) { + case MAX_DEPTH: + return "Max embedded depth reached: " + limit; + case MAX_COUNT: + return "Max embedded count reached: " + limit; + default: + return "Embedded limit reached: " + limit; + } + } + + public LimitType getLimitType() { + return limitType; + } + + public int getLimit() { + return limit; + } +} diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java index 46d16919ba..e244900037 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java @@ -26,6 +26,15 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; public interface EmbeddedDocumentExtractor { + /** + * Determines whether the given embedded document should be parsed. + * <p> + * Note: Implementations may throw {@link org.apache.tika.exception.EmbeddedLimitReachedException} + * (a RuntimeException) if a limit is exceeded and throwing is configured. 
+ * + * @param metadata the metadata for the embedded document + * @return true if the embedded document should be parsed + */ boolean shouldParseEmbedded(Metadata metadata); /** diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java index 4b1e406183..402b4f6381 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java @@ -27,6 +27,7 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; import org.apache.tika.exception.CorruptedFileException; +import org.apache.tika.exception.EmbeddedLimitReachedException; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; @@ -63,7 +64,7 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract public boolean shouldParseEmbedded(Metadata metadata) { // Check ParseRecord for depth/count limits first ParseRecord parseRecord = context.get(ParseRecord.class); - if (parseRecord != null && !parseRecord.shouldParseEmbedded()) { + if (parseRecord != null && !checkEmbeddedLimits(parseRecord)) { return false; } @@ -85,6 +86,52 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract return true; } + /** + * Checks embedded document limits from ParseRecord. + * <p> + * If throwOnMaxDepth or throwOnMaxCount is configured and the respective limit is hit, + * an EmbeddedLimitReachedException is thrown. Otherwise, returns false and sets the + * appropriate limit flag on the ParseRecord. + * <p> + * Note: The count limit is a hard stop (once hit, no more embedded docs are parsed). + * The depth limit only affects documents at that depth - sibling documents at + * shallower depths will still be parsed. 
+ * + * @param parseRecord the parse record to check + * @return true if the embedded document should be parsed, false if limits are exceeded + * @throws EmbeddedLimitReachedException if a limit is exceeded and throwing is configured + */ + private boolean checkEmbeddedLimits(ParseRecord parseRecord) { + // Count limit is a hard stop - once we've hit max, no more embedded parsing + if (parseRecord.isEmbeddedCountLimitReached()) { + return false; + } + int maxCount = parseRecord.getMaxEmbeddedCount(); + if (maxCount >= 0 && parseRecord.getEmbeddedCount() >= maxCount) { + parseRecord.setEmbeddedCountLimitReached(true); + if (parseRecord.isThrowOnMaxCount()) { + throw new EmbeddedLimitReachedException( + EmbeddedLimitReachedException.LimitType.MAX_COUNT, maxCount); + } + return false; + } + + // Depth limit only applies to current depth - siblings at shallower levels + // can still be parsed. The flag is set for reporting purposes. + // depth is 1-indexed (main doc is depth 1), so embedded depth limit of N + // means we allow parsing up to depth N+1 + int maxDepth = parseRecord.getMaxEmbeddedDepth(); + if (maxDepth >= 0 && parseRecord.getDepth() > maxDepth) { + parseRecord.setEmbeddedDepthLimitReached(true); + if (parseRecord.isThrowOnMaxDepth()) { + throw new EmbeddedLimitReachedException( + EmbeddedLimitReachedException.LimitType.MAX_DEPTH, maxDepth); + } + return false; + } + return true; + } + @Override public void parseEmbedded( TikaInputStream tis, ContentHandler handler, Metadata metadata, ParseContext parseContext, boolean outputHtml) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java index cc712543b5..ef3575f6e3 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java @@ -134,6 +134,13 @@ public interface TikaCoreProperties { Property 
WRITE_LIMIT_REACHED = Property.internalBoolean(TIKA_META_EXCEPTION_PREFIX + "write_limit_reached"); + + Property EMBEDDED_RESOURCE_LIMIT_REACHED = + Property.internalBoolean(TIKA_META_EXCEPTION_PREFIX + "embedded_resource_limit_reached"); + + Property EMBEDDED_DEPTH_LIMIT_REACHED = + Property.internalBoolean(TIKA_META_EXCEPTION_PREFIX + "embedded_depth_limit_reached"); + /** * Use this to store exceptions caught during a parse that are * non-fatal, e.g. if a parser is in lenient mode and more diff --git a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java index 8a7dd6ac7c..4832da01c4 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java @@ -335,6 +335,12 @@ public class CompositeParser implements Parser { if (record.isWriteLimitReached()) { metadata.set(TikaCoreProperties.WRITE_LIMIT_REACHED, true); } + if (record.isEmbeddedCountLimitReached()) { + metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_LIMIT_REACHED, true); + } + if (record.isEmbeddedDepthLimitReached()) { + metadata.set(TikaCoreProperties.EMBEDDED_DEPTH_LIMIT_REACHED, true); + } for (Metadata m : record.getMetadataList()) { for (String n : m.names()) { diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java b/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java index daa427e948..2cf4218c97 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java @@ -22,7 +22,6 @@ import java.util.List; import java.util.Set; import org.apache.tika.config.EmbeddedLimits; -import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; /** @@ -147,45 +146,39 @@ public class ParseRecord { } /** - * Checks whether an embedded document should be parsed based on configured limits. 
- * This should be called before parsing each embedded document. - * <p> - * If throwOnMaxDepth or throwOnMaxCount is true and the respective limit is hit, - * a TikaException is thrown. Otherwise, returns false and sets the appropriate - * limit flag. - * <p> - * Note: The count limit is a hard stop (once hit, no more embedded docs are parsed). - * The depth limit only affects documents at that depth - sibling documents at - * shallower depths will still be parsed. + * Returns whether throwing is configured when max depth is reached. * - * @return true if the embedded document should be parsed, false if limits are exceeded - * @throws TikaException if a limit is exceeded and throwing is configured + * @return true if an exception should be thrown on max depth */ - public boolean shouldParseEmbedded() throws TikaException { - // Count limit is a hard stop - once we've hit max, no more embedded parsing - if (embeddedCountLimitReached) { - return false; - } - if (maxEmbeddedCount >= 0 && embeddedCount >= maxEmbeddedCount) { - embeddedCountLimitReached = true; - if (throwOnMaxCount) { - throw new TikaException("Max embedded count reached: " + maxEmbeddedCount); - } - return false; - } + public boolean isThrowOnMaxDepth() { + return throwOnMaxDepth; + } - // Depth limit only applies to current depth - siblings at shallower levels - // can still be parsed. The flag is set for reporting purposes. - // depth is 1-indexed (main doc is depth 1), so embedded depth limit of N - // means we allow parsing up to depth N+1 - if (maxEmbeddedDepth >= 0 && depth > maxEmbeddedDepth) { - embeddedDepthLimitReached = true; - if (throwOnMaxDepth) { - throw new TikaException("Max embedded depth reached: " + maxEmbeddedDepth); - } - return false; - } - return true; + /** + * Returns whether throwing is configured when max count is reached. 
+ * + * @return true if an exception should be thrown on max count + */ + public boolean isThrowOnMaxCount() { + return throwOnMaxCount; + } + + /** + * Sets the flag indicating the embedded depth limit was reached. + * + * @param embeddedDepthLimitReached true if depth limit was reached + */ + public void setEmbeddedDepthLimitReached(boolean embeddedDepthLimitReached) { + this.embeddedDepthLimitReached = embeddedDepthLimitReached; + } + + /** + * Sets the flag indicating the embedded count limit was reached. + * + * @param embeddedCountLimitReached true if count limit was reached + */ + public void setEmbeddedCountLimitReached(boolean embeddedCountLimitReached) { + this.embeddedCountLimitReached = embeddedCountLimitReached; } /** diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java index b714b3715a..22faaf86e4 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java +++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java @@ -225,17 +225,6 @@ public class RecursiveParserWrapper extends ParserDecorator { public void parse(TikaInputStream tis, ContentHandler ignore, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { - // Check ParseRecord limits (configured from EmbeddedLimits) - ParseRecord parseRecord = context.get(ParseRecord.class); - if (parseRecord != null && !parseRecord.shouldParseEmbedded()) { - return; - } - - // Increment embedded count in ParseRecord - if (parseRecord != null) { - parseRecord.incrementEmbeddedCount(); - } - // Work out what this thing is String objectName = getResourceName(metadata, parserState.unknownCount); String objectLocation = this.location + objectName; diff --git a/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java b/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java index 289dc6ddf6..bc48075dc2 
100644 --- a/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java @@ -120,8 +120,7 @@ public class MultiThreadedTikaTest extends TikaTest { //content's metadata and they'll differ by file. parseContext = new ParseContext(); RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), - -1); + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1)); parser.parse(tis, handler, new Metadata(), parseContext); return handler.getMetadataList(); } diff --git a/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java b/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java index 78239c3405..9f008b3d0a 100644 --- a/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java +++ b/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java @@ -167,7 +167,7 @@ public class ParsingExample { Metadata metadata = new Metadata(); metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test_recursive_embedded.docx"); ParseContext context = new ParseContext(); - RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(factory, -1); + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(factory); try (TikaInputStream tis = TikaInputStream.get(ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx"))) { wrapper.parse(tis, handler, metadata, context); } diff --git a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/kafka/plugins-template.json b/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/kafka/plugins-template.json index 128a1a8b44..a58ee6b824 100644 --- a/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/kafka/plugins-template.json +++ 
b/tika-integration-tests/tika-pipes-kafka-integration-tests/src/test/resources/kafka/plugins-template.json @@ -3,7 +3,6 @@ "basic-content-handler-factory": { "type": "TEXT", "writeLimit": -1, - "maxEmbeddedResources": -1, "throwOnWriteLimitReached": true } }, diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/plugins-template.json b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/plugins-template.json index 2b4f98f92e..0c601b66a0 100644 --- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/plugins-template.json +++ b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/plugins-template.json @@ -3,7 +3,6 @@ "basic-content-handler-factory": { "type": "TEXT", "writeLimit": -1, - "maxEmbeddedResources": -1, "throwOnWriteLimitReached": true } }, diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.json b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.json index 172a0c1c0e..afad751349 100644 --- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.json +++ b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.json @@ -3,7 +3,6 @@ "basic-content-handler-factory": { "type": "TEXT", "writeLimit": -1, - "maxEmbeddedResources": -1, "throwOnWriteLimitReached": true } }, diff --git a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/plugins-template.json b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/plugins-template.json index 816d5c49e5..b7bce1b100 100644 --- a/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/plugins-template.json +++ 
b/tika-integration-tests/tika-pipes-s3-integration-tests/src/test/resources/s3/plugins-template.json @@ -3,7 +3,6 @@ "basic-content-handler-factory": { "type": "TEXT", "writeLimit": -1, - "maxEmbeddedResources": -1, "throwOnWriteLimitReached": true } }, diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/solr/plugins-template.json b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/solr/plugins-template.json index 63cf5d73b5..4f8f0e8719 100644 --- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/solr/plugins-template.json +++ b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/solr/plugins-template.json @@ -3,7 +3,6 @@ "basic-content-handler-factory": { "type": "TEXT", "writeLimit": -1, - "maxEmbeddedResources": -1, "throwOnWriteLimitReached": true } }, diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/PListParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/PListParser.java index 76a12a52d1..37c529e860 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/PListParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/PListParser.java @@ -128,7 +128,7 @@ public class PListParser implements Parser { xhtml.endDocument(); } - private void parseObject(NSObject obj, State state) throws SAXException, IOException { + private void parseObject(NSObject obj, State state) throws SAXException, IOException, TikaException { if (obj instanceof NSDictionary) { parseDict((NSDictionary) obj, state); @@ -170,7 +170,7 @@ public class PListParser implements Parser { } } - private void parseSet(NSSet 
obj, State state) throws SAXException, IOException { + private void parseSet(NSSet obj, State state) throws SAXException, IOException, TikaException { state.xhtml.startElement(SET); for (NSObject child : obj.allObjects()) { parseObject(child, state); @@ -178,7 +178,7 @@ public class PListParser implements Parser { state.xhtml.endElement(SET); } - private void parseDict(NSDictionary obj, State state) throws SAXException, IOException { + private void parseDict(NSDictionary obj, State state) throws SAXException, IOException, TikaException { state.xhtml.startElement(DICT); for (Map.Entry<String, NSObject> mapEntry : obj.getHashMap().entrySet()) { String key = mapEntry.getKey(); @@ -189,9 +189,9 @@ public class PListParser implements Parser { state.xhtml.endElement(DICT); } - private void handleData(NSData value, State state) throws IOException, SAXException { + private void handleData(NSData value, State state) throws IOException, SAXException, TikaException { state.xhtml.characters(value.getBase64EncodedData()); - Metadata embeddedMetadata = state.Metadata.newInstance(parseContext); + Metadata embeddedMetadata = Metadata.newInstance(state.parseContext); if (!state.embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { return; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java index abb1ebfbd4..59f1d500d6 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java @@ -207,7 +207,7 @@ 
public class IWork13PackageParser implements Parser { private void handleEmbedded(TikaInputStream tis, Metadata embeddedMetadata, XHTMLContentHandler xhtml, EmbeddedDocumentExtractor embeddedDocumentExtractor) - throws IOException, SAXException { + throws IOException, SAXException, TikaException { if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { embeddedDocumentExtractor.parseEmbedded(tis, xhtml, embeddedMetadata, new ParseContext(), true); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java index 8751772cea..83ce3698a2 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java @@ -168,7 +168,7 @@ public class TSDParser implements Parser { } private void parseTSDContent(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws SAXException { + ParseContext context) throws SAXException, TikaException { CMSTimeStampedDataParser cmsTimeStampedDataParser = null; EmbeddedDocumentExtractor edx = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java index 9dbad3ef26..23d4445606 100644 --- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java @@ -274,7 +274,7 @@ abstract class AbstractPOIFSExtractor { private void handleCompObj(DirectoryEntry parentDir, POIFSDocumentType type, String rName, Metadata metadata, XHTMLContentHandler xhtml, boolean outputHtml) - throws IOException, SAXException { + throws IOException, SAXException, TikaException { //TODO: figure out if the equivalent of OLE 1.0's //getCommand() and getFileName() exist for OLE 2.0 to populate //TikaCoreProperties.ORIGINAL_RESOURCE_NAME @@ -349,7 +349,7 @@ abstract class AbstractPOIFSExtractor { private void handleOLENative(DirectoryEntry dir, POIFSDocumentType type, String rName, Metadata metadata, XHTMLContentHandler xhtml, boolean outputHtml) - throws IOException, SAXException { + throws IOException, SAXException, TikaException { byte[] data = null; try { // Try to un-wrap the OLE10Native record: @@ -385,7 +385,7 @@ abstract class AbstractPOIFSExtractor { private void parseEmbedded(DirectoryEntry parentDir, TikaInputStream tis, XHTMLContentHandler xhtml, Metadata metadata, boolean outputHtml) throws IOException, - SAXException { + SAXException, TikaException { if (!embeddedDocumentUtil.shouldParseEmbedded(metadata)) { return; } @@ -398,7 +398,7 @@ abstract class AbstractPOIFSExtractor { private void parseEmbedded(DirectoryEntry dir, XHTMLContentHandler xhtml, Metadata metadata, boolean outputHtml) - throws IOException, SAXException { + throws IOException, SAXException, TikaException { if (!embeddedDocumentUtil.shouldParseEmbedded(metadata)) { return; } diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java index b0982c23f3..aa8c8a9682 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java @@ -406,7 +406,7 @@ public class HSLFExtractor extends AbstractPOIFSExtractor { try { OfficeParser.extractMacros(poifsFileSystem, xhtml, EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context), context); - } catch (IOException | SAXException inner) { + } catch (IOException | SAXException | TikaException inner) { EmbeddedDocumentUtil.recordException(inner, parentMetadata); } } catch (IOException e) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java index 5d028702ae..212ea14f8c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java @@ -113,7 +113,7 @@ public class OfficeParser extends AbstractOfficeParser { public static void extractMacros(POIFSFileSystem fs, ContentHandler xhtml, EmbeddedDocumentExtractor 
embeddedDocumentExtractor, ParseContext context) - throws IOException, SAXException { + throws IOException, SAXException, TikaException { VBAMacroReader reader = null; Map<String, String> macros = null; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java index 64df532e2b..c470d697b9 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java @@ -353,7 +353,7 @@ class OneNoteTreeWalker { EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata); return; } - Metadata embeddedMetadata = this.Metadata.newInstance(parseContext); + Metadata embeddedMetadata = Metadata.newInstance(this.parseContext); try { AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "embedded"); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java index 735d3968a5..f77a40e833 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java +++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java @@ -330,7 +330,7 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { private void handleEmbeddedOLE(PackagePart part, XHTMLContentHandler xhtml, String rel, Metadata parentMetadata, EmbeddedPartMetadata embeddedPartMetadata) throws IOException, - SAXException { + SAXException, TikaException { // A POIFSFileSystem needs to be at least 3 blocks big to be valid if (part.getSize() >= 0 && part.getSize() < 512 * 3) { // Too small, skip @@ -453,7 +453,7 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { String rel, EmbeddedPartMetadata embeddedPartMetadata, TikaCoreProperties.EmbeddedResourceType embeddedResourceType) - throws SAXException, IOException { + throws SAXException, IOException, TikaException { Metadata metadata = Metadata.newInstance(context); metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, rel); metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java index 74259cd6e9..cb53aff7ca 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java @@ -356,7 +356,7 @@ public class EpubParser implements Parser { EmbeddedDocumentExtractor embeddedDocumentExtractor, XHTMLContentHandler xhtml, Metadata parentMetadata, ParseContext context) - throws IOException, 
SAXException { + throws IOException, SAXException, TikaException { if (hRefMediaPair.href == null) { return; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java index 91888be07b..ac06f1d34f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java @@ -24,6 +24,7 @@ import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; +import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; @@ -77,7 +78,7 @@ class FlatOpenDocumentMacroHandler extends ContentHandlerDecorator { if (SOURCE_CODE.equals(localName)) { try { handleMacro(); - } catch (IOException e) { + } catch (IOException | TikaException e) { throw new SAXException(e); } finally { resetMacroState(); @@ -91,7 +92,7 @@ class FlatOpenDocumentMacroHandler extends ContentHandlerDecorator { inMacro = false; } - protected void handleMacro() throws IOException, SAXException { + protected void handleMacro() throws IOException, SAXException, TikaException { byte[] bytes = macroBuffer.toString().getBytes(StandardCharsets.UTF_8); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java index 1e06808cb0..b001f09f85 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java @@ -31,6 +31,7 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; +import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; @@ -449,7 +450,7 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler { inBinaryData = false; try { processBinaryData(); - } catch (IOException e) { + } catch (IOException | TikaException e) { throw new SAXException(e); } return; @@ -512,7 +513,7 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler { assert nodeDepth >= 0; } - private void processBinaryData() throws IOException, SAXException { + private void processBinaryData() throws IOException, SAXException, TikaException { //TODO: figure out whether we're in an inline image or a regular //attachment and add that info to the embedded metadata diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMacroHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMacroHandler.java index a5a559873a..bfb23beefc 100644 --- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMacroHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMacroHandler.java @@ -22,6 +22,7 @@ import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; +import org.apache.tika.exception.TikaException; import org.apache.tika.parser.ParseContext; import org.apache.tika.utils.XMLReaderUtils; @@ -49,7 +50,7 @@ class OpenDocumentMacroHandler extends FlatOpenDocumentMacroHandler { if (MODULE.equals(localName)) { try { handleMacro(); - } catch (IOException e) { + } catch (IOException | TikaException e) { throw new SAXException(e); } finally { //this shouldn't be necessary in the compressed odf files diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java index efa124e8f0..437238835d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java @@ -36,6 +36,7 @@ import org.junit.jupiter.api.Test; import org.apache.tika.TikaLoaderHelper; import org.apache.tika.TikaTest; +import org.apache.tika.config.EmbeddedLimits; import org.apache.tika.config.loader.TikaLoader; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; @@ -251,13 +252,16 @@ public class RecursiveParserWrapperTest extends TikaTest { assertNull(limitReached); } - //test setting value + //test setting value via EmbeddedLimits 
metadata = new Metadata(); + ParseContext limitContext = new ParseContext(); + EmbeddedLimits limits = new EmbeddedLimits(); + limits.setMaxCount(maxEmbedded); + limitContext.set(EmbeddedLimits.class, limits); try (TikaInputStream tis = getResourceAsStream("/test-documents/test_recursive_embedded.docx")) { RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), - maxEmbedded); - wrapper.parse(tis, handler, metadata, context); + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1)); + wrapper.parse(tis, handler, metadata, limitContext); List<Metadata> list = handler.getMetadataList(); //add 1 for outer container file assertEquals(maxEmbedded + 1, list.size()); @@ -267,12 +271,11 @@ public class RecursiveParserWrapperTest extends TikaTest { assertEquals("true", limitReached); } - //test setting value < 0 + //test setting value < 0 (unlimited) metadata = new Metadata(); try (TikaInputStream tis = getResourceAsStream("/test-documents/test_recursive_embedded.docx")) { RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), - -2); + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1)); wrapper.parse(tis, handler, metadata, context); List<Metadata> list = handler.getMetadataList(); assertEquals(totalNoLimit, list.size()); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/image/JpegParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/image/JpegParserTest.java index 79ea679d1a..24922e8420 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/image/JpegParserTest.java +++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/image/JpegParserTest.java @@ -45,7 +45,7 @@ public class JpegParserTest extends TikaTest { RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p); RecursiveParserWrapperHandler handler = - new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1), 1000); + new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); try (InputStream is = getResourceAsStream("/test-documents/testJPEG_GEO_2.jpg")) { wrapper.parse(TikaInputStream.get(is), handler, new Metadata(), new ParseContext()); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java index 986bb6b5cc..de78b97bd7 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java @@ -109,8 +109,7 @@ public class RTFParserTest extends TikaTest { ParseContext ctx = new ParseContext(); RecursiveParserWrapper parser = new RecursiveParserWrapper(AUTO_DETECT_PARSER); RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1), - -1); + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1)); Metadata rootMetadata = new Metadata(); rootMetadata.add(TikaCoreProperties.RESOURCE_NAME_KEY, "testRTFRegularImages.rtf"); try (TikaInputStream tis = TikaInputStream diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json index 76416f19d7..f87df79434 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.json @@ -1,8 +1,5 @@ { "auto-detect-parser": { - "maximumCompressionRatio": 100, - "maximumDepth": 100, - "maximumPackageEntryDepth": 100, "throwOnZeroBytes": false }, "other-configs": { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json index f9e04fe037..25ffe85de7 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-base32.json @@ -1,6 +1,5 @@ { "auto-detect-parser": { - "outputThreshold": 1000000, "throwOnZeroBytes": false }, "other-configs": { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json index 8d4a9db55f..9098c8607f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-basic.json @@ -1,6 +1,5 @@ { "auto-detect-parser": { - "outputThreshold": 1000000, 
"throwOnZeroBytes": false }, "other-configs": { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json index d8dcaba9a3..c8d4c29aa5 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-bc-digests-multiple.json @@ -1,6 +1,5 @@ { "auto-detect-parser": { - "outputThreshold": 1000000, "throwOnZeroBytes": false }, "other-configs": { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json index 7256297b30..b0ae0326e9 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-commons-digests-basic.json @@ -1,6 +1,5 @@ { "auto-detect-parser": { - "outputThreshold": 1000000, "throwOnZeroBytes": false }, "other-configs": { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json index 4dc5242e60..b14d19509b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json +++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json @@ -9,7 +9,6 @@ } ], "auto-detect-parser": { - "outputThreshold": 1000000, "throwOnZeroBytes": false }, "other-configs": { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json index c6676b29be..23186720bc 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.json @@ -1,6 +1,5 @@ { "auto-detect-parser": { - "outputThreshold": 1000000, "throwOnZeroBytes": false }, "other-configs": { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json index 360b4f5170..0671621fc0 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.json @@ -1,6 +1,5 @@ { "auto-detect-parser": { - "outputThreshold": 1000000, "throwOnZeroBytes": false }, "other-configs": { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-doubling-custom-handler-decorator.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-doubling-custom-handler-decorator.json index c721b2df1a..b6b31e7910 100644 --- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-doubling-custom-handler-decorator.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-doubling-custom-handler-decorator.json @@ -1,6 +1,5 @@ { "auto-detect-parser": { - "outputThreshold": 1000, "contentHandlerDecoratorFactory": "doubling-content-handler-decorator-factory" } } \ No newline at end of file diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json index b56a7d5d2d..9687287ca5 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json @@ -1,6 +1,5 @@ { "auto-detect-parser": { - "outputThreshold": 678900, "embeddedDocumentExtractorFactory": { "runpack-extractor-factory": { "writeFileNameToContent": false diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json index 28c5763f0b..39ddeff844 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json @@ -1,9 +1,5 @@ { "auto-detect-parser": { - "outputThreshold": 1000, - "maximumCompressionRatio": 0.8, - "maximumDepth": 1000, - "maximumPackageEntryDepth": 1000, 
"embeddedDocumentExtractorFactory": { "runpack-extractor-factory": { "writeFileNameToContent": true, diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json index 17811c8dec..abea0d901e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json @@ -1,6 +1,5 @@ { "auto-detect-parser": { - "outputThreshold": 678900, "embeddedDocumentExtractorFactory": { "runpack-extractor-factory": { "writeFileNameToContent": true diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json index 6a1e6a925a..3179f5aceb 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json @@ -1,6 +1,5 @@ { "auto-detect-parser": { - "outputThreshold": 1000000, "throwOnZeroBytes": false }, "other-configs": { diff --git a/tika-pipes/tika-async-cli/src/main/resources/config-template.json b/tika-pipes/tika-async-cli/src/main/resources/config-template.json index d4c70d5d73..ee1efd49dc 100644 --- a/tika-pipes/tika-async-cli/src/main/resources/config-template.json +++ b/tika-pipes/tika-async-cli/src/main/resources/config-template.json @@ -3,7 +3,6 @@ "basic-content-handler-factory": { "type": "TEXT", "writeLimit": -1, - "maxEmbeddedResources": -1, "throwOnWriteLimitReached": true } }, diff --git 
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java index 7cfe5c439f..e56132f268 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java @@ -49,7 +49,6 @@ import org.apache.tika.pipes.api.FetchEmitTuple; import org.apache.tika.pipes.api.ParseMode; import org.apache.tika.pipes.core.extractor.EmbeddedDocumentBytesConfig; import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; -import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.ContentHandlerFactory; import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.apache.tika.utils.ExceptionUtils; diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json index 98573b46fb..fd6bfa852c 100644 --- a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json +++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-basic.json @@ -3,7 +3,6 @@ "basic-content-handler-factory": { "type": "TEXT", "writeLimit": -1, - "maxEmbeddedResources": -1, "throwOnWriteLimitReached": true } }, @@ -45,7 +44,6 @@ } }, "auto-detect-parser": { - "outputThreshold": 1000000, "throwOnZeroBytes": false }, "other-configs": { diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json index 5308be9a1c..c55fd2a026 100644 --- a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json +++ 
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-passback.json @@ -3,7 +3,6 @@ "basic-content-handler-factory": { "type": "TEXT", "writeLimit": -1, - "maxEmbeddedResources": -1, "throwOnWriteLimitReached": true } }, @@ -44,7 +43,6 @@ } }, "auto-detect-parser": { - "outputThreshold": 1000000, "throwOnZeroBytes": false }, "other-configs": { diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json index f8d5d3464b..50d9875b25 100644 --- a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json +++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json @@ -3,7 +3,6 @@ "basic-content-handler-factory": { "type": "TEXT", "writeLimit": -1, - "maxEmbeddedResources": -1, "throwOnWriteLimitReached": true } }, @@ -45,7 +44,6 @@ } }, "auto-detect-parser": { - "outputThreshold": 1000000, "embeddedDocumentExtractorFactory": { "runpack-extractor-factory": { "writeFileNameToContent": false, diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json index c9189c2ae2..a7549f9385 100644 --- a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json +++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-uppercasing.json @@ -40,7 +40,6 @@ } }, "auto-detect-parser": { - "outputThreshold": 1000000, "throwOnZeroBytes": false }, "other-configs": { diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-write-limiter.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-write-limiter.json index c8dfbacc7c..0e2a8e85ab 100644 --- 
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-write-limiter.json +++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-write-limiter.json @@ -3,7 +3,6 @@ "basic-content-handler-factory": { "type": "TEXT", "writeLimit": -1, - "maxEmbeddedResources": -1, "throwOnWriteLimitReached": true } }, @@ -45,7 +44,6 @@ } }, "auto-detect-parser": { - "outputThreshold": 1000000, "throwOnZeroBytes": false }, "other-configs": { diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java index 55f6ff0993..2c5960ed25 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java @@ -31,7 +31,10 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.ObjectNode; +import org.apache.tika.config.EmbeddedLimits; import org.apache.tika.config.GlobalSettings; +import org.apache.tika.config.OutputLimits; +import org.apache.tika.config.TimeoutLimits; import org.apache.tika.detect.CompositeDetector; import org.apache.tika.detect.CompositeEncodingDetector; import org.apache.tika.detect.Detector; @@ -395,6 +398,7 @@ public class TikaLoader { * TikaLoader loader = TikaLoader.load(configPath); * Parser parser = loader.loadAutoDetectParser(); * ParseContext context = loader.loadParseContext(); + * Metadata metadata = Metadata.newInstance(context); * parser.parse(stream, handler, metadata, context); * </pre> * @@ -403,20 +407,19 @@ public class TikaLoader { */ public ParseContext loadParseContext() throws TikaConfigException { ParseContext context = new ParseContext(); + loadOne(DigesterFactory.class, context); + loadOne(MetadataWriteLimiterFactory.class, context); + loadOne(EmbeddedLimits.class, context); + 
loadOne(OutputLimits.class, context); + loadOne(TimeoutLimits.class, context); + return context; + } - // Load DigesterFactory from other-configs if present - DigesterFactory digesterFactory = configs().load("digester-factory", DigesterFactory.class); - if (digesterFactory != null) { - context.set(DigesterFactory.class, digesterFactory); - } - - // Load MetadataWriteLimiterFactory from other-configs if present - MetadataWriteLimiterFactory metadataWriteLimiterFactory = configs().load(MetadataWriteLimiterFactory.class); - if (metadataWriteLimiterFactory != null) { - context.set(MetadataWriteLimiterFactory.class, metadataWriteLimiterFactory); + private <T> void loadOne(Class<T> clazz, ParseContext context) throws TikaConfigException { + T instance = configs().load(clazz); + if (instance != null) { + context.set(clazz, instance); } - - return context; } /** diff --git a/tika-serialization/src/test/java/org/apache/tika/config/AllLimitsTest.java b/tika-serialization/src/test/java/org/apache/tika/config/AllLimitsTest.java new file mode 100644 index 0000000000..6ab03f828a --- /dev/null +++ b/tika-serialization/src/test/java/org/apache/tika/config/AllLimitsTest.java @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.config; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import org.junit.jupiter.api.Test; + +import org.apache.tika.TikaTest; +import org.apache.tika.config.loader.TikaLoader; +import org.apache.tika.metadata.writefilter.MetadataWriteLimiterFactory; +import org.apache.tika.parser.ParseContext; + +/** + * Tests loading all limit configurations from a single tika-config.json file. + * <p> + * This test demonstrates how to configure all limits in one place using + * the "other-configs" section of the JSON configuration. + * <p> + * Configuration file: configs/all-limits-test.json + * <pre> + * { + * "parsers": ["default-parser"], + * "other-configs": { + * "embedded-limits": { + * "maxDepth": 10, + * "throwOnMaxDepth": false, + * "maxCount": 1000, + * "throwOnMaxCount": false + * }, + * "output-limits": { + * "writeLimit": 100000, + * "throwOnWriteLimit": false, + * "maxXmlDepth": 100, + * "maxPackageEntryDepth": 10, + * "zipBombThreshold": 1000000, + * "zipBombRatio": 100 + * }, + * "timeout-limits": { + * "taskTimeoutMillis": 60000 + * }, + * "metadata-write-limiter-factory": { + * "standard-metadata-limiter-factory": { + * "maxTotalBytes": 1048576, + * "maxFieldSize": 102400, + * "maxKeySize": 1024, + * "maxValuesPerField": 100 + * } + * } + * } + * } + * </pre> + */ +public class AllLimitsTest extends TikaTest { + + @Test + public void testLoadAllLimitsFromConfig() throws Exception { + TikaLoader loader = TikaLoader.load(getConfigPath(getClass(), "all-limits-test.json")); + + // Load all limits into ParseContext + ParseContext context = loader.loadParseContext(); + + // Verify EmbeddedLimits + EmbeddedLimits embeddedLimits = context.get(EmbeddedLimits.class); + assertNotNull(embeddedLimits, "EmbeddedLimits should be loaded"); + assertEquals(10, embeddedLimits.getMaxDepth()); + 
assertFalse(embeddedLimits.isThrowOnMaxDepth()); + assertEquals(1000, embeddedLimits.getMaxCount()); + assertFalse(embeddedLimits.isThrowOnMaxCount()); + + // Verify OutputLimits + OutputLimits outputLimits = context.get(OutputLimits.class); + assertNotNull(outputLimits, "OutputLimits should be loaded"); + assertEquals(100000, outputLimits.getWriteLimit()); + assertFalse(outputLimits.isThrowOnWriteLimit()); + assertEquals(100, outputLimits.getMaxXmlDepth()); + assertEquals(10, outputLimits.getMaxPackageEntryDepth()); + assertEquals(1000000, outputLimits.getZipBombThreshold()); + assertEquals(100, outputLimits.getZipBombRatio()); + + // Verify TimeoutLimits + TimeoutLimits timeoutLimits = context.get(TimeoutLimits.class); + assertNotNull(timeoutLimits, "TimeoutLimits should be loaded"); + assertEquals(60000, timeoutLimits.getTaskTimeoutMillis()); + + // Verify MetadataWriteLimiterFactory + MetadataWriteLimiterFactory metadataFactory = context.get(MetadataWriteLimiterFactory.class); + assertNotNull(metadataFactory, "MetadataWriteLimiterFactory should be loaded"); + } + + @Test + public void testLoadIndividualLimits() throws Exception { + TikaLoader loader = TikaLoader.load(getConfigPath(getClass(), "all-limits-test.json")); + + // Load individual limit configs directly + EmbeddedLimits embeddedLimits = loader.configs().load(EmbeddedLimits.class); + assertNotNull(embeddedLimits); + assertEquals(10, embeddedLimits.getMaxDepth()); + + OutputLimits outputLimits = loader.configs().load(OutputLimits.class); + assertNotNull(outputLimits); + assertEquals(100000, outputLimits.getWriteLimit()); + + TimeoutLimits timeoutLimits = loader.configs().load(TimeoutLimits.class); + assertNotNull(timeoutLimits); + assertEquals(60000, timeoutLimits.getTaskTimeoutMillis()); + } + + @Test + public void testHelperMethodsWithContext() throws Exception { + TikaLoader loader = TikaLoader.load(getConfigPath(getClass(), "all-limits-test.json")); + ParseContext context = 
loader.loadParseContext(); + + // Use helper methods to get limits with fallback defaults + EmbeddedLimits embeddedLimits = EmbeddedLimits.get(context); + assertEquals(10, embeddedLimits.getMaxDepth()); + + OutputLimits outputLimits = OutputLimits.get(context); + assertEquals(100000, outputLimits.getWriteLimit()); + + TimeoutLimits timeoutLimits = TimeoutLimits.get(context); + assertEquals(60000, timeoutLimits.getTaskTimeoutMillis()); + } + + @Test + public void testHelperMethodsWithNullContext() { + // Helper methods should return defaults when context is null + EmbeddedLimits embeddedLimits = EmbeddedLimits.get(null); + assertNotNull(embeddedLimits); + assertEquals(EmbeddedLimits.UNLIMITED, embeddedLimits.getMaxDepth()); + + OutputLimits outputLimits = OutputLimits.get(null); + assertNotNull(outputLimits); + assertEquals(OutputLimits.UNLIMITED, outputLimits.getWriteLimit()); + + TimeoutLimits timeoutLimits = TimeoutLimits.get(null); + assertNotNull(timeoutLimits); + assertEquals(TimeoutLimits.DEFAULT_TASK_TIMEOUT_MILLIS, timeoutLimits.getTaskTimeoutMillis()); + } +} diff --git a/tika-serialization/src/test/resources/configs/TIKA-3695-exclude.json b/tika-serialization/src/test/resources/configs/TIKA-3695-exclude.json index ddb158ba0c..8dfe51784b 100644 --- a/tika-serialization/src/test/resources/configs/TIKA-3695-exclude.json +++ b/tika-serialization/src/test/resources/configs/TIKA-3695-exclude.json @@ -2,9 +2,6 @@ "parsers": [ "default-parser" ], - "auto-detect-parser": { - "outputThreshold": 6789 - }, "other-configs": { "metadata-write-limiter-factory": { "standard-metadata-limiter-factory": { diff --git a/tika-serialization/src/test/resources/configs/TIKA-3695-fields.json b/tika-serialization/src/test/resources/configs/TIKA-3695-fields.json index b89d58e12f..7e408b8aba 100644 --- a/tika-serialization/src/test/resources/configs/TIKA-3695-fields.json +++ b/tika-serialization/src/test/resources/configs/TIKA-3695-fields.json @@ -2,9 +2,6 @@ "parsers": [ 
"default-parser" ], - "auto-detect-parser": { - "outputThreshold": 6789 - }, "other-configs": { "metadata-write-limiter-factory": { "standard-metadata-limiter-factory": { diff --git a/tika-serialization/src/test/resources/configs/TIKA-3695.json b/tika-serialization/src/test/resources/configs/TIKA-3695.json index 13065a7ce6..ef95f8003b 100644 --- a/tika-serialization/src/test/resources/configs/TIKA-3695.json +++ b/tika-serialization/src/test/resources/configs/TIKA-3695.json @@ -2,9 +2,6 @@ "parsers": [ "default-parser" ], - "auto-detect-parser": { - "outputThreshold": 6789 - }, "other-configs": { "metadata-write-limiter-factory": { "standard-metadata-limiter-factory": { diff --git a/tika-serialization/src/test/resources/configs/TIKA-4207-embedded-bytes-config.json b/tika-serialization/src/test/resources/configs/TIKA-4207-embedded-bytes-config.json index b014152172..106205e5c5 100644 --- a/tika-serialization/src/test/resources/configs/TIKA-4207-embedded-bytes-config.json +++ b/tika-serialization/src/test/resources/configs/TIKA-4207-embedded-bytes-config.json @@ -3,7 +3,6 @@ "default-parser" ], "auto-detect-parser": { - "outputThreshold": 678900, "embeddedDocumentExtractorFactory": { "runpack-extractor-factory": { "writeFileNameToContent": false, diff --git a/tika-serialization/src/test/resources/configs/all-limits-test.json b/tika-serialization/src/test/resources/configs/all-limits-test.json new file mode 100644 index 0000000000..40faa856ec --- /dev/null +++ b/tika-serialization/src/test/resources/configs/all-limits-test.json @@ -0,0 +1,32 @@ +{ + "parsers": [ + "default-parser" + ], + "other-configs": { + "embedded-limits": { + "maxDepth": 10, + "throwOnMaxDepth": false, + "maxCount": 1000, + "throwOnMaxCount": false + }, + "output-limits": { + "writeLimit": 100000, + "throwOnWriteLimit": false, + "maxXmlDepth": 100, + "maxPackageEntryDepth": 10, + "zipBombThreshold": 1000000, + "zipBombRatio": 100 + }, + "timeout-limits": { + "taskTimeoutMillis": 60000 + }, + 
"metadata-write-limiter-factory": { + "standard-metadata-limiter-factory": { + "maxTotalBytes": 1048576, + "maxFieldSize": 102400, + "maxKeySize": 1024, + "maxValuesPerField": 100 + } + } + } +} diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java index 0cb335676c..63e0044baa 100644 --- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java +++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java @@ -78,7 +78,6 @@ public abstract class CXFTestBase { public final static String BASIC_CONFIG = """ { "auto-detect-parser": { - "outputThreshold": 1000000, "throwOnZeroBytes": false }, "other-configs": { diff --git a/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json b/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json index 434b61e626..14e1c6c511 100644 --- a/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json +++ b/tika-server/tika-server-core/src/test/resources/configs/cxf-test-base-template.json @@ -45,7 +45,6 @@ } }, "auto-detect-parser": { - "outputThreshold": 1000000, "throwOnZeroBytes": false }, "other-configs": { diff --git a/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json b/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json index 06510b1a1e..6c6f2c5df9 100644 --- a/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json +++ b/tika-server/tika-server-standard/src/test/resources/configs/cxf-test-base-template.json @@ -47,7 +47,6 @@ } }, "auto-detect-parser": { - "outputThreshold": 1000000, "throwOnZeroBytes": false }, "other-configs": { diff --git a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json 
b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json index dc25f3ae0e..dfbcbb8ee4 100644 --- a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json +++ b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-for-server-tests.json @@ -10,7 +10,6 @@ } ], "auto-detect-parser": { - "outputThreshold": 1000000, "throwOnZeroBytes": false }, "other-configs": { diff --git a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json index 665442b733..6efc957936 100644 --- a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json +++ b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-opennlp-filter.json @@ -15,7 +15,6 @@ } ], "auto-detect-parser": { - "outputThreshold": 1000000, "throwOnZeroBytes": false }, "other-configs": { diff --git a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json index 51e7806e81..05a1cdf374 100644 --- a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json +++ b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-langdetect-optimaize-filter.json @@ -15,7 +15,6 @@ } ], "auto-detect-parser": { - "outputThreshold": 1000000, "throwOnZeroBytes": false }, "other-configs": {
