This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4633-centralize-limits in repository https://gitbox.apache.org/repos/asf/tika.git
commit 887d4efa49d7f2b27d9401c464073d43b3e54c73 Merge: 14797b0851 bd1513677e Author: tallison <[email protected]> AuthorDate: Wed Jan 28 10:37:50 2026 -0500 Merge remote-tracking branch 'origin/main' into TIKA-4633-centralize-limits # Conflicts: # tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java # tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java .../ROOT/pages/configuration/digesters.adoc | 192 +++++++++++++++++++++ docs/modules/ROOT/pages/configuration/index.adoc | 4 + .../src/main/java/org/apache/tika/cli/TikaCLI.java | 18 +- .../java/org/apache/tika/digest/DigestDef.java | 2 +- .../java/org/apache/tika/digest/DigestHelper.java | 41 +++-- .../org/apache/tika/digest/DigesterFactory.java | 39 +++-- .../org/apache/tika/parser/AutoDetectParser.java | 6 +- .../apache/tika/parser/AutoDetectParserConfig.java | 80 --------- .../java/org/apache/tika/parser/ParseContext.java | 26 +++ .../parser/digestutils/BouncyCastleDigester.java | 4 +- .../digestutils/BouncyCastleDigesterFactory.java | 27 ++- .../tika/parser/digestutils/CommonsDigester.java | 4 +- .../parser/digestutils/CommonsDigesterFactory.java | 27 ++- .../tika/parser/AutoDetectParserConfigTest.java | 24 ++- .../apache/tika/parser/AutoDetectParserTest.java | 6 +- .../tika/parser/RecursiveParserWrapperTest.java | 9 +- .../tika/parser/digest/DigestConfigTest.java | 58 ++++--- .../digest/SkipContainerDocumentDigestTest.java | 92 ++++++---- .../parser/microsoft/ooxml/OOXMLParserTest.java | 6 +- .../src/test/resources/configs/tika-4533.json | 6 +- .../configs/tika-config-bc-digests-base32.json | 8 +- .../configs/tika-config-bc-digests-basic.json | 8 +- .../configs/tika-config-bc-digests-multiple.json | 8 +- .../configs/tika-config-commons-digests-basic.json | 8 +- .../configs/tika-config-digests-pdf-only.json | 8 +- .../tika-config-digests-skip-container.json | 14 +- .../resources/configs/tika-config-digests.json | 10 +- .../resources/configs/tika-config-md5-digest.json | 4 +- ...a-config-upcasing-custom-handler-decorator.json | 9 +- .../configs/tika-config-write-filter.json | 13 +- .../apache/tika/pipes/core/server/EmitHandler.java | 9 +- .../tika/pipes/core/server/FetchHandler.java | 5 +- .../tika/pipes/core/server/ParseHandler.java | 10 +- .../apache/tika/pipes/core/server/PipesServer.java | 57 +++--- .../apache/tika/pipes/core/server/PipesWorker.java | 21 ++- .../test/resources/configs/tika-config-basic.json | 9 +- .../resources/configs/tika-config-passback.json | 9 +- .../resources/configs/tika-config-truncate.json | 9 +- .../resources/configs/tika-config-uppercasing.json | 9 +- .../configs/tika-config-write-limiter.json | 1 - .../org/apache/tika/config/loader/TikaLoader.java | 43 +++++ .../server/core/resource/MetadataResource.java | 2 + .../tika/server/core/resource/TikaResource.java | 15 +- .../org/apache/tika/server/core/CXFTestBase.java | 8 +- .../resources/configs/cxf-test-base-template.json | 10 +- .../resources/configs/cxf-test-base-template.json | 10 +- .../configs/tika-config-for-server-tests.json | 8 +- .../tika-config-langdetect-opennlp-filter.json | 8 +- .../tika-config-langdetect-optimaize-filter.json | 8 +- 49 files changed, 680 insertions(+), 332 deletions(-) diff --cc tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java index a8e30083fd,ebf359ff1c..2ce72443b8 --- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java @@@ -56,20 -74,59 +54,9 @@@ public class AutoDetectParserConfig imp private ContentHandlerDecoratorFactory contentHandlerDecoratorFactory = NOOP_CONTENT_HANDLER_DECORATOR_FACTORY; - private DigesterFactory digesterFactory = null; - - // Lazily built digester from the factory - private transient Digester digester = null; - - /** - * If true, skip digesting for container (top-level) documents. - * Only embedded documents will be digested. - */ - private boolean skipContainerDocumentDigest = false; - private boolean throwOnZeroBytes = true; - /** - * Creates a SecureContentHandlerConfig using the passed in parameters. - * - * @param outputThreshold SecureContentHandler - character output threshold. - * @param maximumCompressionRatio SecureContentHandler - max compression ratio allowed. - * @param maximumDepth SecureContentHandler - maximum XML element nesting level. - * @param maximumPackageEntryDepth SecureContentHandler - maximum package entry nesting level. - */ - public AutoDetectParserConfig(Long outputThreshold, - Long maximumCompressionRatio, Integer maximumDepth, - Integer maximumPackageEntryDepth) { - this.outputThreshold = outputThreshold; - this.maximumCompressionRatio = maximumCompressionRatio; - this.maximumDepth = maximumDepth; - this.maximumPackageEntryDepth = maximumPackageEntryDepth; - } - public AutoDetectParserConfig() { - - } - - public Long getOutputThreshold() { - return outputThreshold; - } - - public void setOutputThreshold(Long outputThreshold) { - this.outputThreshold = outputThreshold; - } - - public Long getMaximumCompressionRatio() { - return maximumCompressionRatio; - } - - public void setMaximumCompressionRatio(Long maximumCompressionRatio) { - this.maximumCompressionRatio = maximumCompressionRatio; - } - - public Integer getMaximumDepth() { - return maximumDepth; - } - - public void setMaximumDepth(Integer maximumDepth) { - this.maximumDepth = maximumDepth; - } - - public Integer getMaximumPackageEntryDepth() { - return maximumPackageEntryDepth; - } - - public void setMaximumPackageEntryDepth(Integer maximumPackageEntryDepth) { - this.maximumPackageEntryDepth = maximumPackageEntryDepth; } public void setEmbeddedDocumentExtractorFactory( @@@ -165,11 -157,12 +87,9 @@@ @Override public String toString() { - return "AutoDetectParserConfig{" + "outputThreshold=" + - outputThreshold + ", maximumCompressionRatio=" + maximumCompressionRatio + - ", maximumDepth=" + maximumDepth + ", maximumPackageEntryDepth=" + - maximumPackageEntryDepth + ", embeddedDocumentExtractorFactory=" + - embeddedDocumentExtractorFactory + ", contentHandlerDecoratorFactory=" + - contentHandlerDecoratorFactory + + return "AutoDetectParserConfig{" + + "embeddedDocumentExtractorFactory=" + embeddedDocumentExtractorFactory + + ", contentHandlerDecoratorFactory=" + contentHandlerDecoratorFactory + - ", digesterFactory=" + digesterFactory + - ", skipContainerDocumentDigest=" + skipContainerDocumentDigest + ", throwOnZeroBytes=" + throwOnZeroBytes + '}'; } } diff --cc tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java index ce9ba25026,7101105ba8..3a516d7751 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java @@@ -77,8 -77,9 +77,9 @@@ public class MetadataResource @Context HttpHeaders httpHeaders, @Context UriInfo info) throws Exception { + // Load default context from config, then overlay with request config ParseContext context = TikaResource.createParseContext(); - Metadata metadata = context.newMetadata(); + Metadata metadata = Metadata.newInstance(context); try (TikaInputStream tis = setupMultipartConfig(attachments, metadata, context)) { // No need to parse embedded docs for metadata-only extraction context.set(DocumentSelector.class, metadata1 -> false);
