This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4635 in repository https://gitbox.apache.org/repos/asf/tika.git
commit d94b8700c94ad778656e898fc4a15184a503ca52 Merge: 068976320e 766cf2cd51 Author: tallison <[email protected]> AuthorDate: Wed Jan 28 08:37:10 2026 -0500 Merge remote-tracking branch 'origin/main' into TIKA-4635 # Conflicts: # tika-core/src/main/java/org/apache/tika/parser/ParseContext.java # tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json # tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java # tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java # tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java docs/modules/ROOT/nav.adoc | 1 + docs/modules/ROOT/pages/advanced/index.adoc | 1 + .../ROOT/pages/advanced/setting-limits.adoc | 229 +++++++++++++++++++++ .../src/main/java/org/apache/tika/cli/TikaCLI.java | 17 +- .../src/main/java/org/apache/tika/gui/TikaGUI.java | 7 +- tika-core/src/main/java/org/apache/tika/Tika.java | 67 ++++-- .../tika/extractor/ParserContainerExtractor.java | 4 +- .../java/org/apache/tika/metadata/Metadata.java | 44 ++-- ...aWriteFilter.java => MetadataWriteLimiter.java} | 21 +- ...ctory.java => MetadataWriteLimiterFactory.java} | 19 +- ...iteFilter.java => StandardMetadataLimiter.java} | 94 +++------ .../StandardMetadataLimiterFactory.java | 152 ++++++++++++++ .../writefilter/StandardWriteFilterFactory.java | 127 ------------ .../org/apache/tika/parser/AutoDetectParser.java | 5 - .../apache/tika/parser/AutoDetectParserConfig.java | 15 +- .../java/org/apache/tika/parser/ParseContext.java | 27 +++ .../java/org/apache/tika/parser/ParsingReader.java | 21 +- .../apache/tika/parser/journal/TEIDOMParser.java | 2 +- .../tika/parser/apple/AppleSingleFileParser.java | 2 +- .../org/apache/tika/parser/apple/PListParser.java | 9 +- .../parser/iwork/iwana/IWork13PackageParser.java | 4 +- .../executable/UniversalExecutableParser.java | 2 +- .../org/apache/tika/parser/crypto/Pkcs7Parser.java | 2 +- .../org/apache/tika/parser/crypto/TSDParser.java | 2 +- .../org/apache/tika/parser/html/HtmlHandler.java | 8 +- .../apache/tika/parser/jdbc/JDBCTableReader.java | 4 +- .../tika/parser/mail/MailContentHandler.java | 4 +- .../org/apache/tika/parser/mbox/MboxParser.java | 2 +- .../parser/microsoft/AbstractPOIFSExtractor.java | 4 +- .../apache/tika/parser/microsoft/EMFParser.java | 4 +- .../tika/parser/microsoft/HSLFExtractor.java | 4 +- .../tika/parser/microsoft/JackcessExtractor.java | 2 +- .../apache/tika/parser/microsoft/OfficeParser.java | 14 +- .../tika/parser/microsoft/OutlookExtractor.java | 12 +- .../apache/tika/parser/microsoft/TNEFParser.java | 2 +- .../microsoft/activemime/ActiveMimeParser.java | 2 +- .../tika/parser/microsoft/chm/ChmParser.java | 2 +- .../tika/parser/microsoft/libpst/EmailVisitor.java | 6 +- .../microsoft/onenote/OneNoteTreeWalker.java | 7 +- .../microsoft/ooxml/AbstractOOXMLExtractor.java | 8 +- .../ooxml/XWPFWordExtractorDecorator.java | 5 +- .../microsoft/ooxml/xps/XPSExtractorDecorator.java | 2 +- .../microsoft/ooxml/xps/XPSPageContentHandler.java | 8 +- .../ooxml/xwpf/ml2006/BinaryDataHandler.java | 2 +- .../parser/microsoft/pst/OutlookPSTParser.java | 10 +- .../parser/microsoft/pst/PSTMailItemParser.java | 6 +- .../parser/microsoft/rtf/RTFEmbObjHandler.java | 8 +- .../tika/parser/microsoft/xml/WordMLParser.java | 2 +- .../org/apache/tika/parser/epub/EpubParser.java | 2 +- .../apache/tika/parser/indesign/IDMLParser.java | 6 +- .../parser/odf/FlatOpenDocumentMacroHandler.java | 2 +- .../tika/parser/odf/OpenDocumentBodyHandler.java | 2 +- .../apache/tika/parser/odf/OpenDocumentParser.java | 4 +- .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 12 +- .../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 2 +- .../java/org/apache/tika/parser/pdf/PDFParser.java | 2 +- .../tika/parser/pdf/image/ImageGraphicsEngine.java | 2 +- .../tika/renderer/pdf/mutool/MuPDFRenderer.java | 2 +- .../tika/renderer/pdf/pdfbox/PDFBoxRenderer.java | 2 +- .../apache/tika/parser/pkg/CompressorParser.java | 2 +- .../org/apache/tika/parser/pkg/PackageParser.java | 7 +- .../java/org/apache/tika/parser/pkg/RarParser.java | 2 +- .../org/apache/tika/parser/pkg/UnrarParser.java | 2 +- .../org/apache/tika/parser/http/HttpParser.java | 2 +- .../org/apache/tika/parser/wacz/WACZParser.java | 18 +- .../org/apache/tika/parser/warc/WARCParser.java | 2 +- .../apache/tika/parser/xml/FictionBookParser.java | 2 +- .../tika/parser/AutoDetectParserConfigTest.java | 20 +- ...a-config-upcasing-custom-handler-decorator.json | 13 -- .../configs/tika-config-write-filter.json | 16 +- .../apache/tika/pipes/core/server/PipesServer.java | 12 +- .../apache/tika/pipes/core/server/PipesWorker.java | 27 ++- .../tika/pipes/core/MetadataWriteLimiterTest.java | 131 ++++++++++++ .../configs/tika-config-write-limiter.json | 64 ++++++ .../apache/tika/config/loader/TikaJsonConfig.java | 1 + .../org/apache/tika/config/loader/TikaLoader.java | 8 + .../org/apache/tika/serialization/TikaModule.java | 4 +- ...rTest.java => StandardMetadataLimiterTest.java} | 74 ++++--- .../test/resources/configs/TIKA-3695-exclude.json | 8 +- .../test/resources/configs/TIKA-3695-fields.json | 12 +- .../src/test/resources/configs/TIKA-3695.json | 12 +- .../server/core/resource/DetectorResource.java | 4 +- .../server/core/resource/MetadataResource.java | 15 +- .../server/core/resource/PipesParsingHelper.java | 3 +- .../core/resource/RecursiveMetadataResource.java | 12 +- .../tika/server/core/resource/TikaResource.java | 93 ++++++--- .../server/core/resource/UnpackerResource.java | 8 +- .../standard/resource/XMPMetadataResource.java | 7 +- 88 files changed, 1107 insertions(+), 514 deletions(-) diff --cc tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java index ffb95dc609,351aa49a65..2d8e7ca21d --- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java @@@ -146,14 -146,11 +146,9 @@@ public class AutoDetectParser extends C public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { - if (autoDetectParserConfig.getMetadataWriteFilterFactory() != null) { - metadata.setMetadataWriteFilter( - autoDetectParserConfig.getMetadataWriteFilterFactory().newInstance()); - } - // Compute digests before type detection if configured - DigestHelper.maybeDigest(tis, - autoDetectParserConfig.digester(), - autoDetectParserConfig.isSkipContainerDocumentDigest(), - metadata, context); + // DigesterFactory is retrieved from ParseContext (configured via other-configs) + DigestHelper.maybeDigest(tis, metadata, context); // Automatically detect the MIME type of the document MediaType type = detector.detect(tis, metadata, context); diff --cc tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java index c5c6632c00,4fa1d3c083..ebf359ff1c --- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java @@@ -21,9 -21,10 +21,8 @@@ import java.io.Serializable import org.xml.sax.ContentHandler; import org.apache.tika.config.TikaComponent; -import org.apache.tika.digest.Digester; -import org.apache.tika.digest.DigesterFactory; import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory; import org.apache.tika.metadata.Metadata; - import org.apache.tika.metadata.writefilter.MetadataWriteFilterFactory; import org.apache.tika.sax.ContentHandlerDecoratorFactory; /** @@@ -172,10 -238,10 +160,9 @@@ public class AutoDetectParserConfig imp return "AutoDetectParserConfig{" + "outputThreshold=" + outputThreshold + ", maximumCompressionRatio=" + maximumCompressionRatio + ", maximumDepth=" + maximumDepth + ", maximumPackageEntryDepth=" + - maximumPackageEntryDepth + ", metadataWriteFilterFactory=" + - metadataWriteFilterFactory + ", embeddedDocumentExtractorFactory=" + + maximumPackageEntryDepth + ", embeddedDocumentExtractorFactory=" + embeddedDocumentExtractorFactory + ", contentHandlerDecoratorFactory=" + - contentHandlerDecoratorFactory + ", digesterFactory=" + digesterFactory + - ", skipContainerDocumentDigest=" + skipContainerDocumentDigest + + contentHandlerDecoratorFactory + ", throwOnZeroBytes=" + throwOnZeroBytes + '}'; } } diff --cc tika-core/src/main/java/org/apache/tika/parser/ParseContext.java index 78c8eb2e9a,ae1ecc3bbb..f5338594ff --- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java @@@ -222,32 -224,31 +224,57 @@@ public class ParseContext implements Se return context.isEmpty() && jsonConfigs.isEmpty(); } + /** + * Copies all entries from the source ParseContext into this one. + * Existing entries in this context are overwritten by source entries. + * <p> + * This copies both typed objects (from context map) and JSON configs. + * + * @param source the ParseContext to copy from + * @since Apache Tika 4.0 + */ + public void copyFrom(ParseContext source) { + if (source == null) { + return; + } + // Copy typed objects + context.putAll(source.context); + // Copy JSON configs + jsonConfigs.putAll(source.jsonConfigs); + // Copy resolved configs (if any) + if (source.resolvedConfigs != null && !source.resolvedConfigs.isEmpty()) { + if (resolvedConfigs == null) { + resolvedConfigs = new HashMap<>(); + } + resolvedConfigs.putAll(source.resolvedConfigs); + } + } + + /** + * Creates a new Metadata object with any configured limits applied. + * <p> + * If a {@link MetadataWriteLimiterFactory} is configured in this ParseContext, the returned + * Metadata will have a write limiter that enforces those limits. Otherwise, + * returns a plain Metadata object. + * <p> + * Parsers should use this method instead of {@code new Metadata()} when creating + * metadata for embedded documents, to ensure limits are applied at creation time + * rather than later during parsing. + * <p> + * Example usage: + * <pre> + * Metadata embeddedMetadata = context.newMetadata(); + * embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name); + * // limits are already applied, no data bypasses the limiter + * </pre> + * + * @return a new Metadata object, with limits applied if configured + * @since Apache Tika 4.0 + */ + public Metadata newMetadata() { + MetadataWriteLimiterFactory factory = get(MetadataWriteLimiterFactory.class); + return factory != null ? new Metadata(factory.newInstance()) : new Metadata(); + } /** diff --cc tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json index bb7acf39e9,48314a2ab4..6a1e6a925a --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json @@@ -1,25 -1,25 +1,25 @@@ { "auto-detect-parser": { "outputThreshold": 1000000, - "metadataWriteFilterFactory": { - "standard-write-filter-factory": { - "includeFields": [ - "X-TIKA-CONTENT", - "dc:creator" - ] - } - }, - "skipContainerDocumentDigest": true, - "digesterFactory": { + "throwOnZeroBytes": false + }, + "other-configs": { + "digester-factory": { "commons-digester-factory": { "digests": [ { "algorithm": "SHA256", "encoding": "BASE32" }, { "algorithm": "MD5" } - ] + ], + "skipContainerDocumentDigest": true } + }, - "throwOnZeroBytes": false - }, - "other-configs": { + "metadata-write-limiter-factory": { + "standard-metadata-limiter-factory": { + "includeFields": [ + "X-TIKA:content", + "dc:creator" + ] + } } } } diff --cc tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java index a91e66823a,8cf9308577..1b6897edbb --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesServer.java @@@ -58,8 -59,8 +58,9 @@@ import org.apache.tika.exception.TikaEx import org.apache.tika.extractor.RUnpackExtractorFactory; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.filter.MetadataFilter; + import org.apache.tika.metadata.writefilter.MetadataWriteLimiterFactory; import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.pipes.api.FetchEmitTuple; import org.apache.tika.pipes.api.PipesResult; @@@ -343,7 -346,7 +348,7 @@@ public class PipesServer implements Aut Long thresholdBytes = pipesConfig.getEmitStrategy().getThresholdBytes(); long threshold = (thresholdBytes != null) ? thresholdBytes : EmitStrategyConfig.DEFAULT_DIRECT_EMIT_THRESHOLD_BYTES; EmitHandler emitHandler = new EmitHandler(defaultMetadataFilter, emitStrategy, emitterManager, threshold); - PipesWorker pipesWorker = new PipesWorker(fetchEmitTuple, mergedContext, autoDetectParser, emitterManager, fetchHandler, parseHandler, emitHandler); - PipesWorker pipesWorker = new PipesWorker(fetchEmitTuple, autoDetectParser, emitterManager, fetchHandler, parseHandler, emitHandler, defaultMetadataWriteLimiterFactory); ++ PipesWorker pipesWorker = new PipesWorker(fetchEmitTuple, mergedContext, autoDetectParser, emitterManager, fetchHandler, parseHandler, emitHandler, defaultMetadataWriteLimiterFactory); return pipesWorker; } @@@ -474,25 -491,9 +479,26 @@@ } this.detector = this.autoDetectParser.getDetector(); this.rMetaParser = new RecursiveParserWrapper(autoDetectParser); + } + /** + * Creates a merged ParseContext with defaults from tika-config overlaid with request values. + * Request values take precedence over defaults. + * <p> + * Creates a fresh context each time to avoid shared state between requests. + * + * @param requestContext the ParseContext from FetchEmitTuple + * @return a new ParseContext with defaults + request overrides + */ + private ParseContext createMergedParseContext(ParseContext requestContext) throws TikaConfigException { + // Create fresh context with defaults from tika-config (e.g., DigesterFactory) + ParseContext mergedContext = tikaLoader.loadParseContext(); + // Overlay request's values (request takes precedence) + mergedContext.copyFrom(requestContext); + return mergedContext; + } + private ConfigStore createConfigStore(PipesConfig pipesConfig, TikaPluginManager tikaPluginManager) throws TikaException { String configStoreType = pipesConfig.getConfigStoreType(); String configStoreParams = pipesConfig.getConfigStoreParams(); diff --cc tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java index 57733d38d5,18b83192ac..df54ea0042 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/PipesWorker.java @@@ -57,12 -57,11 +58,13 @@@ class PipesWorker implements Callable<P private final FetchHandler fetchHandler; private final ParseHandler parseHandler; private final EmitHandler emitHandler; + private final MetadataWriteLimiterFactory defaultMetadataWriteLimiterFactory; - public PipesWorker(FetchEmitTuple fetchEmitTuple, AutoDetectParser autoDetectParser, EmitterManager emitterManager, FetchHandler fetchHandler, ParseHandler parseHandler, + public PipesWorker(FetchEmitTuple fetchEmitTuple, ParseContext parseContext, AutoDetectParser autoDetectParser, + EmitterManager emitterManager, FetchHandler fetchHandler, ParseHandler parseHandler, - EmitHandler emitHandler) { + EmitHandler emitHandler, MetadataWriteLimiterFactory defaultMetadataWriteLimiterFactory) { this.fetchEmitTuple = fetchEmitTuple; + this.parseContext = parseContext; this.autoDetectParser = autoDetectParser; this.emitterManager = emitterManager; this.fetchHandler = fetchHandler; @@@ -109,22 -109,22 +112,23 @@@ //start a new metadata object to gather info from the fetch process //we want to isolate and not touch the metadata sent into the fetchEmitTuple //so that we can inject it after the filter at the very end - Metadata metadata = new Metadata(); - FetchHandler.TisOrResult tisOrResult = fetchHandler.fetch(fetchEmitTuple, metadata, parseContext); - if (tisOrResult.pipesResult() != null) { - return new ParseDataOrPipesResult(null, tisOrResult.pipesResult()); - } - - ParseContext parseContext = null; + ParseContext localContext = null; try { - parseContext = setupParseContext(fetchEmitTuple); + localContext = setupParseContext(); } catch (IOException e) { LOG.warn("fetcher initialization exception id={}", fetchEmitTuple.getId(), e); return new ParseDataOrPipesResult(null, new PipesResult(PipesResult.RESULT_STATUS.FETCHER_INITIALIZATION_EXCEPTION, ExceptionUtils.getStackTrace(e))); } - Metadata metadata = parseContext.newMetadata(); - FetchHandler.TisOrResult tisOrResult = fetchHandler.fetch(fetchEmitTuple, metadata); ++ // Use newMetadata() to apply any configured write limits ++ Metadata metadata = localContext.newMetadata(); ++ FetchHandler.TisOrResult tisOrResult = fetchHandler.fetch(fetchEmitTuple, metadata, localContext); + if (tisOrResult.pipesResult() != null) { + return new ParseDataOrPipesResult(null, tisOrResult.pipesResult()); + } + try (TikaInputStream tis = tisOrResult.tis()) { - return parseHandler.parseWithStream(fetchEmitTuple, tis, metadata, parseContext); + return parseHandler.parseWithStream(fetchEmitTuple, tis, metadata, localContext); } catch (SecurityException e) { LOG.error("security exception id={}", fetchEmitTuple.getId(), e); throw e; @@@ -137,9 -137,18 +141,17 @@@ - private ParseContext setupParseContext(FetchEmitTuple fetchEmitTuple) throws TikaException, IOException { - ParseContext parseContext = fetchEmitTuple.getParseContext(); + private ParseContext setupParseContext() throws TikaException, IOException { // ContentHandlerFactory and ParseMode are retrieved from ParseContext in ParseHandler. // They are set in ParseContext from PipesConfig loaded via TikaLoader at startup. + + // If the parseContext from the FetchEmitTuple doesn't have a MetadataWriteLimiterFactory, + // use the default one loaded from config in PipesServer + MetadataWriteLimiterFactory existingFactory = parseContext.get(MetadataWriteLimiterFactory.class); + if (existingFactory == null && defaultMetadataWriteLimiterFactory != null) { + parseContext.set(MetadataWriteLimiterFactory.class, defaultMetadataWriteLimiterFactory); + } + EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig = parseContext.get(EmbeddedDocumentBytesConfig.class); if (embeddedDocumentBytesConfig == null) { //make sure there's one here -- or do we make this default in fetchemit tuple? diff --cc tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java index 389f33697d,b527532e5b..55f6ff0993 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java @@@ -43,6 -42,6 +43,7 @@@ import org.apache.tika.language.transla import org.apache.tika.metadata.filter.CompositeMetadataFilter; import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.metadata.filter.NoOpFilter; ++import org.apache.tika.metadata.writefilter.MetadataWriteLimiterFactory; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.mime.MimeTypes; import org.apache.tika.parser.AutoDetectParser; @@@ -378,39 -376,6 +379,46 @@@ public class TikaLoader return autoDetectParser; } + /** + * Loads and returns a ParseContext populated with components from the "other-configs" section. + * <p> + * This method loads components that should be passed via ParseContext, such as: + * <ul> + * <li>DigesterFactory (from "digester-factory")</li> ++ * <li>MetadataWriteLimiterFactory (from "metadata-write-limiter-factory")</li> + * </ul> + * <p> + * Use this method when you need a pre-configured ParseContext for parsing operations. + * + * <p>Example usage: + * <pre> + * TikaLoader loader = TikaLoader.load(configPath); + * Parser parser = loader.loadAutoDetectParser(); + * ParseContext context = loader.loadParseContext(); + * parser.parse(stream, handler, metadata, context); + * </pre> + * + * @return a ParseContext populated with configured components + * @throws TikaConfigException if loading fails + */ + public ParseContext loadParseContext() throws TikaConfigException { + ParseContext context = new ParseContext(); + + // Load DigesterFactory from other-configs if present + DigesterFactory digesterFactory = configs().load("digester-factory", DigesterFactory.class); + if (digesterFactory != null) { + context.set(DigesterFactory.class, digesterFactory); + } + ++ // Load MetadataWriteLimiterFactory from other-configs if present ++ MetadataWriteLimiterFactory metadataWriteLimiterFactory = configs().load(MetadataWriteLimiterFactory.class); ++ if (metadataWriteLimiterFactory != null) { ++ context.set(MetadataWriteLimiterFactory.class, metadataWriteLimiterFactory); ++ } ++ + return context; + } + /** * Returns a ConfigLoader for loading simple configuration objects. * <p> diff --cc tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java index e3e4aebae6,c84676c14b..7101105ba8 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java @@@ -76,9 -77,8 +77,9 @@@ public class MetadataResource @Context HttpHeaders httpHeaders, @Context UriInfo info) throws Exception { - Metadata metadata = new Metadata(); + // Load default context from config, then overlay with request config - ParseContext context = TikaResource.getTikaLoader().loadParseContext(); + ParseContext context = TikaResource.createParseContext(); + Metadata metadata = context.newMetadata(); try (TikaInputStream tis = setupMultipartConfig(attachments, metadata, context)) { // No need to parse embedded docs for metadata-only extraction context.set(DocumentSelector.class, metadata1 -> false); @@@ -169,8 -171,7 +172,8 @@@ protected Metadata parseMetadata(TikaInputStream tis, Metadata metadata, MultivaluedMap<String, String> httpHeaders, UriInfo info) throws IOException, TikaConfigException { + // Load default context from config (includes DigesterFactory from other-configs) - final ParseContext context = TikaResource.getTikaLoader().loadParseContext(); + final ParseContext context = TikaResource.createParseContext(); Parser parser = TikaResource.createParser(); fillMetadata(parser, metadata, httpHeaders); //no need to parse embedded docs diff --cc tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java index 287cd95ced,968bd83f99..31a5817ad8 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java @@@ -107,6 -114,21 +114,22 @@@ public class TikaResource return PIPES_PARSING_HELPER; } + /** - * Creates a new ParseContext with the default MetadataWriteLimiterFactory set if configured. - * This should be used instead of {@code createParseContext()} to ensure metadata limits - * are applied when configured. ++ * Creates a new ParseContext with defaults loaded from tika-config. ++ * This loads components from "other-configs" such as DigesterFactory and MetadataWriteLimiterFactory. + * + * @return a new ParseContext with defaults applied + */ + public static ParseContext createParseContext() { - ParseContext context = new ParseContext(); - if (DEFAULT_METADATA_WRITE_LIMITER_FACTORY != null) { - context.set(MetadataWriteLimiterFactory.class, DEFAULT_METADATA_WRITE_LIMITER_FACTORY); ++ try { ++ return TIKA_LOADER.loadParseContext(); ++ } catch (TikaConfigException e) { ++ // Fall back to empty context if loading fails ++ LOG.warn("Failed to load ParseContext from config, using empty context", e); ++ return new ParseContext(); + } - return context; + } + @SuppressWarnings("serial") public static Parser createParser() throws TikaConfigException, IOException {
