This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4638 in repository https://gitbox.apache.org/repos/asf/tika.git
commit edd627e17e62b5ff640d544db8495d44964fa64d Merge: 492183218d 48ca355225 Author: tallison <[email protected]> AuthorDate: Thu Jan 29 06:46:01 2026 -0500 Merge branch 'main' into TIKA-4638 Merged SAXOutputConfig loading with new loadOne() helper pattern and additional limit classes (EmbeddedLimits, OutputLimits, TimeoutLimits). .../ROOT/pages/advanced/setting-limits.adoc | 356 ++++++++++++++++----- .../src/main/java/org/apache/tika/cli/TikaCLI.java | 27 +- .../src/main/java/org/apache/tika/gui/TikaGUI.java | 8 +- .../test/resources/configs/config-template.json | 1 - tika-core/src/main/java/org/apache/tika/Tika.java | 26 +- .../org/apache/tika/config/EmbeddedLimits.java | 222 +++++++++++++ .../java/org/apache/tika/config/OutputLimits.java | 269 ++++++++++++++++ .../java/org/apache/tika/config/TimeoutLimits.java | 135 ++++++++ .../exception/EmbeddedLimitReachedException.java | 62 ++++ .../tika/extractor/EmbeddedDocumentExtractor.java | 9 + .../tika/extractor/ParserContainerExtractor.java | 4 +- .../ParsingEmbeddedDocumentExtractor.java | 49 ++- .../java/org/apache/tika/metadata/Metadata.java | 24 ++ .../apache/tika/metadata/TikaCoreProperties.java | 7 + .../writefilter/MetadataWriteLimiterFactory.java | 2 +- .../org/apache/tika/parser/AutoDetectParser.java | 27 +- .../apache/tika/parser/AutoDetectParserConfig.java | 87 +---- .../org/apache/tika/parser/CompositeParser.java | 8 +- .../java/org/apache/tika/parser/ParseContext.java | 6 +- .../java/org/apache/tika/parser/ParseRecord.java | 80 +++-- .../java/org/apache/tika/parser/ParsingReader.java | 4 +- .../apache/tika/parser/RecursiveParserWrapper.java | 16 - .../sax/AbstractRecursiveParserWrapperHandler.java | 24 +- .../tika/sax/BasicContentHandlerFactory.java | 49 +-- .../tika/sax/RecursiveParserWrapperHandler.java | 20 +- .../org/apache/tika/sax/SecureContentHandler.java | 24 ++ .../apache/tika/sax/WriteOutContentHandler.java | 18 ++ .../org/apache/tika/MultiThreadedTikaTest.java | 3 +- .../org/apache/tika/example/ParsingExample.java | 2 +- .../src/test/resources/kafka/plugins-template.json | 1 - .../resources/opensearch/plugins-template.json | 1 - .../opensearch/tika-config-opensearch.json | 1 - .../src/test/resources/s3/plugins-template.json | 1 - .../src/test/resources/solr/plugins-template.json | 1 - .../apache/tika/parser/journal/TEIDOMParser.java | 2 +- .../tika/parser/apple/AppleSingleFileParser.java | 2 +- .../org/apache/tika/parser/apple/PListParser.java | 10 +- .../parser/iwork/iwana/IWork13PackageParser.java | 6 +- .../executable/UniversalExecutableParser.java | 2 +- .../org/apache/tika/parser/crypto/Pkcs7Parser.java | 2 +- .../org/apache/tika/parser/crypto/TSDParser.java | 4 +- .../org/apache/tika/parser/html/HtmlHandler.java | 8 +- .../apache/tika/parser/jdbc/JDBCTableReader.java | 4 +- .../tika/parser/mail/MailContentHandler.java | 4 +- .../org/apache/tika/parser/mbox/MboxParser.java | 2 +- .../parser/microsoft/AbstractPOIFSExtractor.java | 12 +- .../apache/tika/parser/microsoft/EMFParser.java | 4 +- .../tika/parser/microsoft/HSLFExtractor.java | 4 +- .../tika/parser/microsoft/JackcessExtractor.java | 2 +- .../apache/tika/parser/microsoft/OfficeParser.java | 6 +- .../tika/parser/microsoft/OutlookExtractor.java | 12 +- .../apache/tika/parser/microsoft/TNEFParser.java | 2 +- .../tika/parser/microsoft/chm/ChmParser.java | 2 +- .../tika/parser/microsoft/libpst/EmailVisitor.java | 2 +- .../microsoft/onenote/OneNoteTreeWalker.java | 2 +- .../microsoft/ooxml/AbstractOOXMLExtractor.java | 10 +- .../ooxml/XWPFWordExtractorDecorator.java | 2 +- .../microsoft/ooxml/xps/XPSPageContentHandler.java | 2 +- .../ooxml/xwpf/ml2006/BinaryDataHandler.java | 2 +- .../parser/microsoft/pst/OutlookPSTParser.java | 2 +- .../parser/microsoft/pst/PSTMailItemParser.java | 6 +- .../parser/microsoft/rtf/RTFEmbObjHandler.java | 6 +- .../tika/parser/microsoft/xml/WordMLParser.java | 2 +- .../org/apache/tika/parser/epub/EpubParser.java | 4 +- .../apache/tika/parser/indesign/IDMLParser.java | 6 +- .../parser/odf/FlatOpenDocumentMacroHandler.java | 7 +- .../tika/parser/odf/OpenDocumentBodyHandler.java | 7 +- .../tika/parser/odf/OpenDocumentMacroHandler.java | 3 +- .../apache/tika/parser/odf/OpenDocumentParser.java | 4 +- .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 12 +- .../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 2 +- .../java/org/apache/tika/parser/pdf/PDFParser.java | 2 +- .../tika/parser/pdf/image/ImageGraphicsEngine.java | 2 +- .../tika/renderer/pdf/mutool/MuPDFRenderer.java | 2 +- .../tika/renderer/pdf/pdfbox/PDFBoxRenderer.java | 2 +- .../apache/tika/parser/pkg/CompressorParser.java | 2 +- .../org/apache/tika/parser/pkg/PackageParser.java | 2 +- .../org/apache/tika/parser/pkg/UnrarParser.java | 2 +- .../org/apache/tika/parser/http/HttpParser.java | 2 +- .../org/apache/tika/parser/wacz/WACZParser.java | 2 +- .../org/apache/tika/parser/warc/WARCParser.java | 2 +- .../apache/tika/parser/xml/FictionBookParser.java | 2 +- .../tika/parser/AutoDetectParserConfigTest.java | 2 +- .../tika/parser/RecursiveParserWrapperTest.java | 17 +- .../apache/tika/parser/image/JpegParserTest.java | 2 +- .../tika/parser/microsoft/rtf/RTFParserTest.java | 3 +- .../src/test/resources/configs/tika-4533.json | 3 - .../configs/tika-config-bc-digests-base32.json | 1 - .../configs/tika-config-bc-digests-basic.json | 1 - .../configs/tika-config-bc-digests-multiple.json | 1 - .../configs/tika-config-commons-digests-basic.json | 1 - .../configs/tika-config-digests-pdf-only.json | 1 - .../tika-config-digests-skip-container.json | 1 - .../resources/configs/tika-config-digests.json | 1 - ...a-config-doubling-custom-handler-decorator.json | 1 - .../resources/configs/tika-config-no-names.json | 1 - ...a-config-upcasing-custom-handler-decorator.json | 4 - .../resources/configs/tika-config-with-names.json | 1 - .../configs/tika-config-write-filter.json | 1 - .../src/main/resources/config-template.json | 1 - .../tika/pipes/core/server/ParseHandler.java | 20 +- .../apache/tika/pipes/fork/PipesForkParser.java | 6 + .../tika/pipes/fork/PipesForkParserConfig.java | 32 +- .../test/resources/configs/tika-config-basic.json | 2 - .../resources/configs/tika-config-passback.json | 2 - .../resources/configs/tika-config-truncate.json | 2 - .../resources/configs/tika-config-uppercasing.json | 1 - .../configs/tika-config-write-limiter.json | 2 - .../org/apache/tika/config/loader/TikaLoader.java | 34 +- .../java/org/apache/tika/config/AllLimitsTest.java | 156 +++++++++ .../org/apache/tika/config/EmbeddedLimitsTest.java | 109 +++++++ .../org/apache/tika/config/OutputLimitsTest.java | 119 +++++++ .../org/apache/tika/config/TimeoutLimitsTest.java | 95 ++++++ .../writefilter/StandardMetadataLimiterTest.java | 6 +- .../test/resources/configs/TIKA-3695-exclude.json | 3 - .../test/resources/configs/TIKA-3695-fields.json | 3 - .../src/test/resources/configs/TIKA-3695.json | 3 - .../configs/TIKA-4207-embedded-bytes-config.json | 1 - .../test/resources/configs/all-limits-test.json | 32 ++ .../resources/configs/embedded-limits-test.json | 10 + .../test/resources/configs/output-limits-test.json | 12 + .../resources/configs/timeout-limits-test.json | 7 + .../server/core/resource/DetectorResource.java | 2 +- .../server/core/resource/MetadataResource.java | 8 +- .../server/core/resource/PipesParsingHelper.java | 2 +- .../core/resource/RecursiveMetadataResource.java | 23 +- .../server/core/resource/ServerHandlerConfig.java | 5 +- .../tika/server/core/resource/TikaResource.java | 24 +- .../server/core/resource/UnpackerResource.java | 4 +- .../org/apache/tika/server/core/CXFTestBase.java | 1 - .../resources/configs/cxf-test-base-template.json | 1 - .../standard/resource/XMPMetadataResource.java | 4 +- .../resources/configs/cxf-test-base-template.json | 1 - .../configs/tika-config-for-server-tests.json | 1 - .../tika-config-langdetect-opennlp-filter.json | 1 - .../tika-config-langdetect-optimaize-filter.json | 1 - 136 files changed, 1968 insertions(+), 577 deletions(-) diff --cc tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java index 28a83569ab,516d3679ea..2aae9db385 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java @@@ -90,10 -90,10 +90,10 @@@ public class AppleSingleFileParser impl long bytesRead = 26; List<FieldInfo> fieldInfoList = getSortedFieldInfoList(tis, numEntries); bytesRead += 12 * numEntries; - Metadata embeddedMetadata = context.newMetadata(); + Metadata embeddedMetadata = Metadata.newInstance(context); bytesRead = processFieldEntries(tis, fieldInfoList, embeddedMetadata, bytesRead); FieldInfo contentFieldInfo = getContentFieldInfo(fieldInfoList); - XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context); xhtml.startDocument(); if (contentFieldInfo != null) { long diff = contentFieldInfo.offset - bytesRead; diff --cc tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java index 7f9c051412,67674899b3..d8d1d40e6b --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java @@@ -236,10 -236,10 +236,10 @@@ public class CompressorParser implement } - XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context); xhtml.startDocument(); try { - Metadata entrydata = context.newMetadata(); + Metadata entrydata = Metadata.newInstance(context); if (cis instanceof GzipCompressorInputStream) { extractGzipMetadata((GzipCompressorInputStream) cis, entrydata); } diff --cc tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json index 25f0d5dce1,9687287ca5..8196d32f10 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json @@@ -1,8 -1,8 +1,7 @@@ { "auto-detect-parser": { - "outputThreshold": 678900, "embeddedDocumentExtractorFactory": { "runpack-extractor-factory": { - "writeFileNameToContent": false } } } diff --cc tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json index 0c812d63b2,39ddeff844..65148f0f61 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json @@@ -1,11 -1,8 +1,7 @@@ { "auto-detect-parser": { - "outputThreshold": 1000, - "maximumCompressionRatio": 0.8, - "maximumDepth": 1000, - "maximumPackageEntryDepth": 1000, "embeddedDocumentExtractorFactory": { "runpack-extractor-factory": { - "writeFileNameToContent": true, "embeddedBytesIncludeMimeTypes": [ "text/pdf" ], diff --cc tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json index 3edd26dedb,abea0d901e..0c90785bd1 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json @@@ -1,8 -1,8 +1,7 @@@ { "auto-detect-parser": { - "outputThreshold": 678900, "embeddedDocumentExtractorFactory": { "runpack-extractor-factory": { - "writeFileNameToContent": true } } } diff --cc tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json index 90141d91af,50d9875b25..b02932ebe7 --- a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json +++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json @@@ -45,9 -44,9 +44,8 @@@ } }, "auto-detect-parser": { - "outputThreshold": 1000000, "embeddedDocumentExtractorFactory": { "runpack-extractor-factory": { - "writeFileNameToContent": false, "maxEmbeddedBytesForExtraction": 10 } }, diff --cc tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java index 7183d69962,2c5960ed25..95d6197598 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java @@@ -404,26 -407,19 +408,20 @@@ public class TikaLoader */ public ParseContext loadParseContext() throws TikaConfigException { ParseContext context = new ParseContext(); + loadOne(DigesterFactory.class, context); + loadOne(MetadataWriteLimiterFactory.class, context); + loadOne(EmbeddedLimits.class, context); + loadOne(OutputLimits.class, context); + loadOne(TimeoutLimits.class, context); ++ loadOne(SAXOutputConfig.class, context); + return context; + } - // Load DigesterFactory from other-configs if present - DigesterFactory digesterFactory = configs().load("digester-factory", DigesterFactory.class); - if (digesterFactory != null) { - context.set(DigesterFactory.class, digesterFactory); - } - - // Load MetadataWriteLimiterFactory from other-configs if present - MetadataWriteLimiterFactory metadataWriteLimiterFactory = configs().load(MetadataWriteLimiterFactory.class); - if (metadataWriteLimiterFactory != null) { - context.set(MetadataWriteLimiterFactory.class, metadataWriteLimiterFactory); - } - - // Load SAXOutputConfig from other-configs if present - SAXOutputConfig saxOutputConfig = configs().load(SAXOutputConfig.class); - if (saxOutputConfig != null) { - context.set(SAXOutputConfig.class, saxOutputConfig); + private <T> void loadOne(Class<T> clazz, ParseContext context) throws TikaConfigException { + T instnce = configs().load(clazz); + if (instnce != null) { + context.set(clazz, instnce); } - - return context; } /** diff --cc tika-serialization/src/test/resources/configs/TIKA-4207-embedded-bytes-config.json index 0a0ef6e845,106205e5c5..5cc734f2be --- a/tika-serialization/src/test/resources/configs/TIKA-4207-embedded-bytes-config.json +++ b/tika-serialization/src/test/resources/configs/TIKA-4207-embedded-bytes-config.json @@@ -3,9 -3,9 +3,8 @@@ "default-parser" ], "auto-detect-parser": { - "outputThreshold": 678900, "embeddedDocumentExtractorFactory": { "runpack-extractor-factory": { - "writeFileNameToContent": false, "embeddedBytesIncludeMimeTypes": ["application/pdf", "application/rtf", "text/plain"], "embeddedBytesIncludeEmbeddedResourceTypes": ["ATTACHMENT", "INLINE"] }
