This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4638
in repository https://gitbox.apache.org/repos/asf/tika.git

commit edd627e17e62b5ff640d544db8495d44964fa64d
Merge: 492183218d 48ca355225
Author: tallison <[email protected]>
AuthorDate: Thu Jan 29 06:46:01 2026 -0500

    Merge branch 'main' into TIKA-4638
    
    Merged SAXOutputConfig loading with new loadOne() helper pattern
    and additional limit classes (EmbeddedLimits, OutputLimits, TimeoutLimits).

 .../ROOT/pages/advanced/setting-limits.adoc        | 356 ++++++++++++++++-----
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |  27 +-
 .../src/main/java/org/apache/tika/gui/TikaGUI.java |   8 +-
 .../test/resources/configs/config-template.json    |   1 -
 tika-core/src/main/java/org/apache/tika/Tika.java  |  26 +-
 .../org/apache/tika/config/EmbeddedLimits.java     | 222 +++++++++++++
 .../java/org/apache/tika/config/OutputLimits.java  | 269 ++++++++++++++++
 .../java/org/apache/tika/config/TimeoutLimits.java | 135 ++++++++
 .../exception/EmbeddedLimitReachedException.java   |  62 ++++
 .../tika/extractor/EmbeddedDocumentExtractor.java  |   9 +
 .../tika/extractor/ParserContainerExtractor.java   |   4 +-
 .../ParsingEmbeddedDocumentExtractor.java          |  49 ++-
 .../java/org/apache/tika/metadata/Metadata.java    |  24 ++
 .../apache/tika/metadata/TikaCoreProperties.java   |   7 +
 .../writefilter/MetadataWriteLimiterFactory.java   |   2 +-
 .../org/apache/tika/parser/AutoDetectParser.java   |  27 +-
 .../apache/tika/parser/AutoDetectParserConfig.java |  87 +----
 .../org/apache/tika/parser/CompositeParser.java    |   8 +-
 .../java/org/apache/tika/parser/ParseContext.java  |   6 +-
 .../java/org/apache/tika/parser/ParseRecord.java   |  80 +++--
 .../java/org/apache/tika/parser/ParsingReader.java |   4 +-
 .../apache/tika/parser/RecursiveParserWrapper.java |  16 -
 .../sax/AbstractRecursiveParserWrapperHandler.java |  24 +-
 .../tika/sax/BasicContentHandlerFactory.java       |  49 +--
 .../tika/sax/RecursiveParserWrapperHandler.java    |  20 +-
 .../org/apache/tika/sax/SecureContentHandler.java  |  24 ++
 .../apache/tika/sax/WriteOutContentHandler.java    |  18 ++
 .../org/apache/tika/MultiThreadedTikaTest.java     |   3 +-
 .../org/apache/tika/example/ParsingExample.java    |   2 +-
 .../src/test/resources/kafka/plugins-template.json |   1 -
 .../resources/opensearch/plugins-template.json     |   1 -
 .../opensearch/tika-config-opensearch.json         |   1 -
 .../src/test/resources/s3/plugins-template.json    |   1 -
 .../src/test/resources/solr/plugins-template.json  |   1 -
 .../apache/tika/parser/journal/TEIDOMParser.java   |   2 +-
 .../tika/parser/apple/AppleSingleFileParser.java   |   2 +-
 .../org/apache/tika/parser/apple/PListParser.java  |  10 +-
 .../parser/iwork/iwana/IWork13PackageParser.java   |   6 +-
 .../executable/UniversalExecutableParser.java      |   2 +-
 .../org/apache/tika/parser/crypto/Pkcs7Parser.java |   2 +-
 .../org/apache/tika/parser/crypto/TSDParser.java   |   4 +-
 .../org/apache/tika/parser/html/HtmlHandler.java   |   8 +-
 .../apache/tika/parser/jdbc/JDBCTableReader.java   |   4 +-
 .../tika/parser/mail/MailContentHandler.java       |   4 +-
 .../org/apache/tika/parser/mbox/MboxParser.java    |   2 +-
 .../parser/microsoft/AbstractPOIFSExtractor.java   |  12 +-
 .../apache/tika/parser/microsoft/EMFParser.java    |   4 +-
 .../tika/parser/microsoft/HSLFExtractor.java       |   4 +-
 .../tika/parser/microsoft/JackcessExtractor.java   |   2 +-
 .../apache/tika/parser/microsoft/OfficeParser.java |   6 +-
 .../tika/parser/microsoft/OutlookExtractor.java    |  12 +-
 .../apache/tika/parser/microsoft/TNEFParser.java   |   2 +-
 .../tika/parser/microsoft/chm/ChmParser.java       |   2 +-
 .../tika/parser/microsoft/libpst/EmailVisitor.java |   2 +-
 .../microsoft/onenote/OneNoteTreeWalker.java       |   2 +-
 .../microsoft/ooxml/AbstractOOXMLExtractor.java    |  10 +-
 .../ooxml/XWPFWordExtractorDecorator.java          |   2 +-
 .../microsoft/ooxml/xps/XPSPageContentHandler.java |   2 +-
 .../ooxml/xwpf/ml2006/BinaryDataHandler.java       |   2 +-
 .../parser/microsoft/pst/OutlookPSTParser.java     |   2 +-
 .../parser/microsoft/pst/PSTMailItemParser.java    |   6 +-
 .../parser/microsoft/rtf/RTFEmbObjHandler.java     |   6 +-
 .../tika/parser/microsoft/xml/WordMLParser.java    |   2 +-
 .../org/apache/tika/parser/epub/EpubParser.java    |   4 +-
 .../apache/tika/parser/indesign/IDMLParser.java    |   6 +-
 .../parser/odf/FlatOpenDocumentMacroHandler.java   |   7 +-
 .../tika/parser/odf/OpenDocumentBodyHandler.java   |   7 +-
 .../tika/parser/odf/OpenDocumentMacroHandler.java  |   3 +-
 .../apache/tika/parser/odf/OpenDocumentParser.java |   4 +-
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |  12 +-
 .../java/org/apache/tika/parser/pdf/PDF2XHTML.java |   2 +-
 .../java/org/apache/tika/parser/pdf/PDFParser.java |   2 +-
 .../tika/parser/pdf/image/ImageGraphicsEngine.java |   2 +-
 .../tika/renderer/pdf/mutool/MuPDFRenderer.java    |   2 +-
 .../tika/renderer/pdf/pdfbox/PDFBoxRenderer.java   |   2 +-
 .../apache/tika/parser/pkg/CompressorParser.java   |   2 +-
 .../org/apache/tika/parser/pkg/PackageParser.java  |   2 +-
 .../org/apache/tika/parser/pkg/UnrarParser.java    |   2 +-
 .../org/apache/tika/parser/http/HttpParser.java    |   2 +-
 .../org/apache/tika/parser/wacz/WACZParser.java    |   2 +-
 .../org/apache/tika/parser/warc/WARCParser.java    |   2 +-
 .../apache/tika/parser/xml/FictionBookParser.java  |   2 +-
 .../tika/parser/AutoDetectParserConfigTest.java    |   2 +-
 .../tika/parser/RecursiveParserWrapperTest.java    |  17 +-
 .../apache/tika/parser/image/JpegParserTest.java   |   2 +-
 .../tika/parser/microsoft/rtf/RTFParserTest.java   |   3 +-
 .../src/test/resources/configs/tika-4533.json      |   3 -
 .../configs/tika-config-bc-digests-base32.json     |   1 -
 .../configs/tika-config-bc-digests-basic.json      |   1 -
 .../configs/tika-config-bc-digests-multiple.json   |   1 -
 .../configs/tika-config-commons-digests-basic.json |   1 -
 .../configs/tika-config-digests-pdf-only.json      |   1 -
 .../tika-config-digests-skip-container.json        |   1 -
 .../resources/configs/tika-config-digests.json     |   1 -
 ...a-config-doubling-custom-handler-decorator.json |   1 -
 .../resources/configs/tika-config-no-names.json    |   1 -
 ...a-config-upcasing-custom-handler-decorator.json |   4 -
 .../resources/configs/tika-config-with-names.json  |   1 -
 .../configs/tika-config-write-filter.json          |   1 -
 .../src/main/resources/config-template.json        |   1 -
 .../tika/pipes/core/server/ParseHandler.java       |  20 +-
 .../apache/tika/pipes/fork/PipesForkParser.java    |   6 +
 .../tika/pipes/fork/PipesForkParserConfig.java     |  32 +-
 .../test/resources/configs/tika-config-basic.json  |   2 -
 .../resources/configs/tika-config-passback.json    |   2 -
 .../resources/configs/tika-config-truncate.json    |   2 -
 .../resources/configs/tika-config-uppercasing.json |   1 -
 .../configs/tika-config-write-limiter.json         |   2 -
 .../org/apache/tika/config/loader/TikaLoader.java  |  34 +-
 .../java/org/apache/tika/config/AllLimitsTest.java | 156 +++++++++
 .../org/apache/tika/config/EmbeddedLimitsTest.java | 109 +++++++
 .../org/apache/tika/config/OutputLimitsTest.java   | 119 +++++++
 .../org/apache/tika/config/TimeoutLimitsTest.java  |  95 ++++++
 .../writefilter/StandardMetadataLimiterTest.java   |   6 +-
 .../test/resources/configs/TIKA-3695-exclude.json  |   3 -
 .../test/resources/configs/TIKA-3695-fields.json   |   3 -
 .../src/test/resources/configs/TIKA-3695.json      |   3 -
 .../configs/TIKA-4207-embedded-bytes-config.json   |   1 -
 .../test/resources/configs/all-limits-test.json    |  32 ++
 .../resources/configs/embedded-limits-test.json    |  10 +
 .../test/resources/configs/output-limits-test.json |  12 +
 .../resources/configs/timeout-limits-test.json     |   7 +
 .../server/core/resource/DetectorResource.java     |   2 +-
 .../server/core/resource/MetadataResource.java     |   8 +-
 .../server/core/resource/PipesParsingHelper.java   |   2 +-
 .../core/resource/RecursiveMetadataResource.java   |  23 +-
 .../server/core/resource/ServerHandlerConfig.java  |   5 +-
 .../tika/server/core/resource/TikaResource.java    |  24 +-
 .../server/core/resource/UnpackerResource.java     |   4 +-
 .../org/apache/tika/server/core/CXFTestBase.java   |   1 -
 .../resources/configs/cxf-test-base-template.json  |   1 -
 .../standard/resource/XMPMetadataResource.java     |   4 +-
 .../resources/configs/cxf-test-base-template.json  |   1 -
 .../configs/tika-config-for-server-tests.json      |   1 -
 .../tika-config-langdetect-opennlp-filter.json     |   1 -
 .../tika-config-langdetect-optimaize-filter.json   |   1 -
 136 files changed, 1968 insertions(+), 577 deletions(-)

diff --cc 
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
index 28a83569ab,516d3679ea..2aae9db385
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
@@@ -90,10 -90,10 +90,10 @@@ public class AppleSingleFileParser impl
          long bytesRead = 26;
          List<FieldInfo> fieldInfoList = getSortedFieldInfoList(tis, 
numEntries);
          bytesRead += 12 * numEntries;
-         Metadata embeddedMetadata = context.newMetadata();
+         Metadata embeddedMetadata = Metadata.newInstance(context);
          bytesRead = processFieldEntries(tis, fieldInfoList, embeddedMetadata, 
bytesRead);
          FieldInfo contentFieldInfo = getContentFieldInfo(fieldInfoList);
 -        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, 
metadata);
 +        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, 
metadata, context);
          xhtml.startDocument();
          if (contentFieldInfo != null) {
              long diff = contentFieldInfo.offset - bytesRead;
diff --cc 
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
index 7f9c051412,67674899b3..d8d1d40e6b
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
@@@ -236,10 -236,10 +236,10 @@@ public class CompressorParser implement
          }
  
  
 -        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, 
metadata);
 +        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, 
metadata, context);
          xhtml.startDocument();
          try {
-             Metadata entrydata = context.newMetadata();
+             Metadata entrydata = Metadata.newInstance(context);
              if (cis instanceof GzipCompressorInputStream) {
                  extractGzipMetadata((GzipCompressorInputStream) cis, 
entrydata);
              }
diff --cc 
tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json
index 25f0d5dce1,9687287ca5..8196d32f10
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json
@@@ -1,8 -1,8 +1,7 @@@
  {
    "auto-detect-parser": {
-     "outputThreshold": 678900,
      "embeddedDocumentExtractorFactory": {
        "runpack-extractor-factory": {
 -        "writeFileNameToContent": false
        }
      }
    }
diff --cc 
tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json
index 0c812d63b2,39ddeff844..65148f0f61
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json
@@@ -1,11 -1,8 +1,7 @@@
  {
    "auto-detect-parser": {
-     "outputThreshold": 1000,
-     "maximumCompressionRatio": 0.8,
-     "maximumDepth": 1000,
-     "maximumPackageEntryDepth": 1000,
      "embeddedDocumentExtractorFactory": {
        "runpack-extractor-factory": {
 -        "writeFileNameToContent": true,
          "embeddedBytesIncludeMimeTypes": [
            "text/pdf"
          ],
diff --cc 
tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json
index 3edd26dedb,abea0d901e..0c90785bd1
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json
@@@ -1,8 -1,8 +1,7 @@@
  {
    "auto-detect-parser": {
-     "outputThreshold": 678900,
      "embeddedDocumentExtractorFactory": {
        "runpack-extractor-factory": {
 -        "writeFileNameToContent": true
        }
      }
    }
diff --cc 
tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json
index 90141d91af,50d9875b25..b02932ebe7
--- 
a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json
+++ 
b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json
@@@ -45,9 -44,9 +44,8 @@@
      }
    },
    "auto-detect-parser": {
-     "outputThreshold": 1000000,
      "embeddedDocumentExtractorFactory": {
        "runpack-extractor-factory": {
 -        "writeFileNameToContent": false,
          "maxEmbeddedBytesForExtraction": 10
        }
      },
diff --cc 
tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
index 7183d69962,2c5960ed25..95d6197598
--- 
a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
+++ 
b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java
@@@ -404,26 -407,19 +408,20 @@@ public class TikaLoader 
       */
      public ParseContext loadParseContext() throws TikaConfigException {
          ParseContext context = new ParseContext();
+         loadOne(DigesterFactory.class, context);
+         loadOne(MetadataWriteLimiterFactory.class, context);
+         loadOne(EmbeddedLimits.class, context);
+         loadOne(OutputLimits.class, context);
+         loadOne(TimeoutLimits.class, context);
++        loadOne(SAXOutputConfig.class, context);
+         return context;
+     }
  
-         // Load DigesterFactory from other-configs if present
-         DigesterFactory digesterFactory = configs().load("digester-factory", 
DigesterFactory.class);
-         if (digesterFactory != null) {
-             context.set(DigesterFactory.class, digesterFactory);
-         }
- 
-         // Load MetadataWriteLimiterFactory from other-configs if present
-         MetadataWriteLimiterFactory metadataWriteLimiterFactory = 
configs().load(MetadataWriteLimiterFactory.class);
-         if (metadataWriteLimiterFactory != null) {
-             context.set(MetadataWriteLimiterFactory.class, 
metadataWriteLimiterFactory);
-         }
- 
-         // Load SAXOutputConfig from other-configs if present
-         SAXOutputConfig saxOutputConfig = 
configs().load(SAXOutputConfig.class);
-         if (saxOutputConfig != null) {
-             context.set(SAXOutputConfig.class, saxOutputConfig);
+     private <T> void loadOne(Class<T> clazz, ParseContext context) throws 
TikaConfigException {
+         T instance = configs().load(clazz);
+         if (instance != null) {
+             context.set(clazz, instance);
          }
- 
-         return context;
      }
  
      /**
diff --cc 
tika-serialization/src/test/resources/configs/TIKA-4207-embedded-bytes-config.json
index 0a0ef6e845,106205e5c5..5cc734f2be
--- 
a/tika-serialization/src/test/resources/configs/TIKA-4207-embedded-bytes-config.json
+++ 
b/tika-serialization/src/test/resources/configs/TIKA-4207-embedded-bytes-config.json
@@@ -3,9 -3,9 +3,8 @@@
      "default-parser"
    ],
    "auto-detect-parser": {
-     "outputThreshold": 678900,
      "embeddedDocumentExtractorFactory": {
        "runpack-extractor-factory": {
 -        "writeFileNameToContent": false,
          "embeddedBytesIncludeMimeTypes": ["application/pdf", 
"application/rtf", "text/plain"],
          "embeddedBytesIncludeEmbeddedResourceTypes": ["ATTACHMENT", "INLINE"]
        }

Reply via email to