This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new be3f6f825 TIKA-4007 -- allow users to turn off digesting for the 
container document (#1053)
be3f6f825 is described below

commit be3f6f825b3fcba2c1c02d41d2225aeca3a9c927
Author: Tim Allison <[email protected]>
AuthorDate: Mon Apr 10 15:59:56 2023 -0400

    TIKA-4007 -- allow users to turn off digesting for the container document 
(#1053)
    
    * TIKA-4007 -- allow users to turn off digesting of the container document
---
 .../batch/DigestingAutoDetectParserFactory.java    |  6 +++-
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |  2 +-
 .../src/main/java/org/apache/tika/gui/TikaGUI.java |  2 +-
 .../org/apache/tika/parser/AutoDetectParser.java   |  6 ++--
 .../org/apache/tika/parser/DigestingParser.java    | 24 ++++++++++++++--
 .../parser/digestutils/CommonsDigesterFactory.java | 13 +++++++++
 .../tika/parser/AutoDetectParserConfigTest.java    | 21 ++++++++++++++
 .../parser/BouncyCastleDigestingParserTest.java    | 10 +++----
 .../apache/tika/parser/DigestingParserTest.java    | 11 +++++---
 .../tika/parser/RecursiveParserWrapperTest.java    |  2 +-
 .../configs/tika-config-digests-skip-container.xml | 33 ++++++++++++++++++++++
 .../tika/server/core/resource/TikaResource.java    |  7 ++++-
 12 files changed, 118 insertions(+), 19 deletions(-)

diff --git 
a/tika-app/src/main/java/org/apache/tika/batch/DigestingAutoDetectParserFactory.java
 
b/tika-app/src/main/java/org/apache/tika/batch/DigestingAutoDetectParserFactory.java
index 9f5b0cab6..5f54ed49e 100644
--- 
a/tika-app/src/main/java/org/apache/tika/batch/DigestingAutoDetectParserFactory.java
+++ 
b/tika-app/src/main/java/org/apache/tika/batch/DigestingAutoDetectParserFactory.java
@@ -33,7 +33,11 @@ public class DigestingAutoDetectParserFactory extends 
ParserFactory {
         if (digester == null) {
             return p;
         }
-        return new DigestingParser(p, digester);
+        boolean skipContainerDocument = false;
+        if (config.getAutoDetectParserConfig().getDigesterFactory() != null) {
+            skipContainerDocument = 
config.getAutoDetectParserConfig().getDigesterFactory().isSkipContainerDocument();
+        }
+        return new DigestingParser(p, digester, skipContainerDocument);
     }
 
     public void setDigester(DigestingParser.Digester digester) {
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java 
b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 5cd39bebc..46fe98fb1 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -698,7 +698,7 @@ public class TikaCLI {
 
             parser = new AutoDetectParser(config);
             if (digester != null) {
-                parser = new DigestingParser(parser, digester);
+                parser = new DigestingParser(parser, digester, false);
                 LOG.info("As of Tika 2.5.0, you can set the digester via the 
AutoDetectParserConfig in " +
                         "tika-config.xml. We plan to remove this commandline 
option in 2.8.0");
             }
diff --git a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java 
b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
index c942ee2ca..8bb54ac5e 100644
--- a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
+++ b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
@@ -201,7 +201,7 @@ public class TikaGUI extends JFrame implements 
ActionListener, HyperlinkListener
         SwingUtilities.invokeLater(() -> new TikaGUI(
                 new DigestingParser(new AutoDetectParser(finalConfig),
                         new CommonsDigester(MAX_MARK, 
CommonsDigester.DigestAlgorithm.MD5,
-                                
CommonsDigester.DigestAlgorithm.SHA256))).setVisible(true));
+                                CommonsDigester.DigestAlgorithm.SHA256), 
false)).setVisible(true));
     }
 
     private void addMenuBar() {
diff --git 
a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java 
b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index 491ad572e..45e972c20 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -109,7 +109,8 @@ public class AutoDetectParser extends CompositeParser {
             return fallback;
         } else {
             return new DigestingParser(fallback,
-                    
config.getAutoDetectParserConfig().getDigesterFactory().build());
+                    
config.getAutoDetectParserConfig().getDigesterFactory().build(),
+                    
config.getAutoDetectParserConfig().getDigesterFactory().isSkipContainerDocument());
         }
 
     }
@@ -119,7 +120,8 @@ public class AutoDetectParser extends CompositeParser {
             return config.getParser();
         }
         return new DigestingParser(config.getParser(),
-                
config.getAutoDetectParserConfig().getDigesterFactory().build());
+                
config.getAutoDetectParserConfig().getDigesterFactory().build(),
+                
config.getAutoDetectParserConfig().getDigesterFactory().isSkipContainerDocument());
     }
 
     /**
diff --git 
a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java 
b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
index 30736091a..8c0358da7 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
@@ -28,19 +28,21 @@ import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 
 public class DigestingParser extends ParserDecorator {
 
     private final Digester digester;
-
+    private final boolean skipContainerDocument;
     /**
      * Creates a decorator for the given parser.
      *
      * @param parser the parser instance to be decorated
      */
-    public DigestingParser(Parser parser, Digester digester) {
+    public DigestingParser(Parser parser, Digester digester, boolean 
skipContainerDocument) {
         super(parser);
         this.digester = digester;
+        this.skipContainerDocument = skipContainerDocument;
     }
 
     @Override
@@ -49,7 +51,7 @@ public class DigestingParser extends ParserDecorator {
         TemporaryResources tmp = new TemporaryResources();
         TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
         try {
-            if (digester != null) {
+            if (shouldDigest(metadata)) {
                 digester.digest(tis, metadata, context);
             }
             super.parse(tis, handler, metadata, context);
@@ -58,12 +60,28 @@ public class DigestingParser extends ParserDecorator {
         }
     }
 
+    private boolean shouldDigest(Metadata metadata) {
+        if (digester == null) {
+            return false;
+        }
+        if (! skipContainerDocument) {
+            return true;
+        }
+        Integer parseDepth = 
metadata.getInt(TikaCoreProperties.EMBEDDED_DEPTH);
+        if (parseDepth == null || parseDepth == 0) {
+            return false;
+        }
+        return true;
+    }
+
     /**
      * This is used in {@link AutoDetectParserConfig} to (optionally)
      * wrap the parser in a digesting parser.
      */
     public interface DigesterFactory {
         Digester build();
+        void setSkipContainerDocument(boolean skipContainerDocument);
+        boolean isSkipContainerDocument();
     }
 
         /**
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java
index 17d0765bc..d37f7acb1 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java
@@ -28,6 +28,8 @@ public class CommonsDigesterFactory implements 
DigestingParser.DigesterFactory {
     private int markLimit = 1000000;
     private String algorithmString = "md5";
 
+    private boolean skipContainerDocument = false;
+
     @Override
     public DigestingParser.Digester build() {
         return new CommonsDigester(markLimit, algorithmString);
@@ -42,4 +44,15 @@ public class CommonsDigesterFactory implements 
DigestingParser.DigesterFactory {
     public void setAlgorithmString(String algorithmString) {
         this.algorithmString = algorithmString;
     }
+
+    @Field
+    @Override
+    public void setSkipContainerDocument(boolean skipContainerDocument) {
+        this.skipContainerDocument = skipContainerDocument;
+    }
+
+    @Override
+    public boolean isSkipContainerDocument() {
+        return skipContainerDocument;
+    }
 }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
index 7ef747157..f1efedb59 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
@@ -17,6 +17,7 @@
 package org.apache.tika.parser;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
 
 import java.io.InputStream;
 import java.nio.file.Files;
@@ -124,6 +125,26 @@ public class AutoDetectParserConfigTest extends TikaTest {
                 metadataList.get(6).get("X-TIKA:digest:MD5"));
     }
 
+    @Test
+    public void testDigestsSkipContainer() throws Exception {
+        //test to make sure that the container document is not digested
+        //when skipContainerDocument is true, but embedded documents still are
+        TikaConfig tikaConfig = null;
+        try (InputStream is = 
AutoDetectParserConfigTest.class.getResourceAsStream(
+                "/configs/tika-config-digests-skip-container.xml")) {
+            tikaConfig = new TikaConfig(is);
+        }
+        Parser p = new AutoDetectParser(tikaConfig);
+        List<Metadata> metadataList = 
getRecursiveMetadata("testPPT_EmbeddedPDF.pptx", p);
+        assertNull(metadataList.get(0).get("X-TIKA:digest:SHA256"));
+        assertNull(metadataList.get(0).get("X-TIKA:digest:MD5"));
+
+        
assertEquals("Q7D3RFV6DNGZ4BQIS6UKNWX4CDIKPIGDU2D7ADBUDVOBYSZHF7FQ====",
+                metadataList.get(6).get("X-TIKA:digest:SHA256"));
+        assertEquals("90a8b249a6d6b6cb127c59e01cef3aaa",
+                metadataList.get(6).get("X-TIKA:digest:MD5"));
+    }
+
     @Test
     public void testDigestsEmptyParser() throws Exception {
         //TIKA-3939 -- ensure that digesting happens even with EmptyParser
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/BouncyCastleDigestingParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/BouncyCastleDigestingParserTest.java
index 474ba7717..9971b7e03 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/BouncyCastleDigestingParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/BouncyCastleDigestingParserTest.java
@@ -76,7 +76,7 @@ public class BouncyCastleDigestingParserTest extends TikaTest 
{
             Metadata m = new Metadata();
             XMLResult xml = getXML("test_recursive_embedded.docx",
                     new DigestingParser(AUTO_DETECT_PARSER,
-                            new BouncyCastleDigester(UNLIMITED, algo)), m);
+                            new BouncyCastleDigester(UNLIMITED, algo), false), 
m);
             assertEquals(expected.get(algo), m.get(P + algo));
         }
 
@@ -105,7 +105,7 @@ public class BouncyCastleDigestingParserTest extends 
TikaTest {
         Metadata m = new Metadata();
         XMLResult xml = getXML("test_recursive_embedded.docx",
                 new DigestingParser(AUTO_DETECT_PARSER, new 
BouncyCastleDigester(UNLIMITED,
-                        "MD5,SHA256,SHA384,SHA512,SHA3-512,SHA1:32")), m);
+                        "MD5,SHA256,SHA384,SHA512,SHA3-512,SHA1:32"), false), 
m);
         for (String algo : new String[]{"MD5", "SHA256", "SHA384", "SHA512", 
"SHA3-512", "SHA1"}) {
             assertEquals(expected.get(algo), m.get(P + algo));
         }
@@ -119,7 +119,7 @@ public class BouncyCastleDigestingParserTest extends 
TikaTest {
         String expectedMD5 = "59f626e09a8c16ab6dbc2800c685f772";
         Metadata m = new Metadata();
         XMLResult xml = getXML("test_recursive_embedded.docx",
-                new DigestingParser(AUTO_DETECT_PARSER, new 
BouncyCastleDigester(100, "MD5")), m);
+                new DigestingParser(AUTO_DETECT_PARSER, new 
BouncyCastleDigester(100, "MD5"), false), m);
         assertEquals(expectedMD5, m.get(P + "MD5"));
     }
 
@@ -127,7 +127,7 @@ public class BouncyCastleDigestingParserTest extends 
TikaTest {
     public void testNegativeMaxMarkLength() throws Exception {
         assertThrows(IllegalArgumentException.class, () -> {
             getXML("test_recursive_embedded.docx",
-                    new DigestingParser(AUTO_DETECT_PARSER, new 
BouncyCastleDigester(-1, "MD5")));
+                    new DigestingParser(AUTO_DETECT_PARSER, new 
BouncyCastleDigester(-1, "MD5"), false));
         });
     }
 
@@ -135,7 +135,7 @@ public class BouncyCastleDigestingParserTest extends 
TikaTest {
     public void testUnrecognizedEncodingOptions() throws Exception {
         assertThrows(IllegalArgumentException.class, () -> {
             getXML("test_recursive_embedded.docx", new 
DigestingParser(AUTO_DETECT_PARSER,
-                    new BouncyCastleDigester(100000, "MD5:33")));
+                    new BouncyCastleDigester(100000, "MD5:33"), false));
         });
     }
 
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java
index 2a377c7f5..508d6f4d7 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java
@@ -75,7 +75,7 @@ public class DigestingParserTest extends TikaTest {
         for (CommonsDigester.DigestAlgorithm algo : 
CommonsDigester.DigestAlgorithm.values()) {
             Metadata m = new Metadata();
             XMLResult xml = getXML("test_recursive_embedded.docx",
-                    new DigestingParser(AUTO_DETECT_PARSER, new 
CommonsDigester(UNLIMITED, algo)),
+                    new DigestingParser(AUTO_DETECT_PARSER, new 
CommonsDigester(UNLIMITED, algo), false),
                     m);
             assertEquals(expected.get(algo), m.get(P + algo.toString()), 
algo.toString());
         }
@@ -103,7 +103,8 @@ public class DigestingParserTest extends TikaTest {
         Metadata m = new Metadata();
         XMLResult xml = getXML("test_recursive_embedded.docx",
                 new DigestingParser(AUTO_DETECT_PARSER,
-                        new CommonsDigester(UNLIMITED, 
"md5,sha256,sha384,sha512,sha1:32")), m);
+                        new CommonsDigester(UNLIMITED, 
"md5,sha256,sha384,sha512,sha1:32"), false)
+                , m);
         for (CommonsDigester.DigestAlgorithm algo : new 
CommonsDigester.DigestAlgorithm[]{
                 CommonsDigester.DigestAlgorithm.MD5, 
CommonsDigester.DigestAlgorithm.SHA1,
                 CommonsDigester.DigestAlgorithm.SHA256, 
CommonsDigester.DigestAlgorithm.SHA384,
@@ -120,7 +121,8 @@ public class DigestingParserTest extends TikaTest {
         Metadata m = new Metadata();
         XMLResult xml = getXML("test_recursive_embedded.docx",
                 new DigestingParser(AUTO_DETECT_PARSER,
-                        new CommonsDigester(100, 
CommonsDigester.DigestAlgorithm.MD5)), m);
+                        new CommonsDigester(100, 
CommonsDigester.DigestAlgorithm.MD5),false)
+                , m);
         assertEquals(expectedMD5, m.get(P + "MD5"));
     }
 
@@ -131,7 +133,8 @@ public class DigestingParserTest extends TikaTest {
         try {
             XMLResult xml = getXML("test_recursive_embedded.docx",
                     new DigestingParser(AUTO_DETECT_PARSER,
-                            new CommonsDigester(-1, 
CommonsDigester.DigestAlgorithm.MD5)), m);
+                            new CommonsDigester(-1, 
CommonsDigester.DigestAlgorithm.MD5),
+                            false), m);
         } catch (IllegalArgumentException e) {
             ex = true;
         }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 28b22a29a..24800926a 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -392,7 +392,7 @@ public class RecursiveParserWrapperTest extends TikaTest {
         ParseContext context = new ParseContext();
         Parser wrapped = AUTO_DETECT_PARSER;
         if (digester != null) {
-            wrapped = new DigestingParser(wrapped, digester);
+            wrapped = new DigestingParser(wrapped, digester, false);
         }
         RecursiveParserWrapper wrapper =
                 new RecursiveParserWrapper(wrapped, catchEmbeddedExceptions);
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.xml
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.xml
new file mode 100644
index 000000000..22823dc3c
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.xml
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser"/>
+  </parsers>
+  <autoDetectParserConfig>
+    <spoolToDisk>1000000</spoolToDisk>
+    <outputThreshold>1000000</outputThreshold>
+    <digesterFactory
+        class="org.apache.tika.parser.digestutils.CommonsDigesterFactory">
+      <markLimit>100000</markLimit>
+      <algorithmString>sha256:32,md5</algorithmString>
+      <skipContainerDocument>true</skipContainerDocument>
+    </digesterFactory>
+    <throwOnZeroBytes>false</throwOnZeroBytes>
+  </autoDetectParserConfig>
+</properties>
diff --git 
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
index 3d4954df8..aadf86f30 100644
--- 
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
+++ 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
@@ -120,7 +120,12 @@ public class TikaResource {
         final Parser parser = new AutoDetectParser(TIKA_CONFIG);
 
         if (DIGESTER != null) {
-            return new DigestingParser(parser, DIGESTER);
+            boolean skipContainer = false;
+            if (TIKA_CONFIG.getAutoDetectParserConfig().getDigesterFactory() 
!= null &&
+                    
TIKA_CONFIG.getAutoDetectParserConfig().getDigesterFactory().isSkipContainerDocument())
 {
+                skipContainer = true;
+            }
+            return new DigestingParser(parser, DIGESTER, skipContainer);
         }
         return parser;
     }

Reply via email to