This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new be3f6f825 TIKA-4007 -- allow users to turn off digesting for the
container document (#1053)
be3f6f825 is described below
commit be3f6f825b3fcba2c1c02d41d2225aeca3a9c927
Author: Tim Allison <[email protected]>
AuthorDate: Mon Apr 10 15:59:56 2023 -0400
TIKA-4007 -- allow users to turn off digesting for the container document
(#1053)
* TIKA-4007 -- allow users to turn off digesting of the container document
---
.../batch/DigestingAutoDetectParserFactory.java | 6 +++-
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 2 +-
.../src/main/java/org/apache/tika/gui/TikaGUI.java | 2 +-
.../org/apache/tika/parser/AutoDetectParser.java | 6 ++--
.../org/apache/tika/parser/DigestingParser.java | 24 ++++++++++++++--
.../parser/digestutils/CommonsDigesterFactory.java | 13 +++++++++
.../tika/parser/AutoDetectParserConfigTest.java | 21 ++++++++++++++
.../parser/BouncyCastleDigestingParserTest.java | 10 +++----
.../apache/tika/parser/DigestingParserTest.java | 11 +++++---
.../tika/parser/RecursiveParserWrapperTest.java | 2 +-
.../configs/tika-config-digests-skip-container.xml | 33 ++++++++++++++++++++++
.../tika/server/core/resource/TikaResource.java | 7 ++++-
12 files changed, 118 insertions(+), 19 deletions(-)
diff --git
a/tika-app/src/main/java/org/apache/tika/batch/DigestingAutoDetectParserFactory.java
b/tika-app/src/main/java/org/apache/tika/batch/DigestingAutoDetectParserFactory.java
index 9f5b0cab6..5f54ed49e 100644
---
a/tika-app/src/main/java/org/apache/tika/batch/DigestingAutoDetectParserFactory.java
+++
b/tika-app/src/main/java/org/apache/tika/batch/DigestingAutoDetectParserFactory.java
@@ -33,7 +33,11 @@ public class DigestingAutoDetectParserFactory extends
ParserFactory {
if (digester == null) {
return p;
}
- return new DigestingParser(p, digester);
+ boolean skipContainerDocument = false;
+ if (config.getAutoDetectParserConfig().getDigesterFactory() != null) {
+ skipContainerDocument =
config.getAutoDetectParserConfig().getDigesterFactory().isSkipContainerDocument();
+ }
+ return new DigestingParser(p, digester, skipContainerDocument);
}
public void setDigester(DigestingParser.Digester digester) {
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 5cd39bebc..46fe98fb1 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -698,7 +698,7 @@ public class TikaCLI {
parser = new AutoDetectParser(config);
if (digester != null) {
- parser = new DigestingParser(parser, digester);
+ parser = new DigestingParser(parser, digester, false);
LOG.info("As of Tika 2.5.0, you can set the digester via the
AutoDetectParserConfig in " +
"tika-config.xml. We plan to remove this commandline
option in 2.8.0");
}
diff --git a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
index c942ee2ca..8bb54ac5e 100644
--- a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
+++ b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
@@ -201,7 +201,7 @@ public class TikaGUI extends JFrame implements
ActionListener, HyperlinkListener
SwingUtilities.invokeLater(() -> new TikaGUI(
new DigestingParser(new AutoDetectParser(finalConfig),
new CommonsDigester(MAX_MARK,
CommonsDigester.DigestAlgorithm.MD5,
-
CommonsDigester.DigestAlgorithm.SHA256))).setVisible(true));
+ CommonsDigester.DigestAlgorithm.SHA256),
false)).setVisible(true));
}
private void addMenuBar() {
diff --git
a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index 491ad572e..45e972c20 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -109,7 +109,8 @@ public class AutoDetectParser extends CompositeParser {
return fallback;
} else {
return new DigestingParser(fallback,
-
config.getAutoDetectParserConfig().getDigesterFactory().build());
+
config.getAutoDetectParserConfig().getDigesterFactory().build(),
+
config.getAutoDetectParserConfig().getDigesterFactory().isSkipContainerDocument());
}
}
@@ -119,7 +120,8 @@ public class AutoDetectParser extends CompositeParser {
return config.getParser();
}
return new DigestingParser(config.getParser(),
-
config.getAutoDetectParserConfig().getDigesterFactory().build());
+
config.getAutoDetectParserConfig().getDigesterFactory().build(),
+
config.getAutoDetectParserConfig().getDigesterFactory().isSkipContainerDocument());
}
/**
diff --git
a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
index 30736091a..8c0358da7 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
@@ -28,19 +28,21 @@ import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
public class DigestingParser extends ParserDecorator {
private final Digester digester;
-
+ private final boolean skipContainerDocument;
/**
* Creates a decorator for the given parser.
*
* @param parser the parser instance to be decorated
*/
- public DigestingParser(Parser parser, Digester digester) {
+ public DigestingParser(Parser parser, Digester digester, boolean
skipContainerDocument) {
super(parser);
this.digester = digester;
+ this.skipContainerDocument = skipContainerDocument;
}
@Override
@@ -49,7 +51,7 @@ public class DigestingParser extends ParserDecorator {
TemporaryResources tmp = new TemporaryResources();
TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
try {
- if (digester != null) {
+ if (shouldDigest(metadata)) {
digester.digest(tis, metadata, context);
}
super.parse(tis, handler, metadata, context);
@@ -58,12 +60,28 @@ public class DigestingParser extends ParserDecorator {
}
}
+ private boolean shouldDigest(Metadata metadata) {
+ if (digester == null) {
+ return false;
+ }
+ if (! skipContainerDocument) {
+ return true;
+ }
+ Integer parseDepth =
metadata.getInt(TikaCoreProperties.EMBEDDED_DEPTH);
+ if (parseDepth == null || parseDepth == 0) {
+ return false;
+ }
+ return true;
+ }
+
/**
* This is used in {@link AutoDetectParserConfig} to (optionally)
* wrap the parser in a digesting parser.
*/
public interface DigesterFactory {
Digester build();
+ void setSkipContainerDocument(boolean skipContainerDocument);
+ boolean isSkipContainerDocument();
}
/**
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java
index 17d0765bc..d37f7acb1 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigesterFactory.java
@@ -28,6 +28,8 @@ public class CommonsDigesterFactory implements
DigestingParser.DigesterFactory {
private int markLimit = 1000000;
private String algorithmString = "md5";
+ private boolean skipContainerDocument = false;
+
@Override
public DigestingParser.Digester build() {
return new CommonsDigester(markLimit, algorithmString);
@@ -42,4 +44,15 @@ public class CommonsDigesterFactory implements
DigestingParser.DigesterFactory {
public void setAlgorithmString(String algorithmString) {
this.algorithmString = algorithmString;
}
+
+ @Field
+ @Override
+ public void setSkipContainerDocument(boolean skipContainerDocument) {
+ this.skipContainerDocument = skipContainerDocument;
+ }
+
+ @Override
+ public boolean isSkipContainerDocument() {
+ return skipContainerDocument;
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
index 7ef747157..f1efedb59 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
@@ -17,6 +17,7 @@
package org.apache.tika.parser;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
import java.io.InputStream;
import java.nio.file.Files;
@@ -124,6 +125,26 @@ public class AutoDetectParserConfigTest extends TikaTest {
metadataList.get(6).get("X-TIKA:digest:MD5"));
}
+ @Test
+ public void testDigestsSkipContainer() throws Exception {
+ //test to make sure that the decorator is only applied once for
+ //legacy (e.g. not RecursiveParserWrapperHandler) parsing
+ TikaConfig tikaConfig = null;
+ try (InputStream is =
AutoDetectParserConfigTest.class.getResourceAsStream(
+ "/configs/tika-config-digests-skip-container.xml")) {
+ tikaConfig = new TikaConfig(is);
+ }
+ Parser p = new AutoDetectParser(tikaConfig);
+ List<Metadata> metadataList =
getRecursiveMetadata("testPPT_EmbeddedPDF.pptx", p);
+ assertNull(metadataList.get(0).get("X-TIKA:digest:SHA256"));
+ assertNull(metadataList.get(0).get("X-TIKA:digest:MD5"));
+
+
assertEquals("Q7D3RFV6DNGZ4BQIS6UKNWX4CDIKPIGDU2D7ADBUDVOBYSZHF7FQ====",
+ metadataList.get(6).get("X-TIKA:digest:SHA256"));
+ assertEquals("90a8b249a6d6b6cb127c59e01cef3aaa",
+ metadataList.get(6).get("X-TIKA:digest:MD5"));
+ }
+
@Test
public void testDigestsEmptyParser() throws Exception {
//TIKA-3939 -- ensure that digesting happens even with EmptyParser
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/BouncyCastleDigestingParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/BouncyCastleDigestingParserTest.java
index 474ba7717..9971b7e03 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/BouncyCastleDigestingParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/BouncyCastleDigestingParserTest.java
@@ -76,7 +76,7 @@ public class BouncyCastleDigestingParserTest extends TikaTest
{
Metadata m = new Metadata();
XMLResult xml = getXML("test_recursive_embedded.docx",
new DigestingParser(AUTO_DETECT_PARSER,
- new BouncyCastleDigester(UNLIMITED, algo)), m);
+ new BouncyCastleDigester(UNLIMITED, algo), false),
m);
assertEquals(expected.get(algo), m.get(P + algo));
}
@@ -105,7 +105,7 @@ public class BouncyCastleDigestingParserTest extends
TikaTest {
Metadata m = new Metadata();
XMLResult xml = getXML("test_recursive_embedded.docx",
new DigestingParser(AUTO_DETECT_PARSER, new
BouncyCastleDigester(UNLIMITED,
- "MD5,SHA256,SHA384,SHA512,SHA3-512,SHA1:32")), m);
+ "MD5,SHA256,SHA384,SHA512,SHA3-512,SHA1:32"), false),
m);
for (String algo : new String[]{"MD5", "SHA256", "SHA384", "SHA512",
"SHA3-512", "SHA1"}) {
assertEquals(expected.get(algo), m.get(P + algo));
}
@@ -119,7 +119,7 @@ public class BouncyCastleDigestingParserTest extends
TikaTest {
String expectedMD5 = "59f626e09a8c16ab6dbc2800c685f772";
Metadata m = new Metadata();
XMLResult xml = getXML("test_recursive_embedded.docx",
- new DigestingParser(AUTO_DETECT_PARSER, new
BouncyCastleDigester(100, "MD5")), m);
+ new DigestingParser(AUTO_DETECT_PARSER, new
BouncyCastleDigester(100, "MD5"), false), m);
assertEquals(expectedMD5, m.get(P + "MD5"));
}
@@ -127,7 +127,7 @@ public class BouncyCastleDigestingParserTest extends
TikaTest {
public void testNegativeMaxMarkLength() throws Exception {
assertThrows(IllegalArgumentException.class, () -> {
getXML("test_recursive_embedded.docx",
- new DigestingParser(AUTO_DETECT_PARSER, new
BouncyCastleDigester(-1, "MD5")));
+ new DigestingParser(AUTO_DETECT_PARSER, new
BouncyCastleDigester(-1, "MD5"), false));
});
}
@@ -135,7 +135,7 @@ public class BouncyCastleDigestingParserTest extends
TikaTest {
public void testUnrecognizedEncodingOptions() throws Exception {
assertThrows(IllegalArgumentException.class, () -> {
getXML("test_recursive_embedded.docx", new
DigestingParser(AUTO_DETECT_PARSER,
- new BouncyCastleDigester(100000, "MD5:33")));
+ new BouncyCastleDigester(100000, "MD5:33"), false));
});
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java
index 2a377c7f5..508d6f4d7 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java
@@ -75,7 +75,7 @@ public class DigestingParserTest extends TikaTest {
for (CommonsDigester.DigestAlgorithm algo :
CommonsDigester.DigestAlgorithm.values()) {
Metadata m = new Metadata();
XMLResult xml = getXML("test_recursive_embedded.docx",
- new DigestingParser(AUTO_DETECT_PARSER, new
CommonsDigester(UNLIMITED, algo)),
+ new DigestingParser(AUTO_DETECT_PARSER, new
CommonsDigester(UNLIMITED, algo), false),
m);
assertEquals(expected.get(algo), m.get(P + algo.toString()),
algo.toString());
}
@@ -103,7 +103,8 @@ public class DigestingParserTest extends TikaTest {
Metadata m = new Metadata();
XMLResult xml = getXML("test_recursive_embedded.docx",
new DigestingParser(AUTO_DETECT_PARSER,
- new CommonsDigester(UNLIMITED,
"md5,sha256,sha384,sha512,sha1:32")), m);
+ new CommonsDigester(UNLIMITED,
"md5,sha256,sha384,sha512,sha1:32"), false)
+ , m);
for (CommonsDigester.DigestAlgorithm algo : new
CommonsDigester.DigestAlgorithm[]{
CommonsDigester.DigestAlgorithm.MD5,
CommonsDigester.DigestAlgorithm.SHA1,
CommonsDigester.DigestAlgorithm.SHA256,
CommonsDigester.DigestAlgorithm.SHA384,
@@ -120,7 +121,8 @@ public class DigestingParserTest extends TikaTest {
Metadata m = new Metadata();
XMLResult xml = getXML("test_recursive_embedded.docx",
new DigestingParser(AUTO_DETECT_PARSER,
- new CommonsDigester(100,
CommonsDigester.DigestAlgorithm.MD5)), m);
+ new CommonsDigester(100,
CommonsDigester.DigestAlgorithm.MD5),false)
+ , m);
assertEquals(expectedMD5, m.get(P + "MD5"));
}
@@ -131,7 +133,8 @@ public class DigestingParserTest extends TikaTest {
try {
XMLResult xml = getXML("test_recursive_embedded.docx",
new DigestingParser(AUTO_DETECT_PARSER,
- new CommonsDigester(-1,
CommonsDigester.DigestAlgorithm.MD5)), m);
+ new CommonsDigester(-1,
CommonsDigester.DigestAlgorithm.MD5),
+ false), m);
} catch (IllegalArgumentException e) {
ex = true;
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 28b22a29a..24800926a 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -392,7 +392,7 @@ public class RecursiveParserWrapperTest extends TikaTest {
ParseContext context = new ParseContext();
Parser wrapped = AUTO_DETECT_PARSER;
if (digester != null) {
- wrapped = new DigestingParser(wrapped, digester);
+ wrapped = new DigestingParser(wrapped, digester, false);
}
RecursiveParserWrapper wrapper =
new RecursiveParserWrapper(wrapped, catchEmbeddedExceptions);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.xml
new file mode 100644
index 000000000..22823dc3c
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.xml
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser"/>
+ </parsers>
+ <autoDetectParserConfig>
+ <spoolToDisk>1000000</spoolToDisk>
+ <outputThreshold>1000000</outputThreshold>
+ <digesterFactory
+ class="org.apache.tika.parser.digestutils.CommonsDigesterFactory">
+ <markLimit>100000</markLimit>
+ <algorithmString>sha256:32,md5</algorithmString>
+ <skipContainerDocument>true</skipContainerDocument>
+ </digesterFactory>
+ <throwOnZeroBytes>false</throwOnZeroBytes>
+ </autoDetectParserConfig>
+</properties>
diff --git
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
index 3d4954df8..aadf86f30 100644
---
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
+++
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
@@ -120,7 +120,12 @@ public class TikaResource {
final Parser parser = new AutoDetectParser(TIKA_CONFIG);
if (DIGESTER != null) {
- return new DigestingParser(parser, DIGESTER);
+ boolean skipContainer = false;
+ if (TIKA_CONFIG.getAutoDetectParserConfig().getDigesterFactory()
!= null &&
+
TIKA_CONFIG.getAutoDetectParserConfig().getDigesterFactory().isSkipContainerDocument())
{
+ skipContainer = true;
+ }
+ return new DigestingParser(parser, DIGESTER, skipContainer);
}
return parser;
}