This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 90d854faa TIKA-4349 -- allow uppercasing of hex encoded digests in the
CommonsDigester (#2044)
90d854faa is described below
commit 90d854faa2a711e0c467ec851399a0c928d037de
Author: Tim Allison <[email protected]>
AuthorDate: Wed Nov 13 13:57:36 2024 -0500
TIKA-4349 -- allow uppercasing of hex encoded digests in the
CommonsDigester (#2044)
* TIKA-4349 -- allow uppercasing of hex encoded digests in the
CommonsDigester
---
.../test/java/org/apache/tika/cli/TikaCLITest.java | 8 +--
.../tika/parser/digestutils/CommonsDigester.java | 62 ++++++++++++++++------
.../apache/tika/parser/DigestingParserTest.java | 4 +-
3 files changed, 51 insertions(+), 23 deletions(-)
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 0e55ac7db..ec6e7df1a 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -139,7 +139,7 @@ public class TikaCLITest {
String content = getParamOutContent("-x", resourcePrefix +
"alice.cli.test");
assertTrue(content.contains("?xml version=\"1.0\"
encoding=\"UTF-8\"?"));
- content = getParamOutContent("-x", "--digest=SHA256", resourcePrefix +
"alice.cli.test");
+ content = getParamOutContent("-x", "--digest=sha256", resourcePrefix +
"alice.cli.test");
assertTrue(content.contains("<meta name=\"X-TIKA:digest:SHA256\"
content=\"e90779adbac09c4ee"));
}
@@ -155,7 +155,7 @@ public class TikaCLITest {
assertTrue(content.contains("html
xmlns=\"http://www.w3.org/1999/xhtml"));
assertTrue(content.contains("<title></title>"), "Expanded
<title></title> element should be present");
- content = getParamOutContent("-h", "--digest=SHA384", resourcePrefix +
"alice.cli.test");
+ content = getParamOutContent("-h", "--digest=sha384", resourcePrefix +
"alice.cli.test");
assertTrue(content.contains("<meta name=\"X-TIKA:digest:SHA384\"
content=\"c69ea023f5da95a026"));
}
@@ -207,7 +207,7 @@ public class TikaCLITest {
content = getParamOutContent("-m", "--digest=SHA512", resourcePrefix +
"alice.cli.test");
assertTrue(content.contains("text/plain"));
- assertTrue(content.contains("X-TIKA:digest:SHA512:
dd459d99bc19ff78fd31fbae46e0"));
+ assertTrue(content.contains("X-TIKA:digest:SHA512:
DD459D99BC19FF78FD31FBAE46E0"));
}
/**
@@ -459,7 +459,7 @@ public class TikaCLITest {
@Test
public void testDigestInJson() throws Exception {
- String content = getParamOutContent("-J", "-r", "-t", "--digest=MD5",
resourcePrefix + "test_recursive_embedded.docx");
+ String content = getParamOutContent("-J", "-r", "-t", "--digest=md5",
resourcePrefix + "test_recursive_embedded.docx");
assertTrue(content.contains("\"X-TIKA:digest:MD5\" :
\"59f626e09a8c16ab6dbc2800c685f772\","));
assertTrue(content.contains("\"X-TIKA:digest:MD5\" :
\"f9627095ef86c482e61d99f0cc1cf87d\""));
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java
index 7da68e7af..44617ab34 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java
@@ -43,7 +43,8 @@ public class CommonsDigester extends CompositeDigester {
/**
* Include a string representing the comma-separated algorithms to run:
e.g. "md5,sha1".
* If you want base 32 encoding instead of hexadecimal, add ":32" to the
algorithm, e.g.
- * "md5,sha1:32"
+ * "md5,sha1:32". If you want uppercase digests for the hexadecimal
encoder,
+ * write the algorithm name entirely in uppercase, e.g. "MD5" (mixed case such as "Md5" yields lowercase).
* <p/>
* Will throw an IllegalArgumentException if an algorithm isn't supported
*
@@ -70,7 +71,7 @@ public class CommonsDigester extends CompositeDigester {
for (DigestAlgorithm algorithm : algorithms) {
digesters[i++] =
new InputStreamDigester(markLimit,
algorithm.getJavaName(), algorithm.name(),
- new HexEncoder());
+ new HexEncoder(false));
}
return digesters;
}
@@ -129,20 +130,7 @@ public class CommonsDigester extends CompositeDigester {
int i = 0;
for (String digest : digests) {
String[] parts = digest.split(":");
- DigestingParser.Encoder encoder = null;
- if (parts.length > 1) {
- if (parts[1].equals("16")) {
- encoder = new HexEncoder();
- } else if (parts[1].equals("32")) {
- encoder = new Base32Encoder();
- } else if (parts[1].equals("64")) {
- encoder = new Base64Encoder();
- } else {
- throw new IllegalArgumentException("Value must be '16',
'32' or '64'");
- }
- } else {
- encoder = new HexEncoder();
- }
+ DigestingParser.Encoder encoder = getEncoder(parts);
DigestAlgorithm digestAlgorithm = getDigestAlgorithm(parts[0]);
digesters[i++] = new InputStreamDigester(markLimit,
digestAlgorithm.getJavaName(),
digestAlgorithm.name(), encoder);
@@ -150,6 +138,25 @@ public class CommonsDigester extends CompositeDigester {
return digesters;
}
+ private static DigestingParser.Encoder getEncoder(String[] parts) {
+ DigestingParser.Encoder encoder = null;
+ boolean uc = parts[0].matches("[A-Z0-9]{1,20}");
+ if (parts.length > 1) {
+ if (parts[1].equals("16")) {
+ encoder = new HexEncoder(uc);
+ } else if (parts[1].equals("32")) {
+ encoder = new Base32Encoder();
+ } else if (parts[1].equals("64")) {
+ encoder = new Base64Encoder();
+ } else {
+ throw new IllegalArgumentException("Value must be '16', '32'
or '64'");
+ }
+ } else {
+ encoder = new HexEncoder(uc);
+ }
+ return encoder;
+ }
+
public enum DigestAlgorithm {
//those currently available in commons.digest
MD2("MD2"), MD5("MD5"), SHA1("SHA-1"), SHA256("SHA-256"),
SHA384("SHA-384"),
@@ -171,10 +178,31 @@ public class CommonsDigester extends CompositeDigester {
}
}
+ private static abstract class CasingEncoderBase implements
DigestingParser.Encoder {
+ private final boolean upperCase;
+ private CasingEncoderBase(boolean upperCase) {
+ this.upperCase = upperCase;
+ }
+
+ }
private static class HexEncoder implements DigestingParser.Encoder {
+ private final boolean upperCase;
+ private HexEncoder(boolean upperCase) {
+ this.upperCase = upperCase;
+ }
+
@Override
public String encode(byte[] bytes) {
- return Hex.encodeHexString(bytes);
+ return toCase(Hex.encodeHexString(bytes));
+ }
+
+ String toCase(String digest) {
+ if (upperCase) {
+ return digest.toUpperCase(Locale.ROOT);
+ } else {
+ //redundant today because the hex encoder already emits lowercase, but kept to guard against future changes
+ return digest.toLowerCase(Locale.ROOT);
+ }
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java
index 7ed78e055..44922ecf7 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java
@@ -90,7 +90,7 @@ public class DigestingParserTest extends TikaTest {
expected.put(CommonsDigester.DigestAlgorithm.MD2,
"d768c8e27b0b52c6eaabfaa7122d1d4f");
- expected.put(CommonsDigester.DigestAlgorithm.MD5,
"59f626e09a8c16ab6dbc2800c685f772");
+ expected.put(CommonsDigester.DigestAlgorithm.MD5,
"59F626E09A8C16AB6DBC2800C685F772");
expected.put(CommonsDigester.DigestAlgorithm.SHA1,
"PIPQAHIWHLEQ3DVFJQCQ7L22HADZPCFG");
expected.put(CommonsDigester.DigestAlgorithm.SHA256,
"c4b7fab030a8b6a9d6691f6699ac8e6f" +
"82bc53764a0f1430d134ae3b70c32654");
@@ -105,7 +105,7 @@ public class DigestingParserTest extends TikaTest {
Metadata m = new Metadata();
XMLResult xml = getXML("test_recursive_embedded.docx",
new DigestingParser(AUTO_DETECT_PARSER,
- new CommonsDigester(UNLIMITED,
"md5,sha256,sha384,sha512,sha1:32"), false)
+ new CommonsDigester(UNLIMITED,
"MD5,sha256,sha384,sha512,sha1:32"), false)
, m);
for (CommonsDigester.DigestAlgorithm algo : new
CommonsDigester.DigestAlgorithm[]{
CommonsDigester.DigestAlgorithm.MD5,
CommonsDigester.DigestAlgorithm.SHA1,