This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 48c9e93734 TIKA-4563 -- on main: cherry-pick updates from branch_3x
found during regression tests and the release process (#2699)
48c9e93734 is described below
commit 48c9e9373411838d28bd797f1c73dcc08f61d2d5
Author: Tim Allison <[email protected]>
AuthorDate: Wed Mar 18 19:00:31 2026 -0400
TIKA-4563 -- on main: cherry-pick updates from branch_3x found during
regression tests and the release process (#2699)
---
CHANGES.txt | 4 ++
tika-bom/pom.xml | 61 ++++++++++++++++++++++
.../org/apache/tika/mime/tika-mimetypes.xml | 6 ++-
.../org/apache/tika/eval/app/ExtractComparer.java | 57 +++++++++++++++++---
.../src/main/resources/comparison-reports-tags.xml | 25 +++++++++
.../src/main/resources/comparison-reports.xml | 26 +++++++++
tika-parent/pom.xml | 9 +---
.../parser/microsoft/AbstractPOIFSExtractor.java | 2 +-
.../ooxml/XSSFExcelExtractorDecorator.java | 23 +++++---
.../java/org/apache/tika/parser/pkg/ZipParser.java | 4 ++
10 files changed, 193 insertions(+), 24 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index b9348576fa..b135cb3894 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -35,6 +35,10 @@ Release 4.0.0-BETA1 - ???
Release 3.3.0 - ???
+ * Various fixes based on regression testing (TIKA-4563).
+
+ * Improve zip parsing (TIKA-4650).
+
* Add detection of compressed bmp (TIKA-4511).
* Allow per file timeouts in tika-pipes (TIKA-4497).
diff --git a/tika-bom/pom.xml b/tika-bom/pom.xml
index 7d453cfaa0..35c94a2927 100644
--- a/tika-bom/pom.xml
+++ b/tika-bom/pom.xml
@@ -31,8 +31,41 @@
<artifactId>tika-bom</artifactId>
<packaging>pom</packaging>
<name>Apache Tika BOM</name>
+ <description>Apache Tika Bill of Materials</description>
<url>https://tika.apache.org/</url>
+ <licenses>
+ <license>
+ <name>Apache-2.0</name>
+ <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
+ <distribution>repo</distribution>
+ </license>
+ </licenses>
+
+ <organization>
+ <name>The Apache Software Foundation</name>
+ <url>https://www.apache.org/</url>
+ </organization>
+
+ <scm>
+ <connection>scm:git:https://github.com/apache/tika.git</connection>
+ <developerConnection>scm:git:https://github.com/apache/tika.git</developerConnection>
+ <url>https://github.com/apache/tika</url>
+ </scm>
+
+ <distributionManagement>
+ <repository>
+ <id>apache.releases.https</id>
+ <name>Apache Release Distribution Repository</name>
+ <url>https://repository.apache.org/service/local/staging/deploy/maven2</url>
+ </repository>
+ <snapshotRepository>
+ <id>apache.snapshots.https</id>
+ <name>Apache Development Snapshot Repository</name>
+ <url>https://repository.apache.org/content/repositories/snapshots</url>
+ </snapshotRepository>
+ </distributionManagement>
+
<dependencyManagement>
<dependencies>
<dependency>
@@ -409,4 +442,32 @@
</dependency>
</dependencies>
</dependencyManagement>
+
+ <profiles>
+ <profile>
+ <id>apache-release</id>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-gpg-plugin</artifactId>
+ <version>3.2.8</version>
+ <configuration>
+ <gpgArguments>
+ <arg>--digest-algo=SHA512</arg>
+ </gpgArguments>
+ </configuration>
+ <executions>
+ <execution>
+ <id>sign-release-artifacts</id>
+ <goals>
+ <goal>sign</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+ </profile>
+ </profiles>
</project>
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index d531459614..e5863a1998 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -4291,7 +4291,7 @@
<mime-type type="application/x-iso9660-image">
<acronym>ISO</acronym>
<_comment>ISO 9660 CD-ROM filesystem data</_comment>
- <magic priority="50">
+ <magic priority="60">
<match value="CD001" type="string" offset="32769"/>
<match value="CD001" type="string" offset="34817"/>
<match value="CD001" type="string" offset="36865"/>
@@ -5952,7 +5952,9 @@
<match value="0xffe3" type="string" offset="0"/> <!-- MP3 2.5 from
pronom -->
<!-- TIKA-417: This is the UTF-16 LE byte order mark! -->
<!-- match value="0xfffe" type="string" offset="0"/ --> <!-- V1, L1, CRC
-->
- <match value="0xffff" type="string" offset="0"/> <!-- V1, L1 -->
+ <!-- 0xffff has layer bits 00 which is "reserved" in the MPEG spec, not
L1.
+ Removed: caused false positives with binary files. -->
+ <!-- match value="0xffff" type="string" offset="0"/ -->
<!-- TIKA-4582: Require MP3 frame sync after ID3 tag to avoid false
positives with other ID3-tagged formats -->
<match value="ID3" type="string" offset="0">
<match type="regex" value="\\xFF[\\xE3\\xF2-\\xF7\\xFA-\\xFD\\xFF]"
offset="512:8192" />
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
index 644bcbe345..b4679c1845 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
@@ -18,10 +18,13 @@ package org.apache.tika.eval.app;
import java.io.IOException;
import java.nio.file.Path;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
import java.sql.Types;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import java.util.Set;
@@ -187,6 +190,7 @@ public class ExtractComparer extends ProfilerBase {
List<Integer> numAttachmentsB = countAttachments(metadataListB);
String sharedDigestKey = findSharedDigestKey(metadataListA,
metadataListB);
+ String emptyDigest = computeEmptyDigest(sharedDigestKey);
Map<Class, Object> tokenStatsA = null;
Map<Class, Object> tokenStatsB = null;
//now get that metadata
@@ -204,7 +208,7 @@ public class ExtractComparer extends ProfilerBase {
writeProfileData(fpsA, i, contentTagsA, metadataA, fileId,
containerID, numAttachmentsA, PROFILES_A);
writeExceptionData(fileId, metadataA, EXCEPTION_TABLE_A);
- int matchIndex = getMatch(i, sharedDigestKey, handledB,
metadataListA, metadataListB);
+ int matchIndex = getMatch(i, sharedDigestKey, emptyDigest,
handledB, metadataListA, metadataListB);
if (matchIndex > -1 && !handledB.contains(matchIndex)) {
metadataB = metadataListB.get(matchIndex);
@@ -357,7 +361,7 @@ public class ExtractComparer extends ProfilerBase {
* @param metadataListB
* @return
*/
- private int getMatch(int aIndex, String sharedDigestKey, Set<Integer>
handledB, List<Metadata> metadataListA, List<Metadata> metadataListB) {
+ private int getMatch(int aIndex, String sharedDigestKey, String
emptyDigest, Set<Integer> handledB, List<Metadata> metadataListA,
List<Metadata> metadataListB) {
//TODO: could make this more robust
if (metadataListB == null || metadataListB.size() == 0) {
return -1;
@@ -367,16 +371,23 @@ public class ExtractComparer extends ProfilerBase {
return 0;
}
+ Metadata thisMetadata = metadataListA.get(aIndex);
+
if (sharedDigestKey != null) {
//first try to find matching digests
- return findMatchingDigests(sharedDigestKey, handledB,
metadataListA.get(aIndex), metadataListB);
+ int digestMatch = findMatchingDigests(sharedDigestKey,
emptyDigest, handledB, thisMetadata, metadataListB);
+ if (digestMatch > -1) {
+ return digestMatch;
+ }
}
- //assume same embedded resource path. Not always true!
- Metadata thisMetadata = metadataListA.get(aIndex);
+ //try matching by embedded resource path
String embeddedPath =
thisMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
if (embeddedPath != null) {
for (int j = 0; j < metadataListB.size(); j++) {
+ if (handledB.contains(j)) {
+ continue;
+ }
String thatEmbeddedPath = metadataListB
.get(j)
.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
@@ -394,11 +405,16 @@ public class ExtractComparer extends ProfilerBase {
return -1;
}
- private int findMatchingDigests(String sharedDigestKey, Set<Integer>
handledB, Metadata metadata, List<Metadata> metadataListB) {
+ private int findMatchingDigests(String sharedDigestKey, String
emptyDigest, Set<Integer> handledB, Metadata metadata, List<Metadata>
metadataListB) {
String digestA = metadata.get(sharedDigestKey);
if (digestA == null) {
return -1;
}
+ // Skip matching on the empty-content digest -- it's meaningless
+ // and causes false matches among unrelated zero-byte embedded docs
+ if (digestA.equalsIgnoreCase(emptyDigest)) {
+ return -1;
+ }
String resourceName =
metadata.get(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH);
int cand = -1;
@@ -418,6 +434,35 @@ public class ExtractComparer extends ProfilerBase {
return cand;
}
+ /**
+ * Computes the hex-encoded digest of empty (zero-byte) content for the
+ * algorithm identified by the shared digest key (e.g.
"X-TIKA:digest:MD5").
+ * Returns null if the algorithm cannot be resolved.
+ */
+ private static String computeEmptyDigest(String sharedDigestKey) {
+ if (sharedDigestKey == null) {
+ return null;
+ }
+ // key format: "X-TIKA:digest:MD5" or "X-TIKA:digest:SHA256" etc.
+ String algo = sharedDigestKey.substring(DIGEST_KEY_PREFIX.length());
+ // normalize common names to MessageDigest algorithm names
+ // e.g. SHA256 -> SHA-256
+ if (algo.matches("(?i)SHA(\\d+)")) {
+ algo = algo.toUpperCase(Locale.ROOT).replaceFirst("SHA(\\d+)",
"SHA-$1");
+ }
+ try {
+ MessageDigest md = MessageDigest.getInstance(algo);
+ byte[] emptyHash = md.digest(new byte[0]);
+ StringBuilder sb = new StringBuilder();
+ for (byte b : emptyHash) {
+ sb.append(String.format(Locale.ROOT, "%02x", b));
+ }
+ return sb.toString();
+ } catch (NoSuchAlgorithmException e) {
+ return null;
+ }
+ }
+
private void writeContrasts(Map<Cols, String> data, ContrastStatistics
contrastStatistics) {
writeContrastString(data, Cols.TOP_10_MORE_IN_A,
contrastStatistics.getTopNMoreA());
writeContrastString(data, Cols.TOP_10_MORE_IN_B,
contrastStatistics.getTopNMoreB());
diff --git a/tika-eval/tika-eval-app/src/main/resources/comparison-reports-tags.xml b/tika-eval/tika-eval-app/src/main/resources/comparison-reports-tags.xml
index 6acb475901..25dcc74b0d 100644
--- a/tika-eval/tika-eval-app/src/main/resources/comparison-reports-tags.xml
+++ b/tika-eval/tika-eval-app/src/main/resources/comparison-reports-tags.xml
@@ -1775,6 +1775,31 @@
limit 20000;
</sql>
</report>
+ <report reportName="Attachment Name Changes"
+ reportFilename="attachments/attachment_name_diffs.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select file_path,
+ pa.embedded_file_path as EMBEDDED_NAME_A,
+ pb.embedded_file_path as EMBEDDED_NAME_B,
+ c.length as CONTAINER_LENGTH,
+ ma.mime_string as MIME_STRING_A,
+ mb.mime_string as MIME_STRING_B
+ from profiles_a pa
+ join profiles_b pb on pa.id=pb.id
+ join containers c on pa.container_id=c.container_id
+ join mimes ma on pa.mime_id=ma.mime_id
+ join mimes mb on pb.mime_id=mb.mime_id
+ where pa.is_embedded=true
+ and pb.is_embedded=true
+ and pa.embedded_file_path is not null
+ and pb.embedded_file_path is not null
+ and pa.embedded_file_path <> pb.embedded_file_path
+ order by file_path, pa.embedded_file_path
+ limit 100000
+ </sql>
+ </report>
<after>
<sql>drop table if exists md5_multiples_tmp_a</sql>
<sql>drop table if exists md5_multiples_tmp_b</sql>
diff --git a/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml b/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml
index 667578daa9..082949e9cf 100644
--- a/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml
+++ b/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml
@@ -1489,6 +1489,32 @@
limit 20000;
</sql>
</report>
+ <!-- attachment name changes -->
+ <report reportName="Attachment Name Changes"
+ reportFilename="attachments/attachment_name_diffs.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select file_path,
+ pa.embedded_file_path as EMBEDDED_NAME_A,
+ pb.embedded_file_path as EMBEDDED_NAME_B,
+ c.length as CONTAINER_LENGTH,
+ ma.mime_string as MIME_STRING_A,
+ mb.mime_string as MIME_STRING_B
+ from profiles_a pa
+ join profiles_b pb on pa.id=pb.id
+ join containers c on pa.container_id=c.container_id
+ join mimes ma on pa.mime_id=ma.mime_id
+ join mimes mb on pb.mime_id=mb.mime_id
+ where pa.is_embedded=true
+ and pb.is_embedded=true
+ and pa.embedded_file_path is not null
+ and pb.embedded_file_path is not null
+ and pa.embedded_file_path <> pb.embedded_file_path
+ order by file_path, pa.embedded_file_path
+ limit 100000
+ </sql>
+ </report>
<after>
<sql>drop table if exists md5_multiples_tmp_a</sql>
<sql>drop table if exists md5_multiples_tmp_b</sql>
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 5a07ab53a5..e723ed6b9b 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -1762,15 +1762,10 @@
</profile>
</profiles>
- <!--
- <connection>scm:git:https://github.com/apache/</connection>
- <developerConnection>scm:git:https://github.com/apache/</developerConnection>
- <url>https://github.com/apache/tika</url>
- -->
<scm>
<tag>3.0.0-rc1</tag>
- <connection>scm:git:https://github.com/apache/</connection>
- <developerConnection>scm:git:https://github.com/apache/</developerConnection>
+ <connection>scm:git:https://github.com/apache/tika.git</connection>
+ <developerConnection>scm:git:https://github.com/apache/tika.git</developerConnection>
<url>https://github.com/apache/tika</url>
</scm>
</project>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index 09a343b914..37267b6644 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -319,7 +319,7 @@ abstract class AbstractPOIFSExtractor {
}
// Record what we can do about it
- metadata.set(Metadata.CONTENT_TYPE, mediaType.getType());
+ metadata.set(Metadata.CONTENT_TYPE, mediaType.toString());
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, rName +
extension);
metadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED,
true);
metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length));
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index 9572e5e27c..e25bdf6d09 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -303,8 +303,10 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
metadata.set(Office.HAS_EXTERNAL_PIVOT_DATA, true);
}
}
- } catch (IOException | TikaException | SAXException e) {
- // swallow
+ } catch (IOException | TikaException | SAXException |
IllegalArgumentException e) {
+ // swallow -- POI throws IllegalArgumentException when a
+ // relationship references a part missing from the package
+ // (e.g. truncated files)
}
}
}
@@ -363,8 +365,9 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
metadata.set(Office.HAS_DDE_LINKS, true);
}
}
- } catch (IOException | TikaException e) {
- // swallow
+ } catch (IOException | TikaException |
IllegalArgumentException e) {
+ // swallow -- POI can throw IllegalArgumentException
+ // for malformed relationships
}
}
}
@@ -394,8 +397,10 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
metadata.set(Office.HAS_WEB_QUERIES, true);
}
}
- } catch (IOException | TikaException e) {
- // swallow
+ } catch (IOException | TikaException | IllegalArgumentException e)
{
+ // swallow -- POI throws IllegalArgumentException when a
+ // relationship references a part missing from the package
+ // (e.g. truncated files)
}
}
}
@@ -417,8 +422,10 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
XMLReaderUtils.parseSAX(is, new
QueryTableHandler(xhtml), parseContext);
}
}
- } catch (IOException | TikaException e) {
- // swallow
+ } catch (IOException | TikaException | IllegalArgumentException e)
{
+ // swallow -- POI throws IllegalArgumentException when a
+ // relationship references a part missing from the package
+ // (e.g. truncated files)
}
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
index 62ec3bc8e3..698bd26991 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
@@ -475,6 +475,8 @@ public class ZipParser extends AbstractArchiveParser {
try (InputStream entryStream = zipFile.getInputStream(entry)) {
TikaInputStream tis = TikaInputStream.get(entryStream, tmp,
entryMetadata);
extractor.parseEmbedded(tis, xhtml, entryMetadata, new
ParseContext(), true);
+ } catch (UnsupportedZipFeatureException e) {
+ EmbeddedDocumentUtil.recordEmbeddedStreamException(e,
parentMetadata);
} finally {
tmp.dispose();
}
@@ -515,6 +517,8 @@ public class ZipParser extends AbstractArchiveParser {
try {
TikaInputStream tis = TikaInputStream.get(zis, tmp,
entryMetadata);
extractor.parseEmbedded(tis, xhtml, entryMetadata, new
ParseContext(), true);
+ } catch (UnsupportedZipFeatureException e) {
+ EmbeddedDocumentUtil.recordEmbeddedStreamException(e,
parentMetadata);
} finally {
tmp.dispose();
}