This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4563-on-main in repository https://gitbox.apache.org/repos/asf/tika.git
commit b9685742710a10580eae0ca56a01e6a062253e3c Author: tallison <[email protected]> AuthorDate: Wed Mar 18 17:56:54 2026 -0400 TIKA-4563 -- on main: cherry-pick updates from branch_3x found during regression tests and the release process --- CHANGES.txt | 4 ++ tika-bom/pom.xml | 61 ++++++++++++++++++++++ .../org/apache/tika/mime/tika-mimetypes.xml | 6 ++- .../org/apache/tika/eval/app/ExtractComparer.java | 57 +++++++++++++++++--- .../src/main/resources/comparison-reports-tags.xml | 25 +++++++++ .../src/main/resources/comparison-reports.xml | 26 +++++++++ tika-parent/pom.xml | 9 +--- .../parser/microsoft/AbstractPOIFSExtractor.java | 2 +- .../ooxml/XSSFExcelExtractorDecorator.java | 23 +++++--- .../java/org/apache/tika/parser/pkg/ZipParser.java | 4 ++ 10 files changed, 193 insertions(+), 24 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index b9348576fa..b135cb3894 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -35,6 +35,10 @@ Release 4.0.0-BETA1 - ??? Release 3.3.0 - ??? + * Various fixes based on regression testing (TIKA-4563). + + * Improve zip parsing (TIKA-4650). + * Add detection of compressed bmp (TIKA-4511). * Allow per file timeouts in tika-pipes (TIKA-4497). diff --git a/tika-bom/pom.xml b/tika-bom/pom.xml index 7d453cfaa0..35c94a2927 100644 --- a/tika-bom/pom.xml +++ b/tika-bom/pom.xml @@ -31,8 +31,41 @@ <artifactId>tika-bom</artifactId> <packaging>pom</packaging> <name>Apache Tika BOM</name> + <description>Apache Tika Bill of Materials</description> <url>https://tika.apache.org/</url> + <licenses> + <license> + <name>Apache-2.0</name> + <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url> + <distribution>repo</distribution> + </license> + </licenses> + + <organization> + <name>The Apache Software Foundation</name> + <url>https://www.apache.org/</url> + </organization> + + <scm> + <connection>scm:git:https://github.com/apache/tika.git</connection> + <developerConnection>scm:git:https://github.com/apache/tika.git</developerConnection> + <url>https://github.com/apache/tika</url> + </scm> + + <distributionManagement> + <repository> + <id>apache.releases.https</id> + <name>Apache Release Distribution Repository</name> + <url>https://repository.apache.org/service/local/staging/deploy/maven2</url> + </repository> + <snapshotRepository> + <id>apache.snapshots.https</id> + <name>Apache Development Snapshot Repository</name> + <url>https://repository.apache.org/content/repositories/snapshots</url> + </snapshotRepository> + </distributionManagement> + <dependencyManagement> <dependencies> <dependency> @@ -409,4 +442,32 @@ </dependency> </dependencies> </dependencyManagement> + + <profiles> + <profile> + <id>apache-release</id> + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-gpg-plugin</artifactId> + <version>3.2.8</version> + <configuration> + <gpgArguments> + <arg>--digest-algo=SHA512</arg> + </gpgArguments> + </configuration> + <executions> + <execution> + <id>sign-release-artifacts</id> + <goals> + <goal>sign</goal> + </goals> + </execution> + </executions> + </plugin> + </plugins> + </build> + </profile> + </profiles> </project> diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index d531459614..e5863a1998 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -4291,7 +4291,7 @@ <mime-type type="application/x-iso9660-image"> <acronym>ISO</acronym> <_comment>ISO 9660 CD-ROM filesystem data</_comment> - <magic priority="50"> + <magic priority="60"> <match value="CD001" type="string" offset="32769"/> <match value="CD001" type="string" offset="34817"/> <match value="CD001" type="string" offset="36865"/> @@ -5952,7 +5952,9 @@ <match value="0xffe3" type="string" offset="0"/> <!-- MP3 2.5 from pronom --> <!-- TIKA-417: This is the UTF-16 LE byte order mark! --> <!-- match value="0xfffe" type="string" offset="0"/ --> <!-- V1, L1, CRC --> - <match value="0xffff" type="string" offset="0"/> <!-- V1, L1 --> + <!-- 0xffff has layer bits 00 which is "reserved" in the MPEG spec, not L1. + Removed: caused false positives with binary files. --> + <!-- match value="0xffff" type="string" offset="0"/ --> <!-- TIKA-4582: Require MP3 frame sync after ID3 tag to avoid false positives with other ID3-tagged formats --> <match value="ID3" type="string" offset="0"> <match type="regex" value="\\xFF[\\xE3\\xF2-\\xF7\\xFA-\\xFD\\xFF]" offset="512:8192" /> diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java index 644bcbe345..b4679c1845 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java @@ -18,10 +18,13 @@ package org.apache.tika.eval.app; import java.io.IOException; import java.nio.file.Path; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; import java.sql.Types; import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Set; @@ -187,6 +190,7 @@ public class ExtractComparer extends ProfilerBase { List<Integer> numAttachmentsB = countAttachments(metadataListB); String sharedDigestKey = findSharedDigestKey(metadataListA, metadataListB); + String emptyDigest = computeEmptyDigest(sharedDigestKey); Map<Class, Object> tokenStatsA = null; Map<Class, Object> tokenStatsB = null; //now get that metadata @@ -204,7 +208,7 @@ public class ExtractComparer extends ProfilerBase { writeProfileData(fpsA, i, contentTagsA, metadataA, fileId, containerID, numAttachmentsA, PROFILES_A); writeExceptionData(fileId, metadataA, EXCEPTION_TABLE_A); - int matchIndex = getMatch(i, sharedDigestKey, handledB, metadataListA, metadataListB); + int matchIndex = getMatch(i, sharedDigestKey, emptyDigest, handledB, metadataListA, metadataListB); if (matchIndex > -1 && !handledB.contains(matchIndex)) { metadataB = metadataListB.get(matchIndex); @@ -357,7 +361,7 @@ public class ExtractComparer extends ProfilerBase { * @param metadataListB * @return */ - private int getMatch(int aIndex, String sharedDigestKey, Set<Integer> handledB, List<Metadata> metadataListA, List<Metadata> metadataListB) { + private int getMatch(int aIndex, String sharedDigestKey, String emptyDigest, Set<Integer> handledB, List<Metadata> metadataListA, List<Metadata> metadataListB) { //TODO: could make this more robust if (metadataListB == null || metadataListB.size() == 0) { return -1; @@ -367,16 +371,23 @@ public class ExtractComparer extends ProfilerBase { return 0; } + Metadata thisMetadata = metadataListA.get(aIndex); + if (sharedDigestKey != null) { //first try to find matching digests - return findMatchingDigests(sharedDigestKey, handledB, metadataListA.get(aIndex), metadataListB); + int digestMatch = findMatchingDigests(sharedDigestKey, emptyDigest, handledB, thisMetadata, metadataListB); + if (digestMatch > -1) { + return digestMatch; + } } - //assume same embedded resource path. Not always true! - Metadata thisMetadata = metadataListA.get(aIndex); + //try matching by embedded resource path String embeddedPath = thisMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH); if (embeddedPath != null) { for (int j = 0; j < metadataListB.size(); j++) { + if (handledB.contains(j)) { + continue; + } String thatEmbeddedPath = metadataListB .get(j) .get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH); @@ -394,11 +405,16 @@ public class ExtractComparer extends ProfilerBase { return -1; } - private int findMatchingDigests(String sharedDigestKey, Set<Integer> handledB, Metadata metadata, List<Metadata> metadataListB) { + private int findMatchingDigests(String sharedDigestKey, String emptyDigest, Set<Integer> handledB, Metadata metadata, List<Metadata> metadataListB) { String digestA = metadata.get(sharedDigestKey); if (digestA == null) { return -1; } + // Skip matching on the empty-content digest -- it's meaningless + // and causes false matches among unrelated zero-byte embedded docs + if (digestA.equalsIgnoreCase(emptyDigest)) { + return -1; + } String resourceName = metadata.get(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH); int cand = -1; @@ -418,6 +434,35 @@ public class ExtractComparer extends ProfilerBase { return cand; } + /** + * Computes the hex-encoded digest of empty (zero-byte) content for the + * algorithm identified by the shared digest key (e.g. "X-TIKA:digest:MD5"). + * Returns null if the algorithm cannot be resolved. + */ + private static String computeEmptyDigest(String sharedDigestKey) { + if (sharedDigestKey == null) { + return null; + } + // key format: "X-TIKA:digest:MD5" or "X-TIKA:digest:SHA256" etc. + String algo = sharedDigestKey.substring(DIGEST_KEY_PREFIX.length()); + // normalize common names to MessageDigest algorithm names + // e.g. SHA256 -> SHA-256 + if (algo.matches("(?i)SHA(\\d+)")) { + algo = algo.toUpperCase(Locale.ROOT).replaceFirst("SHA(\\d+)", "SHA-$1"); + } + try { + MessageDigest md = MessageDigest.getInstance(algo); + byte[] emptyHash = md.digest(new byte[0]); + StringBuilder sb = new StringBuilder(); + for (byte b : emptyHash) { + sb.append(String.format(Locale.ROOT, "%02x", b)); + } + return sb.toString(); + } catch (NoSuchAlgorithmException e) { + return null; + } + } + private void writeContrasts(Map<Cols, String> data, ContrastStatistics contrastStatistics) { writeContrastString(data, Cols.TOP_10_MORE_IN_A, contrastStatistics.getTopNMoreA()); writeContrastString(data, Cols.TOP_10_MORE_IN_B, contrastStatistics.getTopNMoreB()); diff --git a/tika-eval/tika-eval-app/src/main/resources/comparison-reports-tags.xml b/tika-eval/tika-eval-app/src/main/resources/comparison-reports-tags.xml index 6acb475901..25dcc74b0d 100644 --- a/tika-eval/tika-eval-app/src/main/resources/comparison-reports-tags.xml +++ b/tika-eval/tika-eval-app/src/main/resources/comparison-reports-tags.xml @@ -1775,6 +1775,31 @@ limit 20000; </sql> </report> + <report reportName="Attachment Name Changes" + reportFilename="attachments/attachment_name_diffs.xlsx" + format="xlsx" + includeSql="true"> + <sql> + select file_path, + pa.embedded_file_path as EMBEDDED_NAME_A, + pb.embedded_file_path as EMBEDDED_NAME_B, + c.length as CONTAINER_LENGTH, + ma.mime_string as MIME_STRING_A, + mb.mime_string as MIME_STRING_B + from profiles_a pa + join profiles_b pb on pa.id=pb.id + join containers c on pa.container_id=c.container_id + join mimes ma on pa.mime_id=ma.mime_id + join mimes mb on pb.mime_id=mb.mime_id + where pa.is_embedded=true + and pb.is_embedded=true + and pa.embedded_file_path is not null + and pb.embedded_file_path is not null + and pa.embedded_file_path <> pb.embedded_file_path + order by file_path, pa.embedded_file_path + limit 100000 + </sql> + </report> <after> <sql>drop table if exists md5_multiples_tmp_a</sql> <sql>drop table if exists md5_multiples_tmp_b</sql> diff --git a/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml b/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml index 667578daa9..082949e9cf 100644 --- a/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml +++ b/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml @@ -1489,6 +1489,32 @@ limit 20000; </sql> </report> + <!-- attachment name changes --> + <report reportName="Attachment Name Changes" + reportFilename="attachments/attachment_name_diffs.xlsx" + format="xlsx" + includeSql="true"> + <sql> + select file_path, + pa.embedded_file_path as EMBEDDED_NAME_A, + pb.embedded_file_path as EMBEDDED_NAME_B, + c.length as CONTAINER_LENGTH, + ma.mime_string as MIME_STRING_A, + mb.mime_string as MIME_STRING_B + from profiles_a pa + join profiles_b pb on pa.id=pb.id + join containers c on pa.container_id=c.container_id + join mimes ma on pa.mime_id=ma.mime_id + join mimes mb on pb.mime_id=mb.mime_id + where pa.is_embedded=true + and pb.is_embedded=true + and pa.embedded_file_path is not null + and pb.embedded_file_path is not null + and pa.embedded_file_path <> pb.embedded_file_path + order by file_path, pa.embedded_file_path + limit 100000 + </sql> + </report> <after> <sql>drop table if exists md5_multiples_tmp_a</sql> <sql>drop table if exists md5_multiples_tmp_b</sql> diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index 5a07ab53a5..e723ed6b9b 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -1762,15 +1762,10 @@ </profile> </profiles> - <!-- - <connection>scm:git:https://github.com/apache/</connection> - <developerConnection>scm:git:https://github.com/apache/</developerConnection> - <url>https://github.com/apache/tika</url> - --> <scm> <tag>3.0.0-rc1</tag> - <connection>scm:git:https://github.com/apache/</connection> - <developerConnection>scm:git:https://github.com/apache/</developerConnection> + <connection>scm:git:https://github.com/apache/tika.git</connection> + <developerConnection>scm:git:https://github.com/apache/tika.git</developerConnection> <url>https://github.com/apache/tika</url> </scm> </project> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java index 09a343b914..37267b6644 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java @@ -319,7 +319,7 @@ abstract class AbstractPOIFSExtractor { } // Record what we can do about it - metadata.set(Metadata.CONTENT_TYPE, mediaType.getType()); + metadata.set(Metadata.CONTENT_TYPE, mediaType.toString()); metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, rName + extension); metadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true); metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length)); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java index 9572e5e27c..e25bdf6d09 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java @@ -303,8 +303,10 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { metadata.set(Office.HAS_EXTERNAL_PIVOT_DATA, true); } } - } catch (IOException | TikaException | SAXException e) { - // swallow + } catch (IOException | TikaException | SAXException | IllegalArgumentException e) { + // swallow -- POI throws IllegalArgumentException when a + // relationship references a part missing from the package + // (e.g. truncated files) } } } @@ -363,8 +365,9 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { metadata.set(Office.HAS_DDE_LINKS, true); } } - } catch (IOException | TikaException e) { - // swallow + } catch (IOException | TikaException | IllegalArgumentException e) { + // swallow -- POI can throw IllegalArgumentException + // for malformed relationships } } } @@ -394,8 +397,10 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { metadata.set(Office.HAS_WEB_QUERIES, true); } } - } catch (IOException | TikaException e) { - // swallow + } catch (IOException | TikaException | IllegalArgumentException e) { + // swallow -- POI throws IllegalArgumentException when a + // relationship references a part missing from the package + // (e.g. truncated files) } } } @@ -417,8 +422,10 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { XMLReaderUtils.parseSAX(is, new QueryTableHandler(xhtml), parseContext); } } - } catch (IOException | TikaException e) { - // swallow + } catch (IOException | TikaException | IllegalArgumentException e) { + // swallow -- POI throws IllegalArgumentException when a + // relationship references a part missing from the package + // (e.g. truncated files) } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java index 62ec3bc8e3..698bd26991 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java @@ -475,6 +475,8 @@ public class ZipParser extends AbstractArchiveParser { try (InputStream entryStream = zipFile.getInputStream(entry)) { TikaInputStream tis = TikaInputStream.get(entryStream, tmp, entryMetadata); extractor.parseEmbedded(tis, xhtml, entryMetadata, new ParseContext(), true); + } catch (UnsupportedZipFeatureException e) { + EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata); } finally { tmp.dispose(); } @@ -515,6 +517,8 @@ public class ZipParser extends AbstractArchiveParser { try { TikaInputStream tis = TikaInputStream.get(zis, tmp, entryMetadata); extractor.parseEmbedded(tis, xhtml, entryMetadata, new ParseContext(), true); + } catch (UnsupportedZipFeatureException e) { + EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata); } finally { tmp.dispose(); }
