This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 48c9e93734 TIKA-4563 -- on main: cherry-pick updates from branch_3x found during regression tests and the release process (#2699)
48c9e93734 is described below

commit 48c9e9373411838d28bd797f1c73dcc08f61d2d5
Author: Tim Allison <[email protected]>
AuthorDate: Wed Mar 18 19:00:31 2026 -0400

    TIKA-4563 -- on main: cherry-pick updates from branch_3x found during regression tests and the release process (#2699)
---
 CHANGES.txt                                        |  4 ++
 tika-bom/pom.xml                                   | 61 ++++++++++++++++++++++
 .../org/apache/tika/mime/tika-mimetypes.xml        |  6 ++-
 .../org/apache/tika/eval/app/ExtractComparer.java  | 57 +++++++++++++++++---
 .../src/main/resources/comparison-reports-tags.xml | 25 +++++++++
 .../src/main/resources/comparison-reports.xml      | 26 +++++++++
 tika-parent/pom.xml                                |  9 +---
 .../parser/microsoft/AbstractPOIFSExtractor.java   |  2 +-
 .../ooxml/XSSFExcelExtractorDecorator.java         | 23 +++++---
 .../java/org/apache/tika/parser/pkg/ZipParser.java |  4 ++
 10 files changed, 193 insertions(+), 24 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index b9348576fa..b135cb3894 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -35,6 +35,10 @@ Release 4.0.0-BETA1 - ???
 
 Release 3.3.0 - ???
 
+  * Various fixes based on regression testing (TIKA-4563).
+
+  * Improve zip parsing (TIKA-4650).
+
   * Add detection of compressed bmp (TIKA-4511).
 
   * Allow per file timeouts in tika-pipes (TIKA-4497).
diff --git a/tika-bom/pom.xml b/tika-bom/pom.xml
index 7d453cfaa0..35c94a2927 100644
--- a/tika-bom/pom.xml
+++ b/tika-bom/pom.xml
@@ -31,8 +31,41 @@
   <artifactId>tika-bom</artifactId>
   <packaging>pom</packaging>
   <name>Apache Tika BOM</name>
+  <description>Apache Tika Bill of Materials</description>
   <url>https://tika.apache.org/</url>
 
+  <licenses>
+    <license>
+      <name>Apache-2.0</name>
+      <url>https://www.apache.org/licenses/LICENSE-2.0.txt</url>
+      <distribution>repo</distribution>
+    </license>
+  </licenses>
+
+  <organization>
+    <name>The Apache Software Foundation</name>
+    <url>https://www.apache.org/</url>
+  </organization>
+
+  <scm>
+    <connection>scm:git:https://github.com/apache/tika.git</connection>
+    <developerConnection>scm:git:https://github.com/apache/tika.git</developerConnection>
+    <url>https://github.com/apache/tika</url>
+  </scm>
+
+  <distributionManagement>
+    <repository>
+      <id>apache.releases.https</id>
+      <name>Apache Release Distribution Repository</name>
+      <url>https://repository.apache.org/service/local/staging/deploy/maven2</url>
+    </repository>
+    <snapshotRepository>
+      <id>apache.snapshots.https</id>
+      <name>Apache Development Snapshot Repository</name>
+      <url>https://repository.apache.org/content/repositories/snapshots</url>
+    </snapshotRepository>
+  </distributionManagement>
+
   <dependencyManagement>
     <dependencies>
       <dependency>
@@ -409,4 +442,32 @@
       </dependency>
     </dependencies>
   </dependencyManagement>
+
+  <profiles>
+    <profile>
+      <id>apache-release</id>
+      <build>
+        <plugins>
+          <plugin>
+            <groupId>org.apache.maven.plugins</groupId>
+            <artifactId>maven-gpg-plugin</artifactId>
+            <version>3.2.8</version>
+            <configuration>
+              <gpgArguments>
+                <arg>--digest-algo=SHA512</arg>
+              </gpgArguments>
+            </configuration>
+            <executions>
+              <execution>
+                <id>sign-release-artifacts</id>
+                <goals>
+                  <goal>sign</goal>
+                </goals>
+              </execution>
+            </executions>
+          </plugin>
+        </plugins>
+      </build>
+    </profile>
+  </profiles>
 </project>
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index d531459614..e5863a1998 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -4291,7 +4291,7 @@
   <mime-type type="application/x-iso9660-image">
     <acronym>ISO</acronym>
     <_comment>ISO 9660 CD-ROM filesystem data</_comment>
-    <magic priority="50">
+    <magic priority="60">
       <match value="CD001" type="string" offset="32769"/>
       <match value="CD001" type="string" offset="34817"/>
       <match value="CD001" type="string" offset="36865"/>
@@ -5952,7 +5952,9 @@
       <match value="0xffe3" type="string" offset="0"/> <!-- MP3 2.5 from 
pronom     -->
       <!-- TIKA-417: This is the UTF-16 LE byte order mark! -->
       <!-- match value="0xfffe" type="string" offset="0"/ --> <!-- V1, L1, CRC 
-->
-      <match value="0xffff" type="string" offset="0"/> <!-- V1, L1      -->
+      <!-- 0xffff has layer bits 00 which is "reserved" in the MPEG spec, not L1.
+           Removed: caused false positives with binary files. -->
+      <!-- match value="0xffff" type="string" offset="0"/ -->
       <!-- TIKA-4582: Require MP3 frame sync after ID3 tag to avoid false 
positives with other ID3-tagged formats -->
       <match value="ID3" type="string" offset="0">
          <match type="regex" value="\\xFF[\\xE3\\xF2-\\xF7\\xFA-\\xFD\\xFF]" 
offset="512:8192" />
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
index 644bcbe345..b4679c1845 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
@@ -18,10 +18,13 @@ package org.apache.tika.eval.app;
 
 import java.io.IOException;
 import java.nio.file.Path;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
 import java.sql.Types;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 
@@ -187,6 +190,7 @@ public class ExtractComparer extends ProfilerBase {
         List<Integer> numAttachmentsB = countAttachments(metadataListB);
 
         String sharedDigestKey = findSharedDigestKey(metadataListA, 
metadataListB);
+        String emptyDigest = computeEmptyDigest(sharedDigestKey);
         Map<Class, Object> tokenStatsA = null;
         Map<Class, Object> tokenStatsB = null;
         //now get that metadata
@@ -204,7 +208,7 @@ public class ExtractComparer extends ProfilerBase {
 
                 writeProfileData(fpsA, i, contentTagsA, metadataA, fileId, 
containerID, numAttachmentsA, PROFILES_A);
                 writeExceptionData(fileId, metadataA, EXCEPTION_TABLE_A);
-                int matchIndex = getMatch(i, sharedDigestKey, handledB, metadataListA, metadataListB);
+                int matchIndex = getMatch(i, sharedDigestKey, emptyDigest, handledB, metadataListA, metadataListB);
 
                 if (matchIndex > -1 && !handledB.contains(matchIndex)) {
                     metadataB = metadataListB.get(matchIndex);
@@ -357,7 +361,7 @@ public class ExtractComparer extends ProfilerBase {
      * @param metadataListB
      * @return
      */
-    private int getMatch(int aIndex, String sharedDigestKey, Set<Integer> handledB, List<Metadata> metadataListA, List<Metadata> metadataListB) {
+    private int getMatch(int aIndex, String sharedDigestKey, String emptyDigest, Set<Integer> handledB, List<Metadata> metadataListA, List<Metadata> metadataListB) {
         //TODO: could make this more robust
         if (metadataListB == null || metadataListB.size() == 0) {
             return -1;
@@ -367,16 +371,23 @@ public class ExtractComparer extends ProfilerBase {
             return 0;
         }
 
+        Metadata thisMetadata = metadataListA.get(aIndex);
+
         if (sharedDigestKey != null) {
             //first try to find matching digests
-            return findMatchingDigests(sharedDigestKey, handledB, metadataListA.get(aIndex), metadataListB);
+            int digestMatch = findMatchingDigests(sharedDigestKey, emptyDigest, handledB, thisMetadata, metadataListB);
+            if (digestMatch > -1) {
+                return digestMatch;
+            }
         }
 
-        //assume same embedded resource path.  Not always true!
-        Metadata thisMetadata = metadataListA.get(aIndex);
+        //try matching by embedded resource path
         String embeddedPath = 
thisMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
         if (embeddedPath != null) {
             for (int j = 0; j < metadataListB.size(); j++) {
+                if (handledB.contains(j)) {
+                    continue;
+                }
                 String thatEmbeddedPath = metadataListB
                         .get(j)
                         .get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
@@ -394,11 +405,16 @@ public class ExtractComparer extends ProfilerBase {
         return -1;
     }
 
-    private int findMatchingDigests(String sharedDigestKey, Set<Integer> handledB, Metadata metadata, List<Metadata> metadataListB) {
+    private int findMatchingDigests(String sharedDigestKey, String emptyDigest, Set<Integer> handledB, Metadata metadata, List<Metadata> metadataListB) {
         String digestA = metadata.get(sharedDigestKey);
         if (digestA == null) {
             return -1;
         }
+        // Skip matching on the empty-content digest -- it's meaningless
+        // and causes false matches among unrelated zero-byte embedded docs
+        if (digestA.equalsIgnoreCase(emptyDigest)) {
+            return -1;
+        }
         String resourceName = 
metadata.get(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH);
 
         int cand = -1;
@@ -418,6 +434,35 @@ public class ExtractComparer extends ProfilerBase {
         return cand;
     }
 
+    /**
+     * Computes the hex-encoded digest of empty (zero-byte) content for the
+     * algorithm identified by the shared digest key (e.g. "X-TIKA:digest:MD5").
+     * Returns null if the algorithm cannot be resolved.
+     */
+    private static String computeEmptyDigest(String sharedDigestKey) {
+        if (sharedDigestKey == null) {
+            return null;
+        }
+        // key format: "X-TIKA:digest:MD5" or "X-TIKA:digest:SHA256" etc.
+        String algo = sharedDigestKey.substring(DIGEST_KEY_PREFIX.length());
+        // normalize common names to MessageDigest algorithm names
+        // e.g. SHA256 -> SHA-256
+        if (algo.matches("(?i)SHA(\\d+)")) {
+            algo = algo.toUpperCase(Locale.ROOT).replaceFirst("SHA(\\d+)", "SHA-$1");
+        }
+        try {
+            MessageDigest md = MessageDigest.getInstance(algo);
+            byte[] emptyHash = md.digest(new byte[0]);
+            StringBuilder sb = new StringBuilder();
+            for (byte b : emptyHash) {
+                sb.append(String.format(Locale.ROOT, "%02x", b));
+            }
+            return sb.toString();
+        } catch (NoSuchAlgorithmException e) {
+            return null;
+        }
+    }
+
     private void writeContrasts(Map<Cols, String> data, ContrastStatistics 
contrastStatistics) {
         writeContrastString(data, Cols.TOP_10_MORE_IN_A, 
contrastStatistics.getTopNMoreA());
         writeContrastString(data, Cols.TOP_10_MORE_IN_B, 
contrastStatistics.getTopNMoreB());
diff --git a/tika-eval/tika-eval-app/src/main/resources/comparison-reports-tags.xml b/tika-eval/tika-eval-app/src/main/resources/comparison-reports-tags.xml
index 6acb475901..25dcc74b0d 100644
--- a/tika-eval/tika-eval-app/src/main/resources/comparison-reports-tags.xml
+++ b/tika-eval/tika-eval-app/src/main/resources/comparison-reports-tags.xml
@@ -1775,6 +1775,31 @@
       limit 20000;
     </sql>
   </report>
+  <report reportName="Attachment Name Changes"
+          reportFilename="attachments/attachment_name_diffs.xlsx"
+          format="xlsx"
+          includeSql="true">
+    <sql>
+      select file_path,
+      pa.embedded_file_path as EMBEDDED_NAME_A,
+      pb.embedded_file_path as EMBEDDED_NAME_B,
+      c.length as CONTAINER_LENGTH,
+      ma.mime_string as MIME_STRING_A,
+      mb.mime_string as MIME_STRING_B
+      from profiles_a pa
+      join profiles_b pb on pa.id=pb.id
+      join containers c on pa.container_id=c.container_id
+      join mimes ma on pa.mime_id=ma.mime_id
+      join mimes mb on pb.mime_id=mb.mime_id
+      where pa.is_embedded=true
+      and pb.is_embedded=true
+      and pa.embedded_file_path is not null
+      and pb.embedded_file_path is not null
+      and pa.embedded_file_path &lt;&gt; pb.embedded_file_path
+      order by file_path, pa.embedded_file_path
+      limit 100000
+    </sql>
+  </report>
   <after>
     <sql>drop table if exists md5_multiples_tmp_a</sql>
     <sql>drop table if exists md5_multiples_tmp_b</sql>
diff --git a/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml b/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml
index 667578daa9..082949e9cf 100644
--- a/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml
+++ b/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml
@@ -1489,6 +1489,32 @@
       limit 20000;
     </sql>
   </report>
+  <!-- attachment name changes -->
+  <report reportName="Attachment Name Changes"
+          reportFilename="attachments/attachment_name_diffs.xlsx"
+          format="xlsx"
+          includeSql="true">
+    <sql>
+      select file_path,
+      pa.embedded_file_path as EMBEDDED_NAME_A,
+      pb.embedded_file_path as EMBEDDED_NAME_B,
+      c.length as CONTAINER_LENGTH,
+      ma.mime_string as MIME_STRING_A,
+      mb.mime_string as MIME_STRING_B
+      from profiles_a pa
+      join profiles_b pb on pa.id=pb.id
+      join containers c on pa.container_id=c.container_id
+      join mimes ma on pa.mime_id=ma.mime_id
+      join mimes mb on pb.mime_id=mb.mime_id
+      where pa.is_embedded=true
+      and pb.is_embedded=true
+      and pa.embedded_file_path is not null
+      and pb.embedded_file_path is not null
+      and pa.embedded_file_path &lt;&gt; pb.embedded_file_path
+      order by file_path, pa.embedded_file_path
+      limit 100000
+    </sql>
+  </report>
   <after>
     <sql>drop table if exists md5_multiples_tmp_a</sql>
     <sql>drop table if exists md5_multiples_tmp_b</sql>
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 5a07ab53a5..e723ed6b9b 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -1762,15 +1762,10 @@
     </profile>
 
   </profiles>
-  <!--
-      <connection>scm:git:https://github.com/apache/</connection>
-      <developerConnection>scm:git:https://github.com/apache/</developerConnection>
-      <url>https://github.com/apache/tika</url>
-  -->
   <scm>
     <tag>3.0.0-rc1</tag>
-    <connection>scm:git:https://github.com/apache/</connection>
-    <developerConnection>scm:git:https://github.com/apache/</developerConnection>
+    <connection>scm:git:https://github.com/apache/tika.git</connection>
+    <developerConnection>scm:git:https://github.com/apache/tika.git</developerConnection>
     <url>https://github.com/apache/tika</url>
   </scm>
 </project>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index 09a343b914..37267b6644 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -319,7 +319,7 @@ abstract class AbstractPOIFSExtractor {
             }
 
             // Record what we can do about it
-            metadata.set(Metadata.CONTENT_TYPE, mediaType.getType());
+            metadata.set(Metadata.CONTENT_TYPE, mediaType.toString());
             metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, rName + 
extension);
             metadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, 
true);
             metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length));
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index 9572e5e27c..e25bdf6d09 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -303,8 +303,10 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
                         metadata.set(Office.HAS_EXTERNAL_PIVOT_DATA, true);
                     }
                 }
-            } catch (IOException | TikaException | SAXException e) {
-                // swallow
+            } catch (IOException | TikaException | SAXException | IllegalArgumentException e) {
+                // swallow -- POI throws IllegalArgumentException when a
+                // relationship references a part missing from the package
+                // (e.g. truncated files)
             }
         }
     }
@@ -363,8 +365,9 @@ public class XSSFExcelExtractorDecorator extends 
AbstractOOXMLExtractor {
                             metadata.set(Office.HAS_DDE_LINKS, true);
                         }
                     }
-                } catch (IOException | TikaException e) {
-                    // swallow
+                } catch (IOException | TikaException | IllegalArgumentException e) {
+                    // swallow -- POI can throw IllegalArgumentException
+                    // for malformed relationships
                 }
             }
         }
@@ -394,8 +397,10 @@ public class XSSFExcelExtractorDecorator extends 
AbstractOOXMLExtractor {
                         metadata.set(Office.HAS_WEB_QUERIES, true);
                     }
                 }
-            } catch (IOException | TikaException e) {
-                // swallow
+            } catch (IOException | TikaException | IllegalArgumentException e) {
+                // swallow -- POI throws IllegalArgumentException when a
+                // relationship references a part missing from the package
+                // (e.g. truncated files)
             }
         }
     }
@@ -417,8 +422,10 @@ public class XSSFExcelExtractorDecorator extends 
AbstractOOXMLExtractor {
                         XMLReaderUtils.parseSAX(is, new 
QueryTableHandler(xhtml), parseContext);
                     }
                 }
-            } catch (IOException | TikaException e) {
-                // swallow
+            } catch (IOException | TikaException | IllegalArgumentException e) {
+                // swallow -- POI throws IllegalArgumentException when a
+                // relationship references a part missing from the package
+                // (e.g. truncated files)
             }
         }
     }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
index 62ec3bc8e3..698bd26991 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
@@ -475,6 +475,8 @@ public class ZipParser extends AbstractArchiveParser {
             try (InputStream entryStream = zipFile.getInputStream(entry)) {
                 TikaInputStream tis = TikaInputStream.get(entryStream, tmp, 
entryMetadata);
                 extractor.parseEmbedded(tis, xhtml, entryMetadata, new 
ParseContext(), true);
+            } catch (UnsupportedZipFeatureException e) {
+                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
             } finally {
                 tmp.dispose();
             }
@@ -515,6 +517,8 @@ public class ZipParser extends AbstractArchiveParser {
             try {
                 TikaInputStream tis = TikaInputStream.get(zis, tmp, 
entryMetadata);
                 extractor.parseEmbedded(tis, xhtml, entryMetadata, new 
ParseContext(), true);
+            } catch (UnsupportedZipFeatureException e) {
+                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
             } finally {
                 tmp.dispose();
             }

Reply via email to