This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4692-improve-ooxml-sax-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 2c878feaa164d92fc171531e8258b74be1784c80
Author: tallison <[email protected]>
AuthorDate: Fri Apr 3 16:30:43 2026 -0400

    further improvements
---
 .../org/apache/tika/sax/XHTMLContentHandler.java   |  40 +++++
 .../microsoft/ooxml/AbstractOOXMLExtractor.java    |  26 ++-
 .../microsoft/ooxml/FormattingTagManager.java      |  90 +++++++++-
 .../parser/microsoft/ooxml/InlineTagManager.java   | 196 ---------------------
 .../microsoft/ooxml/OOXMLPartContentCollector.java |  22 ++-
 .../microsoft/ooxml/OOXMLTikaBodyPartHandler.java  |  14 +-
 .../ooxml/OOXMLWordAndPowerPointTextHandler.java   |  33 +++-
 .../tika/parser/microsoft/ooxml/RunProperties.java |  11 ++
 .../ooxml/SXSLFPowerPointExtractorDecorator.java   |  18 +-
 .../ooxml/SXWPFWordExtractorDecorator.java         |  34 ++--
 .../ooxml/xwpf/XWPFEventBasedWordExtractor.java    |  13 +-
 .../parser/microsoft/ooxml/OOXMLDocxSAXTest.java   |  29 +++
 .../parser/microsoft/ooxml/OOXMLPptxSAXTest.java   |  39 +++-
 13 files changed, 314 insertions(+), 251 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
index 3fd7766d03..6f9af40421 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
@@ -16,8 +16,10 @@
  */
 package org.apache.tika.sax;
 
+import java.util.ArrayDeque;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.Deque;
 import java.util.HashSet;
 import java.util.Set;
 
@@ -94,6 +96,15 @@ public class XHTMLContentHandler extends SafeContentHandler {
     private boolean headStarted = false;
     private boolean headEnded = false;
     private boolean useFrameset = false;
+
+    /**
+     * When true, tracks a stack of opened element names and throws
+     * a RuntimeException on mismatched endElement calls. This is a
+     * debugging aid for finding unbalanced SAX events in parsers.
+     * Enable via {@link #setStrictTagBalanceChecking(boolean)}.
+     */
+    private static boolean strictTagBalanceChecking = false;
+    private final Deque<String> tagStack = new ArrayDeque<>();
     public XHTMLContentHandler(ContentHandler handler, Metadata metadata) {
         this(handler, metadata, null);
     }
@@ -124,6 +135,17 @@ public class XHTMLContentHandler extends SafeContentHandler {
         }
     }
 
+    /**
+     * Enables or disables strict tag balance checking. When enabled,
+     * every startElement pushes onto a stack and every endElement
+     * verifies the tag matches, throwing a RuntimeException whose
+     * message lists the open-tag stack on mismatch. The flag is static,
+     * so it applies to all instances. Debugging tool; not for production.
+     */
+    public static void setStrictTagBalanceChecking(boolean strict) {
+        strictTagBalanceChecking = strict;
+    }
+
     private static Set<String> unmodifiableSet(String... elements) {
         return Collections.unmodifiableSet(new HashSet<>(Arrays.asList(elements)));
     }
@@ -282,6 +304,9 @@ public class XHTMLContentHandler extends SafeContentHandler {
             }
 
             super.startElement(uri, local, name, attributes);
+            if (strictTagBalanceChecking) {
+                tagStack.push(name);
+            }
         }
     }
 
@@ -292,6 +317,21 @@ public class XHTMLContentHandler extends SafeContentHandler {
     @Override
     public void endElement(String uri, String local, String name) throws SAXException {
         if (!AUTO.contains(name)) {
+            if (strictTagBalanceChecking) {
+                if (tagStack.isEmpty()) {
+                    throw new RuntimeException(
+                            "STRICT TAG CHECK: endElement('" + name +
+                                    "') but tag stack is empty! No matching startElement.");
+                }
+                String expected = tagStack.peek();
+                if (!name.equals(expected)) {
+                    throw new RuntimeException(
+                            "STRICT TAG CHECK: endElement('" + name +
+                                    "') but expected '" + expected +
+                                    "'. Tag stack (top to bottom): " + tagStack);
+                }
+                tagStack.pop();
+            }
             super.endElement(uri, local, name);
             if (XHTML.equals(uri) && ENDLINE.contains(name)) {
                 newline();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 70d5920800..3074a802b8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -624,6 +624,27 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
      * @param parentPart
      * @param contentHandler
      */
+    /**
+     * Safely resolves a related part, returning null if the part cannot be found
+     * instead of throwing {@link IllegalArgumentException}.
+     */
+    public static PackagePart safeGetRelatedPart(PackagePart source,
+                                           PackageRelationship relationship)
+            throws InvalidFormatException {
+        if (source == null || relationship == null) {
+            return null;
+        }
+        if (!source.isRelationshipExists(relationship)) {
+            return null;
+        }
+        try {
+            return source.getRelatedPart(relationship);
+        } catch (IllegalArgumentException e) {
+            // Relationship exists but target part is missing from the package
+            return null;
+        }
+    }
+
     void handleGeneralTextContainingPart(String contentType, String xhtmlClassLabel,
                                          PackagePart parentPart, Metadata parentMetadata,
                                          ContentHandler contentHandler) throws SAXException {
@@ -647,7 +668,10 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
                         relatedPartPRC.getRelationship(i);
                 try {
                     PackagePart relatedPartPart =
-                            parentPart.getRelatedPart(relatedPartPackageRelationship);
+                            safeGetRelatedPart(parentPart, relatedPartPackageRelationship);
+                    if (relatedPartPart == null) {
+                        continue;
+                    }
                     try (InputStream stream = relatedPartPart.getInputStream()) {
                         XMLReaderUtils.parseSAX(stream,
                                 new EmbeddedContentHandler(contentHandler), context);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FormattingTagManager.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FormattingTagManager.java
index db88eedbae..0545cd0037 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FormattingTagManager.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FormattingTagManager.java
@@ -16,24 +16,37 @@
  */
 package org.apache.tika.parser.microsoft.ooxml;
 
+import java.util.Objects;
+
 import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
 import org.xml.sax.SAXException;
 
 import org.apache.tika.sax.XHTMLContentHandler;
 
 /**
- * Manages XHTML formatting tags (b, i, u, s) as a state machine,
- * ensuring proper nesting. Tags are always ordered from outermost to innermost:
- * {@code <b><i><s><u>text</u></s></i></b>}.
+ * Single owner of all run-scoped XHTML wrapper tags, ensuring proper nesting.
+ * Nesting order from outermost to innermost:
+ * {@code <a href="..."><b><i><s><u>text</u></s></i></b></a>}.
  * <p>
- * When a formatting change occurs, all tags that are "inside" the changing tag
- * must be closed first, then the change applied, then inner tags reopened.
- * This avoids generating malformed XHTML with overlapping tags.
+ * Hyperlinks come from two OOXML sources with different lifecycles:
+ * <ul>
+ *   <li><b>Wrapper hyperlinks</b> (DOCX {@code <w:hyperlink>}, field-code HYPERLINK):
+ *       opened/closed explicitly via {@link #openHyperlink}/{@link #closeHyperlink},
+ *       span multiple runs.</li>
+ *   <li><b>Run-property hyperlinks</b> (PPTX {@code <a:hlinkClick>}):
+ *       set on {@link RunProperties#setHlinkClickUrl}, managed automatically
+ *       by {@link #applyFormatting} per-run.</li>
+ * </ul>
+ * Both emit the same {@code <a href="...">} XHTML. Wrapper hyperlinks take
+ * precedence — run properties cannot override an active wrapper.
  */
 class FormattingTagManager {
 
     private final XHTMLContentHandler xhtml;
 
+    // Outermost to innermost: hyperlink > bold > italic > strike > underline
+    private String currentHyperlink = null;
+    private boolean wrapperHyperlinkActive = false;
     private boolean isBold = false;
     private boolean isItalics = false;
     private boolean isStrikeThrough = false;
@@ -43,13 +56,64 @@ class FormattingTagManager {
         this.xhtml = xhtml;
     }
 
+    /**
+     * Opens a wrapper-style hyperlink (DOCX {@code <w:hyperlink>} or field-code).
+     * Closes any open formatting tags first to maintain nesting.
+     * No-op if url is null.
+     */
+    void openHyperlink(String url) throws SAXException {
+        if (url == null) {
+            return;
+        }
+        closeFormattingTags();
+        if (currentHyperlink != null) {
+            xhtml.endElement("a");
+        }
+        xhtml.startElement("a", "href", url);
+        currentHyperlink = url;
+        wrapperHyperlinkActive = true;
+    }
+
+    /**
+     * Closes the active wrapper-style hyperlink. No-op if none was opened.
+     */
+    void closeHyperlink() throws SAXException {
+        if (currentHyperlink != null && wrapperHyperlinkActive) {
+            closeFormattingTags();
+            xhtml.endElement("a");
+            currentHyperlink = null;
+            wrapperHyperlinkActive = false;
+        }
+    }
+
+    /**
+     * Returns true if any hyperlink (wrapper or run-property) is currently open.
+     */
+    boolean isHyperlinkActive() {
+        return currentHyperlink != null;
+    }
+
     /**
      * Reconciles the current formatting state with the given run properties,
      * opening and closing XHTML tags as needed to maintain proper nesting.
      */
     void applyFormatting(RunProperties runProperties) throws SAXException {
+        // Run-property hyperlinks only when no wrapper is active
+        if (!wrapperHyperlinkActive) {
+            String newHyperlink = runProperties.getHlinkClickUrl();
+            if (!Objects.equals(newHyperlink, currentHyperlink)) {
+                closeFormattingTags();
+                if (currentHyperlink != null) {
+                    xhtml.endElement("a");
+                }
+                if (newHyperlink != null) {
+                    xhtml.startElement("a", "href", newHyperlink);
+                }
+                currentHyperlink = newHyperlink;
+            }
+        }
+
         if (runProperties.isBold() != isBold) {
-            // Bold is outermost — close everything inside it
             if (isStrikeThrough) {
                 xhtml.endElement("s");
                 isStrikeThrough = false;
@@ -112,10 +176,18 @@ class FormattingTagManager {
     }
 
     /**
-     * Closes all currently open formatting tags in proper nesting order
-     * (innermost first: u, s, i, b).
+     * Closes all currently open tags in proper nesting order.
      */
     void closeAll() throws SAXException {
+        closeFormattingTags();
+        if (currentHyperlink != null) {
+            xhtml.endElement("a");
+            currentHyperlink = null;
+            wrapperHyperlinkActive = false;
+        }
+    }
+
+    private void closeFormattingTags() throws SAXException {
         if (isUnderline) {
             xhtml.endElement("u");
             isUnderline = false;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/InlineTagManager.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/InlineTagManager.java
deleted file mode 100644
index 45eee33b57..0000000000
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/InlineTagManager.java
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft.ooxml;
-
-import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
-import org.xml.sax.SAXException;
-
-import org.apache.tika.sax.XHTMLContentHandler;
-
-/**
- * Manages all inline XHTML elements (hyperlinks and formatting tags) as a
- * unified state machine, ensuring proper nesting.  The nesting order from
- * outermost to innermost is:
- * <pre>
- *   {@code <a> <b> <i> <s> <u> text </u> </s> </i> </b> </a>}
- * </pre>
- * <p>
- * When a change occurs to an outer element, all inner elements are closed
- * first, the change is applied, then inner elements are reopened as needed.
- * This prevents generating malformed XHTML with overlapping or unbalanced tags.
- * <p>
- * This class replaces the separate {@code FormattingTagManager} and the
- * {@code wroteHyperlinkStart} boolean that were previously tracked independently
- * in {@link OOXMLTikaBodyPartHandler}.
- */
-class InlineTagManager {
-
-    private final XHTMLContentHandler xhtml;
-
-    private boolean hyperlinkOpen = false;
-    private boolean isBold = false;
-    private boolean isItalics = false;
-    private boolean isStrikeThrough = false;
-    private boolean isUnderline = false;
-
-    InlineTagManager(XHTMLContentHandler xhtml) {
-        this.xhtml = xhtml;
-    }
-
-    /**
-     * Opens a hyperlink.  Since {@code <a>} is the outermost inline element,
-     * any existing inline elements (including a prior hyperlink) are closed
-     * first.
-     *
-     * @param href the link target; if {@code null} this is a no-op
-     */
-    void openHyperlink(String href) throws SAXException {
-        if (href == null) {
-            return;
-        }
-        // Close everything — formatting then any existing hyperlink
-        closeAll();
-        xhtml.startElement("a", "href", href);
-        hyperlinkOpen = true;
-    }
-
-    /**
-     * Closes the current hyperlink and all formatting inside it.
-     * No-op if no hyperlink is open.
-     */
-    void closeHyperlink() throws SAXException {
-        if (!hyperlinkOpen) {
-            return;
-        }
-        closeFormatting();
-        xhtml.endElement("a");
-        hyperlinkOpen = false;
-    }
-
-    /**
-     * Returns {@code true} if a hyperlink is currently open.
-     */
-    boolean isHyperlinkOpen() {
-        return hyperlinkOpen;
-    }
-
-    /**
-     * Reconciles the current formatting state with the given run properties,
-     * opening and closing XHTML tags as needed to maintain proper nesting.
-     * The nesting order for formatting is: {@code <b> <i> <s> <u>}.
-     */
-    void applyFormatting(RunProperties runProperties) throws SAXException {
-        if (runProperties.isBold() != isBold) {
-            // Bold is outermost formatting — close everything inside it
-            if (isUnderline) {
-                xhtml.endElement("u");
-                isUnderline = false;
-            }
-            if (isStrikeThrough) {
-                xhtml.endElement("s");
-                isStrikeThrough = false;
-            }
-            if (isItalics) {
-                xhtml.endElement("i");
-                isItalics = false;
-            }
-            if (runProperties.isBold()) {
-                xhtml.startElement("b");
-            } else {
-                xhtml.endElement("b");
-            }
-            isBold = runProperties.isBold();
-        }
-
-        if (runProperties.isItalics() != isItalics) {
-            if (isUnderline) {
-                xhtml.endElement("u");
-                isUnderline = false;
-            }
-            if (isStrikeThrough) {
-                xhtml.endElement("s");
-                isStrikeThrough = false;
-            }
-            if (runProperties.isItalics()) {
-                xhtml.startElement("i");
-            } else {
-                xhtml.endElement("i");
-            }
-            isItalics = runProperties.isItalics();
-        }
-
-        if (runProperties.isStrikeThrough() != isStrikeThrough) {
-            if (isUnderline) {
-                xhtml.endElement("u");
-                isUnderline = false;
-            }
-            if (runProperties.isStrikeThrough()) {
-                xhtml.startElement("s");
-            } else {
-                xhtml.endElement("s");
-            }
-            isStrikeThrough = runProperties.isStrikeThrough();
-        }
-
-        boolean runIsUnderlined = runProperties.getUnderline() != UnderlinePatterns.NONE;
-        if (runIsUnderlined != isUnderline) {
-            if (runIsUnderlined) {
-                xhtml.startElement("u");
-            } else {
-                xhtml.endElement("u");
-            }
-            isUnderline = runIsUnderlined;
-        }
-    }
-
-    /**
-     * Closes all currently open formatting tags in proper nesting order
-     * (innermost first: u, s, i, b).  Does NOT close the hyperlink.
-     */
-    void closeFormatting() throws SAXException {
-        if (isUnderline) {
-            xhtml.endElement("u");
-            isUnderline = false;
-        }
-        if (isStrikeThrough) {
-            xhtml.endElement("s");
-            isStrikeThrough = false;
-        }
-        if (isItalics) {
-            xhtml.endElement("i");
-            isItalics = false;
-        }
-        if (isBold) {
-            xhtml.endElement("b");
-            isBold = false;
-        }
-    }
-
-    /**
-     * Closes ALL open inline elements — formatting first, then hyperlink.
-     * This is the primary safety mechanism: call at every structural boundary
-     * (end of paragraph, table cell, table row, table, etc.) to guarantee
-     * well-formed XHTML.
-     */
-    void closeAll() throws SAXException {
-        closeFormatting();
-        if (hyperlinkOpen) {
-            xhtml.endElement("a");
-            hyperlinkOpen = false;
-        }
-    }
-}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPartContentCollector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPartContentCollector.java
index 48dcc692d5..6cece158e8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPartContentCollector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPartContentCollector.java
@@ -96,7 +96,9 @@ class OOXMLPartContentCollector extends DefaultHandler {
             if (id != null && !skipIds.contains(id)) {
                 currentId = id;
                 buffer = new ByteArrayOutputStream();
-                writeString(buildWrapperOpenTag());
+                // Don't write wrapper open tag yet — inline xmlns declarations
+                // (e.g., xmlns:a on nested elements) haven't been captured via
+                // startPrefixMapping. Defer to endElement when all are known.
                 depth = 0;
             }
         }
@@ -110,8 +112,16 @@ class OOXMLPartContentCollector extends DefaultHandler {
         }
 
         if (depth == 0) {
-            writeString("</w:body>");
-            contentMap.put(currentId, buffer.toByteArray());
+            // Build the wrapper now — all startPrefixMapping calls from nested
+            // elements have been captured, so inline xmlns declarations are included.
+            byte[] wrapperOpen = buildWrapperOpenTag().getBytes(StandardCharsets.UTF_8);
+            byte[] content = buffer.toByteArray();
+            ByteArrayOutputStream combined =
+                    new ByteArrayOutputStream(wrapperOpen.length + content.length + 16);
+            combined.write(wrapperOpen, 0, wrapperOpen.length);
+            combined.write(content, 0, content.length);
+            writeString(combined, "</w:body>");
+            contentMap.put(currentId, combined.toByteArray());
             currentId = null;
             buffer = null;
             return;
@@ -171,8 +181,12 @@ class OOXMLPartContentCollector extends DefaultHandler {
     }
 
     private void writeString(String s) {
+        writeString(buffer, s);
+    }
+
+    private static void writeString(ByteArrayOutputStream target, String s) {
         byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
-        buffer.write(bytes, 0, bytes.length);
+        target.write(bytes, 0, bytes.length);
     }
 
     static String escape(String s) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
index e104867db4..a18f52a4d2 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
@@ -56,7 +56,6 @@ public class OOXMLTikaBodyPartHandler
     private int tableDepth = 0;//table depth
     private int sdtDepth = 0;//
     private FormattingTagManager formattingTags;
-    private boolean wroteHyperlinkStart = false;
 
     //TODO: fix this
     //pWithinCell should be an array/stack of given cell depths
@@ -127,19 +126,12 @@ public class OOXMLTikaBodyPartHandler
 
     @Override
     public void hyperlinkStart(String link) throws SAXException {
-        if (link != null) {
-            xhtml.startElement("a", "href", link);
-            wroteHyperlinkStart = true;
-        }
+        formattingTags.openHyperlink(link);
     }
 
     @Override
     public void hyperlinkEnd() throws SAXException {
-        if (wroteHyperlinkStart) {
-            formattingTags.closeAll();
-            wroteHyperlinkStart = false;
-            xhtml.endElement("a");
-        }
+        formattingTags.closeHyperlink();
     }
 
     @Override
@@ -438,7 +430,7 @@ public class OOXMLTikaBodyPartHandler
     @Override
     public void startBookmark(String id, String name) throws SAXException {
         //skip bookmarks within hyperlinks
-        if (name != null && !wroteHyperlinkStart) {
+        if (name != null && !formattingTags.isHyperlinkActive()) {
             xhtml.startElement("a", "name", name);
             xhtml.endElement("a");
         }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
index 7b0b5ceea5..46e25b299d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
@@ -154,9 +154,13 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
     private int inACFallbackDepth = 0;
     private boolean inDelText = false;
     //buffers rt in ruby sections (see 17.3.3.25)
-    private boolean inHlinkClick = false;
     private boolean inTextBox = false;
     private boolean inV = false; //in c:v in chart file
+    // True when we're inside a <pPr> that was a direct child of <p> (the first child).
+    // Only those pPr elements should trigger startParagraph on close.
+    // pPr elements nested inside other elements (e.g., <a:pPr> inside <a:fld>)
+    // must not be treated as paragraph-level properties.
+    private boolean inParagraphLevelPPr = false;
     // Field code tracking for instrText-based hyperlinks
     private boolean inField = false;
     private boolean inInstrText = false;
@@ -225,7 +229,12 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
             throws SAXException {
         //TODO: checkBox, textBox, sym, headerReference, footerReference, commentRangeEnd
 
-        if (lastStartElementWasP && !PPR.equals(localName)) {
+        if (lastStartElementWasP && PPR.equals(localName)) {
+            // pPr is the first child of <p> — this is a paragraph-level pPr.
+            // Defer startParagraph until </pPr> so properties (style, numbering) are set first.
+            inParagraphLevelPPr = true;
+        } else if (lastStartElementWasP) {
+            // First child of <p> is not pPr — start paragraph immediately with defaults.
             bodyContentsHandler.startParagraph(currPProperties);
         }
 
@@ -321,8 +330,14 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
             String hyperlink = null;
             if (hyperlinkId != null) {
                 hyperlink = linkedRelationships.get(hyperlinkId);
-                bodyContentsHandler.hyperlinkStart(hyperlink);
-                inHlinkClick = true;
+                if (inR) {
+                    // hlinkClick inside a run — treat as run property.
+                    // FormattingTagManager opens/closes <a> with the run lifecycle.
+                    currRunProperties.setHlinkClickUrl(hyperlink);
+                } else if (hyperlink != null) {
+                    // hlinkClick on a shape/picture (not in a run) — emit as self-closing ref
+                    bodyContentsHandler.externalRef("hlinkClick", hyperlink);
+                }
             }
         } else if (TBL.equals(localName)) {
             bodyContentsHandler.startTable();
@@ -498,12 +513,15 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
             handleEndOfRun();
         } else if (T.equals(localName)) {
             inT = false;
-        } else if (PPR.equals(localName)) {
+        } else if (PPR.equals(localName) && inParagraphLevelPPr) {
+            // Only process as paragraph properties if this pPr was a direct child of <p>.
+            // pPr inside other elements (e.g., <a:fld> fields) must be ignored.
             if (!pStarted) {
                 bodyContentsHandler.startParagraph(currPProperties);
                 pStarted = true;
             }
             currPProperties.reset();
+            inParagraphLevelPPr = false;
         } else if (P.equals(localName)) {
             if (runBuffer.length() > 0) {
                 //<p><tab></p>...this will treat that as if it were
@@ -553,16 +571,13 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
 
     private void handleEndOfRun() throws SAXException {
         bodyContentsHandler.run(currRunProperties, runBuffer.toString());
-        if (inHlinkClick) {
-            bodyContentsHandler.hyperlinkEnd();
-            inHlinkClick = false;
-        }
         inR = false;
         runBuffer.setLength(0);
         currRunProperties.setBold(false);
         currRunProperties.setItalics(false);
         currRunProperties.setStrike(false);
         currRunProperties.setUnderline(UnderlinePatterns.NONE.name());
+        currRunProperties.setHlinkClickUrl(null);
     }
 
     @Override
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java
index 54d149f333..efed9c1348 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java
@@ -30,6 +30,9 @@ public class RunProperties {
 
     UnderlinePatterns underline = UnderlinePatterns.NONE;
 
+    // PPTX hlinkClick hyperlink URL — set from <a:hlinkClick> inside <a:rPr>
+    String hlinkClickUrl = null;
+
     public boolean isItalics() {
         return italics;
     }
@@ -68,4 +71,12 @@ public class RunProperties {
             underline = UnderlinePatterns.SINGLE;
         }
     }
+
+    public String getHlinkClickUrl() {
+        return hlinkClickUrl;
+    }
+
+    public void setHlinkClickUrl(String url) {
+        this.hlinkClickUrl = url;
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
index 7697452a41..1497a17b92 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
@@ -110,9 +110,12 @@ public class SXSLFPowerPointExtractorDecorator extends 
AbstractOOXMLExtractor {
         if (slidesPRC != null && slidesPRC.size() > 0) {
             for (int i = 0; i < slidesPRC.size(); i++) {
                 try {
-                    hiddenSlideCount += handleSlidePart(
-                            
mainDocument.getRelatedPart(slidesPRC.getRelationship(i)),
-                            xhtml);
+                    PackagePart slidePart =
+                            safeGetRelatedPart(mainDocument, 
slidesPRC.getRelationship(i));
+                    if (slidePart == null) {
+                        continue;
+                    }
+                    hiddenSlideCount += handleSlidePart(slidePart, xhtml);
                 } catch (InvalidFormatException | ZipException e) {
                     
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                             ExceptionUtils.getStackTrace(e));
@@ -152,7 +155,7 @@ public class SXSLFPowerPointExtractorDecorator extends 
AbstractOOXMLExtractor {
         for (int i = 0; i < prc.size(); i++) {
             PackagePart commentAuthorsPart = null;
             try {
-                commentAuthorsPart = 
mainDocument.getRelatedPart(prc.getRelationship(i));
+                commentAuthorsPart = safeGetRelatedPart(mainDocument, 
prc.getRelationship(i));
             } catch (InvalidFormatException e) {
                 metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                         ExceptionUtils.getStackTrace(e));
@@ -263,7 +266,7 @@ public class SXSLFPowerPointExtractorDecorator extends 
AbstractOOXMLExtractor {
             for (int i = 0; i < slidePRC.size(); i++) {
                 PackagePart slidePart = null;
                 try {
-                    slidePart = 
mainDocument.getRelatedPart(slidePRC.getRelationship(i));
+                    slidePart = safeGetRelatedPart(mainDocument, 
slidePRC.getRelationship(i));
                 } catch (InvalidFormatException e) {
                     
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                             ExceptionUtils.getStackTrace(e));
@@ -286,7 +289,7 @@ public class SXSLFPowerPointExtractorDecorator extends 
AbstractOOXMLExtractor {
                 for (int i = 0; i < prc.size(); i++) {
                     PackagePart pp = null;
                     try {
-                        pp = 
mainDocument.getRelatedPart(prc.getRelationship(i));
+                        pp = safeGetRelatedPart(mainDocument, 
prc.getRelationship(i));
                     } catch (InvalidFormatException e) {
                         
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                                 ExceptionUtils.getStackTrace(e));
@@ -305,7 +308,8 @@ public class SXSLFPowerPointExtractorDecorator extends 
AbstractOOXMLExtractor {
 
         for (String relation : new 
String[]{XSLFRelation.VML_DRAWING.getRelation(),
                 XSLFRelation.SLIDE_LAYOUT.getRelation(), 
XSLFRelation.NOTES_MASTER.getRelation(),
-                XSLFRelation.NOTES.getRelation()}) {
+                XSLFRelation.NOTES.getRelation(), 
XSLFRelation.CHART.getRelation(),
+                XSLFRelation.DIAGRAM_DRAWING.getRelation()}) {
             PackageRelationshipCollection prc = null;
             try {
                 prc = slidePart.getRelationshipsByType(relation);
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index 45c2725e67..2f42c7d8a8 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -75,7 +75,8 @@ public class SXWPFWordExtractorDecorator extends 
AbstractOOXMLExtractor {
             new String[]{XWPFRelation.HEADER.getRelation(), 
XWPFRelation.FOOTER.getRelation(),
                     XWPFRelation.FOOTNOTE.getRelation(),
                     
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes";,
-                    
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments"};
+                    
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments";,
+                    AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA};
 
     // Relationship types for Word settings
     private static final String SETTINGS_RELATION =
@@ -198,7 +199,7 @@ public class SXWPFWordExtractorDecorator extends 
AbstractOOXMLExtractor {
             PackageRelationshipCollection settingsRels =
                     documentPart.getRelationshipsByType(SETTINGS_RELATION);
             if (settingsRels != null && settingsRels.size() > 0) {
-                PackagePart settingsPart = 
documentPart.getRelatedPart(settingsRels.getRelationship(0));
+                PackagePart settingsPart = safeGetRelatedPart(documentPart, 
settingsRels.getRelationship(0));
                 if (settingsPart != null) {
                     try (InputStream is = settingsPart.getInputStream()) {
                         WordSettingsHandler handler = new 
WordSettingsHandler(xhtml);
@@ -218,7 +219,7 @@ public class SXWPFWordExtractorDecorator extends 
AbstractOOXMLExtractor {
             PackageRelationshipCollection webSettingsRels =
                     documentPart.getRelationshipsByType(WEB_SETTINGS_RELATION);
             if (webSettingsRels != null && webSettingsRels.size() > 0) {
-                PackagePart webSettingsPart = 
documentPart.getRelatedPart(webSettingsRels.getRelationship(0));
+                PackagePart webSettingsPart = safeGetRelatedPart(documentPart, 
webSettingsRels.getRelationship(0));
                 if (webSettingsPart != null) {
                     try (InputStream is = webSettingsPart.getInputStream()) {
                         WebSettingsHandler handler = new 
WebSettingsHandler(xhtml);
@@ -276,7 +277,10 @@ public class SXWPFWordExtractorDecorator extends 
AbstractOOXMLExtractor {
                 if (headersPRC != null) {
                     for (int i = 0; i < headersPRC.size(); i++) {
                         PackagePart header =
-                                
documentPart.getRelatedPart(headersPRC.getRelationship(i));
+                                safeGetRelatedPart(documentPart, 
headersPRC.getRelationship(i));
+                        if (header == null) {
+                            continue;
+                        }
                         handlePart(header, styles, listManager, xhtml,
                                 OOXMLInlineBodyPartMap.EMPTY);
                     }
@@ -315,7 +319,10 @@ public class SXWPFWordExtractorDecorator extends 
AbstractOOXMLExtractor {
                 if (prc != null) {
                     for (int i = 0; i < prc.size(); i++) {
                         PackagePart packagePart =
-                                
documentPart.getRelatedPart(prc.getRelationship(i));
+                                safeGetRelatedPart(documentPart, 
prc.getRelationship(i));
+                        if (packagePart == null) {
+                            continue;
+                        }
                         handlePart(packagePart, styles, listManager, xhtml,
                                 OOXMLInlineBodyPartMap.EMPTY);
                     }
@@ -396,7 +403,7 @@ public class SXWPFWordExtractorDecorator extends 
AbstractOOXMLExtractor {
                 continue;
             }
             try {
-                PackagePart emfPart = documentPart.getRelatedPart(
+                PackagePart emfPart = safeGetRelatedPart(documentPart,
                         documentPart.getRelationship(emfRId));
                 if (emfPart == null || emfPart.getContentType() == null) {
                     continue;
@@ -457,7 +464,10 @@ public class SXWPFWordExtractorDecorator extends 
AbstractOOXMLExtractor {
             OOXMLPartContentCollector collector =
                     new OOXMLPartContentCollector(wrapperElements, skipIds);
             for (int i = 0; i < prc.size(); i++) {
-                PackagePart part = 
documentPart.getRelatedPart(prc.getRelationship(i));
+                PackagePart part = safeGetRelatedPart(documentPart, 
prc.getRelationship(i));
+                if (part == null) {
+                    continue;
+                }
                 // collect the part's linked relationships (for picture resolution)
                 Map<String, String> partRels =
                         loadLinkedRelationships(part, true, metadata);
@@ -484,7 +494,7 @@ public class SXWPFWordExtractorDecorator extends 
AbstractOOXMLExtractor {
             if (stylesRelationShip == null) {
                 return null;
             }
-            PackagePart stylesPart = 
packagePart.getRelatedPart(stylesRelationShip);
+            PackagePart stylesPart = safeGetRelatedPart(packagePart, 
stylesRelationShip);
             if (stylesPart == null) {
                 return null;
             }
@@ -504,7 +514,7 @@ public class SXWPFWordExtractorDecorator extends 
AbstractOOXMLExtractor {
                 if (numberingRelationShip == null) {
                     return null;
                 }
-                PackagePart numberingPart = 
packagePart.getRelatedPart(numberingRelationShip);
+                PackagePart numberingPart = safeGetRelatedPart(packagePart, 
numberingRelationShip);
                 if (numberingPart == null) {
                     return null;
                 }
@@ -545,8 +555,10 @@ public class SXWPFWordExtractorDecorator extends 
AbstractOOXMLExtractor {
                 if (prc != null) {
                     for (int i = 0; i < prc.size(); i++) {
                         PackagePart packagePart =
-                                
documentPart.getRelatedPart(prc.getRelationship(i));
-                        relatedParts.add(packagePart);
+                                safeGetRelatedPart(documentPart, 
prc.getRelationship(i));
+                        if (packagePart != null) {
+                            relatedParts.add(packagePart);
+                        }
                     }
                 }
             } catch (InvalidFormatException e) {
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
index ec1a1fa437..45c30355e0 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -43,6 +43,7 @@ import org.apache.tika.exception.RuntimeSAXException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.exception.WriteLimitReachedException;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor;
 import org.apache.tika.parser.microsoft.ooxml.EditType;
 import 
org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler;
 import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
@@ -193,7 +194,11 @@ public class XWPFEventBasedWordExtractor implements 
POIXMLTextExtractor {
                     
documentPart.getRelationshipsByType(XWPFRelation.HEADER.getRelation());
             if (headersPRC != null) {
                 for (int i = 0; i < headersPRC.size(); i++) {
-                    PackagePart header = 
documentPart.getRelatedPart(headersPRC.getRelationship(i));
+                    PackagePart header = 
AbstractOOXMLExtractor.safeGetRelatedPart(
+                            documentPart, headersPRC.getRelationship(i));
+                    if (header == null) {
+                        continue;
+                    }
                     handlePart(header, xwpfListManager, sb);
                 }
             }
@@ -213,7 +218,11 @@ public class XWPFEventBasedWordExtractor implements 
POIXMLTextExtractor {
                 if (prc != null) {
                     for (int i = 0; i < prc.size(); i++) {
                         PackagePart packagePart =
-                                
documentPart.getRelatedPart(prc.getRelationship(i));
+                                AbstractOOXMLExtractor.safeGetRelatedPart(
+                                        documentPart, prc.getRelationship(i));
+                        if (packagePart == null) {
+                            continue;
+                        }
                         handlePart(packagePart, xwpfListManager, sb);
                     }
                 }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLDocxSAXTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLDocxSAXTest.java
index 02d5247d44..ac533400e0 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLDocxSAXTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLDocxSAXTest.java
@@ -30,6 +30,7 @@ import org.junit.jupiter.api.Test;
 import org.apache.tika.config.loader.TikaLoader;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.OfficeOpenXMLCore;
@@ -363,4 +364,32 @@ public class OOXMLDocxSAXTest extends 
AbstractOOXMLDocxTest {
         Metadata m = metadataList.get(0);
         assertEquals("true", m.get(Office.HAS_FRAMESETS));
     }
+
+    /**
+     * Test with external DOCX files known to trigger "prefix not bound"
+     * from missing namespace declarations in footnote/endnote fragments.
+     * Enable by setting system property "tika.test.docx.namespace" to a file path.
+     */
+    @Test
+    public void testNamespaceInFragments() throws Exception {
+        String filePath = System.getProperty("tika.test.docx.namespace");
+        if (filePath == null) {
+            return;
+        }
+        java.io.File f = new java.io.File(filePath);
+        if (!f.isFile()) {
+            return;
+        }
+        AutoDetectParser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+        org.xml.sax.ContentHandler handler =
+                new org.apache.tika.sax.BodyContentHandler(-1);
+        try (TikaInputStream tis = TikaInputStream.get(f.toPath())) {
+            parser.parse(tis, handler, metadata, getParseContext());
+        }
+        String[] warnings = 
metadata.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING);
+        for (String w : warnings) {
+            assertNotContained("not bound", w);
+        }
+    }
 }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPptxSAXTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPptxSAXTest.java
index 0d0554d049..81e22df764 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPptxSAXTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPptxSAXTest.java
@@ -40,6 +40,7 @@ import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.parser.microsoft.OfficeParserConfig;
 import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
 
 /**
  * Runs the shared PPTX tests using the SAX-based streaming parser,
@@ -62,7 +63,7 @@ public class OOXMLPptxSAXTest extends AbstractOOXMLPptxTest {
         List<Metadata> metadataList =
                 getRecursiveMetadata("testPPT_various2.pptx", 
getParseContext());
 
-        assertEquals(14, metadataList.size(), "right number of attachments");
+        assertEquals(15, metadataList.size(), "right number of attachments");
 
         String mainContent = 
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT);
 
@@ -234,6 +235,42 @@ public class OOXMLPptxSAXTest extends 
AbstractOOXMLPptxTest {
         assertContainsAtLeast(parsedBy, metadataList);
     }
 
+    @Test
+    public void testStrictTagBalancePptx() throws Exception {
+        XHTMLContentHandler.setStrictTagBalanceChecking(true);
+        try {
+            // Test with the standard test file first — should not throw
+            getRecursiveMetadata("testPPT_various2.pptx", getParseContext());
+        } finally {
+            XHTMLContentHandler.setStrictTagBalanceChecking(false);
+        }
+    }
+
+    /**
+     * Test with external PPTX files known to trigger "suspected zip bomb"
+     * from unbalanced SAX tags. Enable by setting system property
+     * "tika.test.pptx.zipbomb" to a file path.
+     */
+    @Test
+    public void testStrictTagBalanceExternalPptx() throws Exception {
+        String filePath = System.getProperty("tika.test.pptx.zipbomb");
+        if (filePath == null) {
+            return;
+        }
+        java.io.File f = new java.io.File(filePath);
+        if (!f.isFile()) {
+            return;
+        }
+        XHTMLContentHandler.setStrictTagBalanceChecking(true);
+        try (TikaInputStream tis = TikaInputStream.get(f.toPath())) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler(-1);
+            AUTO_DETECT_PARSER.parse(tis, handler, metadata, 
getParseContext());
+        } finally {
+            XHTMLContentHandler.setStrictTagBalanceChecking(false);
+        }
+    }
+
     @Test
     public void testEncrypted() throws Exception {
         Map<String, String> tests = new HashMap<>();

Reply via email to